diff options
author | Josh Wilsdon <jwilsdon@joyent.com> | 2011-04-06 16:35:09 -0700 |
---|---|---|
committer | Josh Wilsdon <jwilsdon@joyent.com> | 2011-04-06 16:35:09 -0700 |
commit | c5e99aab98c3a8ddb8e0e2953c1a3e534d67ca4f (patch) | |
tree | 9beb629b7abbb44c7385607ac2b6f842ca4ac18b | |
parent | 8ddb3439b2f21ccfb06e6d0b4a8a1fb0f944cf07 (diff) | |
download | illumos-kvm-c5e99aab98c3a8ddb8e0e2953c1a3e534d67ca4f.tar.gz |
[HVM-29] remove silly .gitignore that was hiding files from the previous commit.
56 files changed, 42251 insertions, 74 deletions
diff --git a/linux/.gitignore b/linux/.gitignore deleted file mode 100644 index 38bff83..0000000 --- a/linux/.gitignore +++ /dev/null @@ -1,74 +0,0 @@ -*.o -*.d -*~ -*.flat -*.a -.*.cmd -*.ko -*.mod.c -config.mak -kvm-kmod-config.h -modules.order -Module.symvers -Modules.symvers -Module.markers -.tmp_versions -.tmp.kvm-kmod.* -include-compat/asm -include -x86/modules.order -x86/i825[49].[ch] -x86/kvm_main.c -x86/kvm_svm.h -x86/vmx.[ch] -x86/svm.[ch] -x86/mmu.[ch] -x86/paging_tmpl.h -x86/ioapic.[ch] -x86/iodev.h -x86/irq.[ch] -x86/lapic.[ch] -x86/tss.h -x86/x86.[ch] -x86/coalesced_mmio.[ch] -x86/kvm_cache_regs.h -x86/irq_comm.c -x86/timer.c -x86/kvm_timer.h -x86/iommu.c -x86/svm-trace.h -x86/trace-arch.h -x86/trace.h -x86/vmx-trace.h -x86/assigned-dev.c -x86/emulate.c -x86/eventfd.c -x86/mmutrace.h -ia64/asm-offsets.c -ia64/coalesced_mmio.[ch] -ia64/ioapic.[ch] -ia64/iodev.h -ia64/iommu.c -ia64/irq.h -ia64/irq_comm.c -ia64/kvm-ia64.c -ia64/kvm_fw.c -ia64/kvm_lib.c -ia64/kvm_main.c -ia64/kvm_minstate.h -ia64/lapic.h -ia64/memcpy.S -ia64/memset.S -ia64/misc.h -ia64/mmio.c -ia64/optvfault.S -ia64/process.c -ia64/trampoline.S -ia64/vcpu.[ch] -ia64/vmm.c -ia64/vmm_ivt.S -ia64/vti.h -ia64/vtlb.c -ia64/assigned-dev.c -ia64/eventfd.c -.stgit-* diff --git a/linux/include/asm-ia64/kvm.h b/linux/include/asm-ia64/kvm.h new file mode 100644 index 0000000..ce31fac --- /dev/null +++ b/linux/include/asm-ia64/kvm.h @@ -0,0 +1,304 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +#ifndef __ASM_IA64_KVM_H +#define __ASM_IA64_KVM_H + +/* + * kvm structure definitions for ia64 + * + * Copyright (C) 2007 Xiantao Zhang <xiantao.zhang@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + */ + +#include <asm/types.h> +#include <linux/ioctl.h> + +/* Select x86 specific features in <linux/kvm.h> */ +#define __KVM_HAVE_IOAPIC +#define __KVM_HAVE_DEVICE_ASSIGNMENT + +/* Architectural interrupt line count. */ +#define KVM_NR_INTERRUPTS 256 + +#define KVM_IOAPIC_NUM_PINS 48 + +struct kvm_ioapic_state { + __u64 base_address; + __u32 ioregsel; + __u32 id; + __u32 irr; + __u32 pad; + union { + __u64 bits; + struct { + __u8 vector; + __u8 delivery_mode:3; + __u8 dest_mode:1; + __u8 delivery_status:1; + __u8 polarity:1; + __u8 remote_irr:1; + __u8 trig_mode:1; + __u8 mask:1; + __u8 reserve:7; + __u8 reserved[4]; + __u8 dest_id; + } fields; + } redirtbl[KVM_IOAPIC_NUM_PINS]; +}; + +#define KVM_IRQCHIP_PIC_MASTER 0 +#define KVM_IRQCHIP_PIC_SLAVE 1 +#define KVM_IRQCHIP_IOAPIC 2 +#define KVM_NR_IRQCHIPS 3 + +#define KVM_CONTEXT_SIZE 8*1024 + +struct kvm_fpreg { + union { + unsigned long bits[2]; + long double __dummy; /* force 16-byte alignment */ + } u; +}; + +union context { + /* 8K size */ + char dummy[KVM_CONTEXT_SIZE]; + struct { + unsigned long psr; + unsigned long pr; + unsigned long caller_unat; + unsigned long pad; + unsigned long gr[32]; + unsigned long ar[128]; + unsigned long br[8]; + unsigned long cr[128]; + unsigned long rr[8]; + unsigned long ibr[8]; + unsigned long dbr[8]; + unsigned long pkr[8]; + struct kvm_fpreg fr[128]; + }; +}; + +struct thash_data { + union { + struct { + unsigned long p : 1; /* 0 */ + unsigned long rv1 : 1; /* 1 */ + unsigned long ma : 3; /* 2-4 */ + unsigned long a : 1; /* 5 */ + unsigned long d : 1; /* 6 */ + unsigned long pl : 2; /* 7-8 */ + unsigned long ar : 3; /* 9-11 */ + unsigned long ppn : 38; /* 12-49 */ + unsigned long rv2 : 2; /* 50-51 */ + unsigned long ed : 1; /* 52 */ + unsigned long ig1 : 11; /* 53-63 */ + }; + struct { + unsigned long __rv1 : 53; /* 0-52 */ + unsigned long contiguous : 1; /*53 */ + unsigned long tc : 1; /* 54 TR or TC */ + unsigned long cl : 1; + /* 55 I side or D side cache line */ + unsigned long len : 4; /* 56-59 */ + unsigned long io : 1; /* 60 entry is for io or not */ + unsigned long nomap : 1; + /* 61 entry cann't be inserted into machine TLB.*/ + unsigned long checked : 1; + /* 62 for VTLB/VHPT sanity check */ + unsigned long invalid : 1; + /* 63 invalid entry */ + }; + unsigned long page_flags; + }; /* same for VHPT and TLB */ + + union { + struct { + unsigned long rv3 : 2; + unsigned long ps : 6; + unsigned long key : 24; + unsigned long rv4 : 32; + }; + unsigned long itir; + }; + union { + struct { + unsigned long ig2 : 12; + unsigned long vpn : 49; + unsigned long vrn : 3; + }; + unsigned long ifa; + unsigned long vadr; + struct { + unsigned long tag : 63; + unsigned long ti : 1; + }; + unsigned long etag; + }; + union { + struct thash_data *next; + unsigned long rid; + unsigned long gpaddr; + }; +}; + +#define NITRS 8 +#define NDTRS 8 + +struct saved_vpd { + unsigned long vhpi; + unsigned long vgr[16]; + unsigned long vbgr[16]; + unsigned long vnat; + unsigned long vbnat; + unsigned long vcpuid[5]; + unsigned long vpsr; + unsigned long vpr; + union { + unsigned long vcr[128]; + struct { + unsigned long dcr; + unsigned long itm; + unsigned long iva; + unsigned long rsv1[5]; + unsigned long pta; + unsigned long rsv2[7]; + unsigned long ipsr; + unsigned long isr; + unsigned long rsv3; + unsigned long iip; + unsigned long ifa; + unsigned long itir; + unsigned long iipa; + unsigned long ifs; + unsigned long iim; + unsigned long iha; + unsigned long rsv4[38]; + unsigned long lid; + unsigned long ivr; + unsigned long tpr; + unsigned long eoi; + unsigned long irr[4]; + unsigned long itv; + unsigned long pmv; + unsigned long cmcv; + unsigned long rsv5[5]; + unsigned long lrr0; + unsigned long lrr1; + unsigned long rsv6[46]; + }; + }; +}; + +struct kvm_regs { + struct saved_vpd vpd; + /*Arch-regs*/ + int mp_state; + unsigned long vmm_rr; + /* TR and TC. */ + struct thash_data itrs[NITRS]; + struct thash_data dtrs[NDTRS]; + /* Bit is set if there is a tr/tc for the region. */ + unsigned char itr_regions; + unsigned char dtr_regions; + unsigned char tc_regions; + + char irq_check; + unsigned long saved_itc; + unsigned long itc_check; + unsigned long timer_check; + unsigned long timer_pending; + unsigned long last_itc; + + unsigned long vrr[8]; + unsigned long ibr[8]; + unsigned long dbr[8]; + unsigned long insvc[4]; /* Interrupt in service. */ + unsigned long xtp; + + unsigned long metaphysical_rr0; /* from kvm_arch (so is pinned) */ + unsigned long metaphysical_rr4; /* from kvm_arch (so is pinned) */ + unsigned long metaphysical_saved_rr0; /* from kvm_arch */ + unsigned long metaphysical_saved_rr4; /* from kvm_arch */ + unsigned long fp_psr; /*used for lazy float register */ + unsigned long saved_gp; + /*for phycial emulation */ + + union context saved_guest; + + unsigned long reserved[64]; /* for future use */ +}; + +struct kvm_sregs { +}; + +struct kvm_fpu { +}; + +#define KVM_IA64_VCPU_STACK_SHIFT 16 +#define KVM_IA64_VCPU_STACK_SIZE (1UL << KVM_IA64_VCPU_STACK_SHIFT) + +struct kvm_ia64_vcpu_stack { + unsigned char stack[KVM_IA64_VCPU_STACK_SIZE]; +}; + +struct kvm_debug_exit_arch { +}; + +/* for KVM_SET_GUEST_DEBUG */ +struct kvm_guest_debug_arch { +}; + +#endif diff --git a/linux/include/asm-ia64/kvm_host.h b/linux/include/asm-ia64/kvm_host.h new file mode 100644 index 0000000..91e5de5 --- /dev/null +++ b/linux/include/asm-ia64/kvm_host.h @@ -0,0 +1,639 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +/* + * kvm_host.h: used for kvm module, and hold ia64-specific sections. + * + * Copyright (C) 2007, Intel Corporation. + * + * Xiantao Zhang <xiantao.zhang@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + */ + +#ifndef __ASM_KVM_HOST_H +#define __ASM_KVM_HOST_H + +#define KVM_MEMORY_SLOTS 32 +/* memory slots that does not exposed to userspace */ +#define KVM_PRIVATE_MEM_SLOTS 4 + +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 + +/* define exit reasons from vmm to kvm*/ +#define EXIT_REASON_VM_PANIC 0 +#define EXIT_REASON_MMIO_INSTRUCTION 1 +#define EXIT_REASON_PAL_CALL 2 +#define EXIT_REASON_SAL_CALL 3 +#define EXIT_REASON_SWITCH_RR6 4 +#define EXIT_REASON_VM_DESTROY 5 +#define EXIT_REASON_EXTERNAL_INTERRUPT 6 +#define EXIT_REASON_IPI 7 +#define EXIT_REASON_PTC_G 8 +#define EXIT_REASON_DEBUG 20 + +/*Define vmm address space and vm data space.*/ +#define KVM_VMM_SIZE (__IA64_UL_CONST(16)<<20) +#define KVM_VMM_SHIFT 24 +#define KVM_VMM_BASE 0xD000000000000000 +#define VMM_SIZE (__IA64_UL_CONST(8)<<20) + +/* + * Define vm_buffer, used by PAL Services, base address. + * Note: vm_buffer is in the VMM-BLOCK, the size must be < 8M + */ +#define KVM_VM_BUFFER_BASE (KVM_VMM_BASE + VMM_SIZE) +#define KVM_VM_BUFFER_SIZE (__IA64_UL_CONST(8)<<20) + +/* + * kvm guest's data area looks as follow: + * + * +----------------------+ ------- KVM_VM_DATA_SIZE + * | vcpu[n]'s data | | ___________________KVM_STK_OFFSET + * | | | / | + * | .......... | | /vcpu's struct&stack | + * | .......... | | /---------------------|---- 0 + * | vcpu[5]'s data | | / vpd | + * | vcpu[4]'s data | |/-----------------------| + * | vcpu[3]'s data | / vtlb | + * | vcpu[2]'s data | /|------------------------| + * | vcpu[1]'s data |/ | vhpt | + * | vcpu[0]'s data |____________________________| + * +----------------------+ | + * | memory dirty log | | + * +----------------------+ | + * | vm's data struct | | + * +----------------------+ | + * | | | + * | | | + * | | | + * | | | + * | | | + * | | | + * | | | + * | vm's p2m table | | + * | | | + * | | | + * | | | | + * vm's data->| | | | + * +----------------------+ ------- 0 + * To support large memory, needs to increase the size of p2m. + * To support more vcpus, needs to ensure it has enough space to + * hold vcpus' data. + */ + +#define KVM_VM_DATA_SHIFT 26 +#define KVM_VM_DATA_SIZE (__IA64_UL_CONST(1) << KVM_VM_DATA_SHIFT) +#define KVM_VM_DATA_BASE (KVM_VMM_BASE + KVM_VM_DATA_SIZE) + +#define KVM_P2M_BASE KVM_VM_DATA_BASE +#define KVM_P2M_SIZE (__IA64_UL_CONST(24) << 20) + +#define VHPT_SHIFT 16 +#define VHPT_SIZE (__IA64_UL_CONST(1) << VHPT_SHIFT) +#define VHPT_NUM_ENTRIES (__IA64_UL_CONST(1) << (VHPT_SHIFT-5)) + +#define VTLB_SHIFT 16 +#define VTLB_SIZE (__IA64_UL_CONST(1) << VTLB_SHIFT) +#define VTLB_NUM_ENTRIES (1UL << (VHPT_SHIFT-5)) + +#define VPD_SHIFT 16 +#define VPD_SIZE (__IA64_UL_CONST(1) << VPD_SHIFT) + +#define VCPU_STRUCT_SHIFT 16 +#define VCPU_STRUCT_SIZE (__IA64_UL_CONST(1) << VCPU_STRUCT_SHIFT) + +/* + * This must match KVM_IA64_VCPU_STACK_{SHIFT,SIZE} arch/ia64/include/asm/kvm.h + */ +#define KVM_STK_SHIFT 16 +#define KVM_STK_OFFSET (__IA64_UL_CONST(1)<< KVM_STK_SHIFT) + +#define KVM_VM_STRUCT_SHIFT 19 +#define KVM_VM_STRUCT_SIZE (__IA64_UL_CONST(1) << KVM_VM_STRUCT_SHIFT) + +#define KVM_MEM_DIRY_LOG_SHIFT 19 +#define KVM_MEM_DIRTY_LOG_SIZE (__IA64_UL_CONST(1) << KVM_MEM_DIRY_LOG_SHIFT) + +#ifndef __ASSEMBLY__ + +/*Define the max vcpus and memory for Guests.*/ +#define KVM_MAX_VCPUS (KVM_VM_DATA_SIZE - KVM_P2M_SIZE - KVM_VM_STRUCT_SIZE -\ + KVM_MEM_DIRTY_LOG_SIZE) / sizeof(struct kvm_vcpu_data) +#define KVM_MAX_MEM_SIZE (KVM_P2M_SIZE >> 3 << PAGE_SHIFT) + +#define VMM_LOG_LEN 256 + +#include <asm/types.h> +#include <linux/mm.h> +#include <linux/kvm.h> +#include <linux/kvm_para.h> +#include <linux/kvm_types.h> + +#include <asm/pal.h> +#include <asm/sal.h> +#include <asm/page.h> + +struct kvm_vcpu_data { + char vcpu_vhpt[VHPT_SIZE]; + char vcpu_vtlb[VTLB_SIZE]; + char vcpu_vpd[VPD_SIZE]; + char vcpu_struct[VCPU_STRUCT_SIZE]; +}; + +struct kvm_vm_data { + char kvm_p2m[KVM_P2M_SIZE]; + char kvm_vm_struct[KVM_VM_STRUCT_SIZE]; + char kvm_mem_dirty_log[KVM_MEM_DIRTY_LOG_SIZE]; + struct kvm_vcpu_data vcpu_data[KVM_MAX_VCPUS]; +}; + +#define VCPU_BASE(n) (KVM_VM_DATA_BASE + \ + offsetof(struct kvm_vm_data, vcpu_data[n])) +#define KVM_VM_BASE (KVM_VM_DATA_BASE + \ + offsetof(struct kvm_vm_data, kvm_vm_struct)) +#define KVM_MEM_DIRTY_LOG_BASE KVM_VM_DATA_BASE + \ + offsetof(struct kvm_vm_data, kvm_mem_dirty_log) + +#define VHPT_BASE(n) (VCPU_BASE(n) + offsetof(struct kvm_vcpu_data, vcpu_vhpt)) +#define VTLB_BASE(n) (VCPU_BASE(n) + offsetof(struct kvm_vcpu_data, vcpu_vtlb)) +#define VPD_BASE(n) (VCPU_BASE(n) + offsetof(struct kvm_vcpu_data, vcpu_vpd)) +#define VCPU_STRUCT_BASE(n) (VCPU_BASE(n) + \ + offsetof(struct kvm_vcpu_data, vcpu_struct)) + +/*IO section definitions*/ +#define IOREQ_READ 1 +#define IOREQ_WRITE 0 + +#define STATE_IOREQ_NONE 0 +#define STATE_IOREQ_READY 1 +#define STATE_IOREQ_INPROCESS 2 +#define STATE_IORESP_READY 3 + +/*Guest Physical address layout.*/ +#define GPFN_MEM (0UL << 60) /* Guest pfn is normal mem */ +#define GPFN_FRAME_BUFFER (1UL << 60) /* VGA framebuffer */ +#define GPFN_LOW_MMIO (2UL << 60) /* Low MMIO range */ +#define GPFN_PIB (3UL << 60) /* PIB base */ +#define GPFN_IOSAPIC (4UL << 60) /* IOSAPIC base */ +#define GPFN_LEGACY_IO (5UL << 60) /* Legacy I/O base */ +#define GPFN_GFW (6UL << 60) /* Guest Firmware */ +#define GPFN_PHYS_MMIO (7UL << 60) /* Directed MMIO Range */ + +#define GPFN_IO_MASK (7UL << 60) /* Guest pfn is I/O type */ +#define GPFN_INV_MASK (1UL << 63) /* Guest pfn is invalid */ +#define INVALID_MFN (~0UL) +#define MEM_G (1UL << 30) +#define MEM_M (1UL << 20) +#define MMIO_START (3 * MEM_G) +#define MMIO_SIZE (512 * MEM_M) +#define VGA_IO_START 0xA0000UL +#define VGA_IO_SIZE 0x20000 +#define LEGACY_IO_START (MMIO_START + MMIO_SIZE) +#define LEGACY_IO_SIZE (64 * MEM_M) +#define IO_SAPIC_START 0xfec00000UL +#define IO_SAPIC_SIZE 0x100000 +#define PIB_START 0xfee00000UL +#define PIB_SIZE 0x200000 +#define GFW_START (4 * MEM_G - 16 * MEM_M) +#define GFW_SIZE (16 * MEM_M) + +/*Deliver mode, defined for ioapic.c*/ +#define dest_Fixed IOSAPIC_FIXED +#define dest_LowestPrio IOSAPIC_LOWEST_PRIORITY + +#define NMI_VECTOR 2 +#define ExtINT_VECTOR 0 +#define NULL_VECTOR (-1) +#define IA64_SPURIOUS_INT_VECTOR 0x0f + +#define VCPU_LID(v) (((u64)(v)->vcpu_id) << 24) + +/* + *Delivery mode + */ +#define SAPIC_DELIV_SHIFT 8 +#define SAPIC_FIXED 0x0 +#define SAPIC_LOWEST_PRIORITY 0x1 +#define SAPIC_PMI 0x2 +#define SAPIC_NMI 0x4 +#define SAPIC_INIT 0x5 +#define SAPIC_EXTINT 0x7 + +/* + * vcpu->requests bit members for arch + */ +#define KVM_REQ_PTC_G 32 +#define KVM_REQ_RESUME 33 + +#define KVM_HPAGE_GFN_SHIFT(x) 0 +#define KVM_NR_PAGE_SIZES 1 +#define KVM_PAGES_PER_HPAGE(x) 1 + +struct kvm; +struct kvm_vcpu; + +struct kvm_mmio_req { + uint64_t addr; /* physical address */ + uint64_t size; /* size in bytes */ + uint64_t data; /* data (or paddr of data) */ + uint8_t state:4; + uint8_t dir:1; /* 1=read, 0=write */ +}; + +/*Pal data struct */ +struct kvm_pal_call{ + /*In area*/ + uint64_t gr28; + uint64_t gr29; + uint64_t gr30; + uint64_t gr31; + /*Out area*/ + struct ia64_pal_retval ret; +}; + +/* Sal data structure */ +struct kvm_sal_call{ + /*In area*/ + uint64_t in0; + uint64_t in1; + uint64_t in2; + uint64_t in3; + uint64_t in4; + uint64_t in5; + uint64_t in6; + uint64_t in7; + struct sal_ret_values ret; +}; + +/*Guest change rr6*/ +struct kvm_switch_rr6 { + uint64_t old_rr; + uint64_t new_rr; +}; + +union ia64_ipi_a{ + unsigned long val; + struct { + unsigned long rv : 3; + unsigned long ir : 1; + unsigned long eid : 8; + unsigned long id : 8; + unsigned long ib_base : 44; + }; +}; + +union ia64_ipi_d { + unsigned long val; + struct { + unsigned long vector : 8; + unsigned long dm : 3; + unsigned long ig : 53; + }; +}; + +/*ipi check exit data*/ +struct kvm_ipi_data{ + union ia64_ipi_a addr; + union ia64_ipi_d data; +}; + +/*global purge data*/ +struct kvm_ptc_g { + unsigned long vaddr; + unsigned long rr; + unsigned long ps; + struct kvm_vcpu *vcpu; +}; + +/*Exit control data */ +struct exit_ctl_data{ + uint32_t exit_reason; + uint32_t vm_status; + union { + struct kvm_mmio_req ioreq; + struct kvm_pal_call pal_data; + struct kvm_sal_call sal_data; + struct kvm_switch_rr6 rr_data; + struct kvm_ipi_data ipi_data; + struct kvm_ptc_g ptc_g_data; + } u; +}; + +union pte_flags { + unsigned long val; + struct { + unsigned long p : 1; /*0 */ + unsigned long : 1; /* 1 */ + unsigned long ma : 3; /* 2-4 */ + unsigned long a : 1; /* 5 */ + unsigned long d : 1; /* 6 */ + unsigned long pl : 2; /* 7-8 */ + unsigned long ar : 3; /* 9-11 */ + unsigned long ppn : 38; /* 12-49 */ + unsigned long : 2; /* 50-51 */ + unsigned long ed : 1; /* 52 */ + }; +}; + +union ia64_pta { + unsigned long val; + struct { + unsigned long ve : 1; + unsigned long reserved0 : 1; + unsigned long size : 6; + unsigned long vf : 1; + unsigned long reserved1 : 6; + unsigned long base : 49; + }; +}; + +struct thash_cb { + /* THASH base information */ + struct thash_data *hash; /* hash table pointer */ + union ia64_pta pta; + int num; +}; + +struct kvm_vcpu_stat { +}; + +struct kvm_vcpu_arch { + int launched; + int last_exit; + int last_run_cpu; + int vmm_tr_slot; + int vm_tr_slot; + int sn_rtc_tr_slot; + +#define KVM_MP_STATE_RUNNABLE 0 +#define KVM_MP_STATE_UNINITIALIZED 1 +#define KVM_MP_STATE_INIT_RECEIVED 2 +#define KVM_MP_STATE_HALTED 3 + int mp_state; + +#define MAX_PTC_G_NUM 3 + int ptc_g_count; + struct kvm_ptc_g ptc_g_data[MAX_PTC_G_NUM]; + + /*halt timer to wake up sleepy vcpus*/ + struct hrtimer hlt_timer; + long ht_active; + + struct kvm_lapic *apic; /* kernel irqchip context */ + struct vpd *vpd; + + /* Exit data for vmm_transition*/ + struct exit_ctl_data exit_data; + + cpumask_t cache_coherent_map; + + unsigned long vmm_rr; + unsigned long host_rr6; + unsigned long psbits[8]; + unsigned long cr_iipa; + unsigned long cr_isr; + unsigned long vsa_base; + unsigned long dirty_log_lock_pa; + unsigned long __gp; + /* TR and TC. */ + struct thash_data itrs[NITRS]; + struct thash_data dtrs[NDTRS]; + /* Bit is set if there is a tr/tc for the region. */ + unsigned char itr_regions; + unsigned char dtr_regions; + unsigned char tc_regions; + /* purge all */ + unsigned long ptce_base; + unsigned long ptce_count[2]; + unsigned long ptce_stride[2]; + /* itc/itm */ + unsigned long last_itc; + long itc_offset; + unsigned long itc_check; + unsigned long timer_check; + unsigned int timer_pending; + unsigned int timer_fired; + + unsigned long vrr[8]; + unsigned long ibr[8]; + unsigned long dbr[8]; + unsigned long insvc[4]; /* Interrupt in service. */ + unsigned long xtp; + + unsigned long metaphysical_rr0; /* from kvm_arch (so is pinned) */ + unsigned long metaphysical_rr4; /* from kvm_arch (so is pinned) */ + unsigned long metaphysical_saved_rr0; /* from kvm_arch */ + unsigned long metaphysical_saved_rr4; /* from kvm_arch */ + unsigned long fp_psr; /*used for lazy float register */ + unsigned long saved_gp; + /*for phycial emulation */ + int mode_flags; + struct thash_cb vtlb; + struct thash_cb vhpt; + char irq_check; + char irq_new_pending; + + unsigned long opcode; + unsigned long cause; + char log_buf[VMM_LOG_LEN]; + union context host; + union context guest; +}; + +struct kvm_vm_stat { + u64 remote_tlb_flush; +}; + +struct kvm_sal_data { + unsigned long boot_ip; + unsigned long boot_gp; +}; + +struct kvm_arch { + spinlock_t dirty_log_lock; + + unsigned long vm_base; + unsigned long metaphysical_rr0; + unsigned long metaphysical_rr4; + unsigned long vmm_init_rr; + + int is_sn2; + + struct kvm_ioapic *vioapic; + struct kvm_vm_stat stat; + struct kvm_sal_data rdv_sal_data; + + struct list_head assigned_dev_head; + struct iommu_domain *iommu_domain; + int iommu_flags; + + unsigned long irq_sources_bitmap; + unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; +}; + +union cpuid3_t { + u64 value; + struct { + u64 number : 8; + u64 revision : 8; + u64 model : 8; + u64 family : 8; + u64 archrev : 8; + u64 rv : 24; + }; +}; + +struct kvm_pt_regs { + /* The following registers are saved by SAVE_MIN: */ + unsigned long b6; /* scratch */ + unsigned long b7; /* scratch */ + + unsigned long ar_csd; /* used by cmp8xchg16 (scratch) */ + unsigned long ar_ssd; /* reserved for future use (scratch) */ + + unsigned long r8; /* scratch (return value register 0) */ + unsigned long r9; /* scratch (return value register 1) */ + unsigned long r10; /* scratch (return value register 2) */ + unsigned long r11; /* scratch (return value register 3) */ + + unsigned long cr_ipsr; /* interrupted task's psr */ + unsigned long cr_iip; /* interrupted task's instruction pointer */ + unsigned long cr_ifs; /* interrupted task's function state */ + + unsigned long ar_unat; /* interrupted task's NaT register (preserved) */ + unsigned long ar_pfs; /* prev function state */ + unsigned long ar_rsc; /* RSE configuration */ + /* The following two are valid only if cr_ipsr.cpl > 0: */ + unsigned long ar_rnat; /* RSE NaT */ + unsigned long ar_bspstore; /* RSE bspstore */ + + unsigned long pr; /* 64 predicate registers (1 bit each) */ + unsigned long b0; /* return pointer (bp) */ + unsigned long loadrs; /* size of dirty partition << 16 */ + + unsigned long r1; /* the gp pointer */ + unsigned long r12; /* interrupted task's memory stack pointer */ + unsigned long r13; /* thread pointer */ + + unsigned long ar_fpsr; /* floating point status (preserved) */ + unsigned long r15; /* scratch */ + + /* The remaining registers are NOT saved for system calls. */ + unsigned long r14; /* scratch */ + unsigned long r2; /* scratch */ + unsigned long r3; /* scratch */ + unsigned long r16; /* scratch */ + unsigned long r17; /* scratch */ + unsigned long r18; /* scratch */ + unsigned long r19; /* scratch */ + unsigned long r20; /* scratch */ + unsigned long r21; /* scratch */ + unsigned long r22; /* scratch */ + unsigned long r23; /* scratch */ + unsigned long r24; /* scratch */ + unsigned long r25; /* scratch */ + unsigned long r26; /* scratch */ + unsigned long r27; /* scratch */ + unsigned long r28; /* scratch */ + unsigned long r29; /* scratch */ + unsigned long r30; /* scratch */ + unsigned long r31; /* scratch */ + unsigned long ar_ccv; /* compare/exchange value (scratch) */ + + /* + * Floating point registers that the kernel considers scratch: + */ + struct ia64_fpreg f6; /* scratch */ + struct ia64_fpreg f7; /* scratch */ + struct ia64_fpreg f8; /* scratch */ + struct ia64_fpreg f9; /* scratch */ + struct ia64_fpreg f10; /* scratch */ + struct ia64_fpreg f11; /* scratch */ + + unsigned long r4; /* preserved */ + unsigned long r5; /* preserved */ + unsigned long r6; /* preserved */ + unsigned long r7; /* preserved */ + unsigned long eml_unat; /* used for emulating instruction */ + unsigned long pad0; /* alignment pad */ +}; + +static inline struct kvm_pt_regs *vcpu_regs(struct kvm_vcpu *v) +{ + return (struct kvm_pt_regs *) ((unsigned long) v + KVM_STK_OFFSET) - 1; +} + +typedef int kvm_vmm_entry(void); +typedef void kvm_tramp_entry(union context *host, union context *guest); + +struct kvm_vmm_info{ + struct module *module; + kvm_vmm_entry *vmm_entry; + kvm_tramp_entry *tramp_entry; + unsigned long vmm_ivt; + unsigned long patch_mov_ar; + unsigned long patch_mov_ar_sn2; +}; + +int kvm_highest_pending_irq(struct kvm_vcpu *vcpu); +int kvm_emulate_halt(struct kvm_vcpu *vcpu); +int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); +void kvm_sal_emul(struct kvm_vcpu *vcpu); + +#define __KVM_HAVE_ARCH_VM_ALLOC 1 +struct kvm *kvm_arch_alloc_vm(void); +void kvm_arch_free_vm(struct kvm *kvm); + +#endif /* __ASSEMBLY__*/ + +#endif diff --git a/linux/include/asm-ia64/kvm_para.h b/linux/include/asm-ia64/kvm_para.h new file mode 100644 index 0000000..2e2f499 --- /dev/null +++ b/linux/include/asm-ia64/kvm_para.h @@ -0,0 +1,71 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +#ifndef __IA64_KVM_PARA_H +#define __IA64_KVM_PARA_H + +/* + * Copyright (C) 2007 Xiantao Zhang <xiantao.zhang@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + */ + +#ifdef __KERNEL__ + +static inline unsigned int kvm_arch_para_features(void) +{ + return 0; +} + +#endif + +#endif diff --git a/linux/include/asm-x86/hyperv.h b/linux/include/asm-x86/hyperv.h new file mode 100644 index 0000000..8b44b46 --- /dev/null +++ b/linux/include/asm-x86/hyperv.h @@ -0,0 +1,233 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +#ifndef _ASM_X86_HYPERV_H +#define _ASM_X86_HYPERV_H + +#include <linux/types.h> + +/* + * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent + * is set by CPUID(HvCpuIdFunctionVersionAndFeatures). + */ +#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000 +#define HYPERV_CPUID_INTERFACE 0x40000001 +#define HYPERV_CPUID_VERSION 0x40000002 +#define HYPERV_CPUID_FEATURES 0x40000003 +#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004 +#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 + +#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000 +#define HYPERV_CPUID_MIN 0x40000005 +#define HYPERV_CPUID_MAX 0x4000ffff + +/* + * Feature identification. EAX indicates which features are available + * to the partition based upon the current partition privileges. + */ + +/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */ +#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0) +/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ +#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) +/* + * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM + * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available + */ +#define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2) +/* + * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through + * HV_X64_MSR_STIMER3_COUNT) available + */ +#define HV_X64_MSR_SYNTIMER_AVAILABLE (1 << 3) +/* + * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR) + * are available + */ +#define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4) +/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/ +#define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5) +/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/ +#define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6) +/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/ +#define HV_X64_MSR_RESET_AVAILABLE (1 << 7) + /* + * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE, + * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE, + * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available + */ +#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8) + +/* + * Feature identification: EBX indicates which flags were specified at + * partition creation. The format is the same as the partition creation + * flag structure defined in section Partition Creation Flags. + */ +#define HV_X64_CREATE_PARTITIONS (1 << 0) +#define HV_X64_ACCESS_PARTITION_ID (1 << 1) +#define HV_X64_ACCESS_MEMORY_POOL (1 << 2) +#define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3) +#define HV_X64_POST_MESSAGES (1 << 4) +#define HV_X64_SIGNAL_EVENTS (1 << 5) +#define HV_X64_CREATE_PORT (1 << 6) +#define HV_X64_CONNECT_PORT (1 << 7) +#define HV_X64_ACCESS_STATS (1 << 8) +#define HV_X64_DEBUGGING (1 << 11) +#define HV_X64_CPU_POWER_MANAGEMENT (1 << 12) +#define HV_X64_CONFIGURE_PROFILER (1 << 13) + +/* + * Feature identification. EDX indicates which miscellaneous features + * are available to the partition. + */ +/* The MWAIT instruction is available (per section MONITOR / MWAIT) */ +#define HV_X64_MWAIT_AVAILABLE (1 << 0) +/* Guest debugging support is available */ +#define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1) +/* Performance Monitor support is available*/ +#define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2) +/* Support for physical CPU dynamic partitioning events is available*/ +#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3) +/* + * Support for passing hypercall input parameter block via XMM + * registers is available + */ +#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4) +/* Support for a virtual guest idle state is available */ +#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5) + +/* + * Implementation recommendations. Indicates which behaviors the hypervisor + * recommends the OS implement for optimal performance. + */ + /* + * Recommend using hypercall for address space switches rather + * than MOV to CR3 instruction + */ +#define HV_X64_MWAIT_RECOMMENDED (1 << 0) +/* Recommend using hypercall for local TLB flushes rather + * than INVLPG or MOV to CR3 instructions */ +#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1) +/* + * Recommend using hypercall for remote TLB flushes rather + * than inter-processor interrupts + */ +#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2) +/* + * Recommend using MSRs for accessing APIC registers + * EOI, ICR and TPR rather than their memory-mapped counterparts + */ +#define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3) +/* Recommend using the hypervisor-provided MSR to initiate a system RESET */ +#define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4) +/* + * Recommend using relaxed timing for this partition. If used, + * the VM should disable any watchdog timeouts that rely on the + * timely delivery of external interrupts + */ +#define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5) + +/* MSR used to identify the guest OS. */ +#define HV_X64_MSR_GUEST_OS_ID 0x40000000 + +/* MSR used to setup pages used to communicate with the hypervisor. */ +#define HV_X64_MSR_HYPERCALL 0x40000001 + +/* MSR used to provide vcpu index */ +#define HV_X64_MSR_VP_INDEX 0x40000002 + +/* MSR used to read the per-partition time reference counter */ +#define HV_X64_MSR_TIME_REF_COUNT 0x40000020 + +/* Define the virtual APIC registers */ +#define HV_X64_MSR_EOI 0x40000070 +#define HV_X64_MSR_ICR 0x40000071 +#define HV_X64_MSR_TPR 0x40000072 +#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073 + +/* Define synthetic interrupt controller model specific registers. */ +#define HV_X64_MSR_SCONTROL 0x40000080 +#define HV_X64_MSR_SVERSION 0x40000081 +#define HV_X64_MSR_SIEFP 0x40000082 +#define HV_X64_MSR_SIMP 0x40000083 +#define HV_X64_MSR_EOM 0x40000084 +#define HV_X64_MSR_SINT0 0x40000090 +#define HV_X64_MSR_SINT1 0x40000091 +#define HV_X64_MSR_SINT2 0x40000092 +#define HV_X64_MSR_SINT3 0x40000093 +#define HV_X64_MSR_SINT4 0x40000094 +#define HV_X64_MSR_SINT5 0x40000095 +#define HV_X64_MSR_SINT6 0x40000096 +#define HV_X64_MSR_SINT7 0x40000097 +#define HV_X64_MSR_SINT8 0x40000098 +#define HV_X64_MSR_SINT9 0x40000099 +#define HV_X64_MSR_SINT10 0x4000009A +#define HV_X64_MSR_SINT11 0x4000009B +#define HV_X64_MSR_SINT12 0x4000009C +#define HV_X64_MSR_SINT13 0x4000009D +#define HV_X64_MSR_SINT14 0x4000009E +#define HV_X64_MSR_SINT15 0x4000009F + + +#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 +#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 +#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ + (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) + +/* Declare the various hypercall operations. */ +#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT 0x0008 + +#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001 +#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12 +#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \ + (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) + +#define HV_PROCESSOR_POWER_STATE_C0 0 +#define HV_PROCESSOR_POWER_STATE_C1 1 +#define HV_PROCESSOR_POWER_STATE_C2 2 +#define HV_PROCESSOR_POWER_STATE_C3 3 + +/* hypercall status code */ +#define HV_STATUS_SUCCESS 0 +#define HV_STATUS_INVALID_HYPERCALL_CODE 2 +#define HV_STATUS_INVALID_HYPERCALL_INPUT 3 +#define HV_STATUS_INVALID_ALIGNMENT 4 + +#endif diff --git a/linux/include/asm-x86/kvm.h b/linux/include/asm-x86/kvm.h new file mode 100644 index 0000000..12ddb51 --- /dev/null +++ b/linux/include/asm-x86/kvm.h @@ -0,0 +1,364 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +#ifndef _ASM_X86_KVM_H +#define _ASM_X86_KVM_H + +/* + * KVM x86 specific structures and definitions + * + */ + +#include <asm/types.h> +#include <linux/ioctl.h> + +/* Select x86 specific features in <linux/kvm.h> */ +#define __KVM_HAVE_PIT +#define __KVM_HAVE_IOAPIC +#define __KVM_HAVE_DEVICE_ASSIGNMENT +#define __KVM_HAVE_MSI +#define __KVM_HAVE_USER_NMI +#define __KVM_HAVE_GUEST_DEBUG +#define __KVM_HAVE_MSIX +#define __KVM_HAVE_MCE +#define __KVM_HAVE_PIT_STATE2 +#define __KVM_HAVE_XEN_HVM +#define __KVM_HAVE_VCPU_EVENTS +#define __KVM_HAVE_DEBUGREGS +#define __KVM_HAVE_XSAVE +#define __KVM_HAVE_XCRS + +/* Architectural interrupt line count. */ +#define KVM_NR_INTERRUPTS 256 + +struct kvm_memory_alias { + __u32 slot; /* this has a different namespace than memory slots */ + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; + __u64 target_phys_addr; +}; + +/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */ +struct kvm_pic_state { + __u8 last_irr; /* edge detection */ + __u8 irr; /* interrupt request register */ + __u8 imr; /* interrupt mask register */ + __u8 isr; /* interrupt service register */ + __u8 priority_add; /* highest irq priority */ + __u8 irq_base; + __u8 read_reg_select; + __u8 poll; + __u8 special_mask; + __u8 init_state; + __u8 auto_eoi; + __u8 rotate_on_auto_eoi; + __u8 special_fully_nested_mode; + __u8 init4; /* true if 4 byte init */ + __u8 elcr; /* PIIX edge/trigger selection */ + __u8 elcr_mask; +}; + +#define KVM_IOAPIC_NUM_PINS 24 +struct kvm_ioapic_state { + __u64 base_address; + __u32 ioregsel; + __u32 id; + __u32 irr; + __u32 pad; + union { + __u64 bits; + struct { + __u8 vector; + __u8 delivery_mode:3; + __u8 dest_mode:1; + __u8 delivery_status:1; + __u8 polarity:1; + __u8 remote_irr:1; + __u8 trig_mode:1; + __u8 mask:1; + __u8 reserve:7; + __u8 reserved[4]; + __u8 dest_id; + } fields; + } redirtbl[KVM_IOAPIC_NUM_PINS]; +}; + +#define KVM_IRQCHIP_PIC_MASTER 0 +#define KVM_IRQCHIP_PIC_SLAVE 1 +#define KVM_IRQCHIP_IOAPIC 2 +#define KVM_NR_IRQCHIPS 3 + +/* for KVM_GET_REGS and KVM_SET_REGS */ +struct kvm_regs { + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 rax, rbx, rcx, rdx; + __u64 rsi, rdi, rsp, rbp; + __u64 r8, r9, r10, r11; + __u64 r12, r13, r14, r15; + __u64 rip, rflags; +}; + +/* for KVM_GET_LAPIC and KVM_SET_LAPIC */ +#define KVM_APIC_REG_SIZE 0x400 +struct kvm_lapic_state { + char regs[KVM_APIC_REG_SIZE]; +}; + +struct kvm_segment { + __u64 base; + __u32 limit; + __u16 selector; + __u8 type; + __u8 present, dpl, db, s, l, g, avl; + __u8 unusable; + __u8 padding; +}; + +struct kvm_dtable { + __u64 base; + __u16 limit; + __u16 padding[3]; +}; + + +/* for KVM_GET_SREGS and KVM_SET_SREGS */ +struct kvm_sregs { + /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */ + struct kvm_segment cs, ds, es, fs, gs, ss; + struct kvm_segment tr, ldt; + struct kvm_dtable gdt, idt; + __u64 cr0, cr2, cr3, cr4, cr8; + __u64 efer; + __u64 apic_base; + __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; +}; + +/* for KVM_GET_FPU and KVM_SET_FPU */ +struct kvm_fpu { + __u8 fpr[8][16]; + __u16 fcw; + __u16 fsw; + __u8 ftwx; /* in fxsave format */ + __u8 pad1; + __u16 last_opcode; + __u64 last_ip; + __u64 last_dp; + __u8 xmm[16][16]; + __u32 mxcsr; + __u32 pad2; +}; + +struct kvm_msr_entry { + __u32 index; + __u32 reserved; + __u64 data; +}; + +/* for KVM_GET_MSRS and KVM_SET_MSRS */ +struct kvm_msrs { + __u32 nmsrs; /* number of msrs in entries */ + __u32 pad; + + struct kvm_msr_entry entries[0]; +}; + +/* for KVM_GET_MSR_INDEX_LIST */ +struct kvm_msr_list { + __u32 nmsrs; /* number of msrs in entries */ + __u32 indices[0]; +}; + + +struct kvm_cpuid_entry { + __u32 function; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding; +}; + +/* for KVM_SET_CPUID */ +struct kvm_cpuid { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry entries[0]; +}; + +struct kvm_cpuid_entry2 { + __u32 function; + __u32 index; + __u32 flags; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding[3]; +}; + +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1 +#define KVM_CPUID_FLAG_STATEFUL_FUNC 2 +#define KVM_CPUID_FLAG_STATE_READ_NEXT 4 + +/* for KVM_SET_CPUID2 */ +struct kvm_cpuid2 { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry2 entries[0]; +}; + +/* for KVM_GET_PIT and KVM_SET_PIT */ +struct kvm_pit_channel_state { + __u32 count; /* can be 65536 */ + __u16 latched_count; + __u8 count_latched; + __u8 status_latched; + __u8 status; + __u8 read_state; + __u8 write_state; + __u8 write_latch; + __u8 rw_mode; + __u8 mode; + __u8 bcd; + __u8 gate; + __s64 count_load_time; +}; + +struct kvm_debug_exit_arch { + __u32 exception; + __u32 pad; + __u64 pc; + __u64 dr6; + __u64 dr7; +}; + +#define KVM_GUESTDBG_USE_SW_BP 0x00010000 +#define KVM_GUESTDBG_USE_HW_BP 0x00020000 +#define KVM_GUESTDBG_INJECT_DB 0x00040000 +#define KVM_GUESTDBG_INJECT_BP 0x00080000 + +/* for KVM_SET_GUEST_DEBUG */ +struct kvm_guest_debug_arch { + __u64 debugreg[8]; +}; + +struct kvm_pit_state { + struct kvm_pit_channel_state channels[3]; +}; + +#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 + +struct kvm_pit_state2 { + struct kvm_pit_channel_state channels[3]; + __u32 flags; + __u32 reserved[9]; +}; + +struct kvm_reinject_control { + __u8 pit_reinject; + __u8 reserved[31]; +}; + +/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */ +#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001 +#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 +#define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 + +/* Interrupt shadow states */ +#define KVM_X86_SHADOW_INT_MOV_SS 0x01 +#define KVM_X86_SHADOW_INT_STI 0x02 + +/* for KVM_GET/SET_VCPU_EVENTS */ +struct kvm_vcpu_events { + struct { + __u8 injected; + __u8 nr; + __u8 has_error_code; + __u8 pad; + __u32 error_code; + } exception; + struct { + __u8 injected; + __u8 nr; + __u8 soft; + __u8 shadow; + } interrupt; + struct { + __u8 injected; + __u8 pending; + __u8 masked; + __u8 pad; + } nmi; + __u32 sipi_vector; + __u32 flags; + __u32 reserved[10]; +}; + +/* for KVM_GET/SET_DEBUGREGS */ +struct kvm_debugregs { + __u64 db[4]; + __u64 dr6; + __u64 dr7; + __u64 flags; + __u64 reserved[9]; +}; + +/* for KVM_CAP_XSAVE */ +struct kvm_xsave { + __u32 region[1024]; +}; + +#define KVM_MAX_XCRS 16 + +struct kvm_xcr { + __u32 xcr; + __u32 reserved; + __u64 value; +}; + +struct kvm_xcrs { + __u32 nr_xcrs; + __u32 flags; + struct kvm_xcr xcrs[KVM_MAX_XCRS]; + __u64 padding[16]; +}; + +#endif /* _ASM_X86_KVM_H */ diff --git a/linux/include/asm-x86/kvm_emulate.h b/linux/include/asm-x86/kvm_emulate.h new file mode 100644 index 0000000..a57ee7e --- /dev/null +++ b/linux/include/asm-x86/kvm_emulate.h @@ -0,0 +1,318 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +/****************************************************************************** + * x86_emulate.h + * + * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. + * + * Copyright (c) 2005 Keir Fraser + * + * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 + */ + +#ifndef _ASM_X86_KVM_X86_EMULATE_H +#define _ASM_X86_KVM_X86_EMULATE_H + + + +struct x86_emulate_ctxt; + +struct x86_exception { + u8 vector; + bool error_code_valid; + u16 error_code; + bool nested_page_fault; + u64 address; /* cr2 or nested page fault gpa */ +}; + +/* + * x86_emulate_ops: + * + * These operations represent the instruction emulator's interface to memory. + * There are two categories of operation: those that act on ordinary memory + * regions (*_std), and those that act on memory regions known to require + * special treatment or emulation (*_emulated). + * + * The emulator assumes that an instruction accesses only one 'emulated memory' + * location, that this location is the given linear faulting address (cr2), and + * that this is one of the instruction's data operands. Instruction fetches and + * stack operations are assumed never to access emulated memory. The emulator + * automatically deduces which operand of a string-move operation is accessing + * emulated memory, and assumes that the other operand accesses normal memory. + * + * NOTES: + * 1. The emulator isn't very smart about emulated vs. standard memory. + * 'Emulated memory' access addresses should be checked for sanity. + * 'Normal memory' accesses may fault, and the caller must arrange to + * detect and handle reentrancy into the emulator via recursive faults. + * Accesses may be unaligned and may cross page boundaries. + * 2. If the access fails (cannot emulate, or a standard access faults) then + * it is up to the memop to propagate the fault to the guest VM via + * some out-of-band mechanism, unknown to the emulator. The memop signals + * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will + * then immediately bail. + * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only + * cmpxchg8b_emulated need support 8-byte accesses. + * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system. + */ +/* Access completed successfully: continue emulation as normal. */ +#define X86EMUL_CONTINUE 0 +/* Access is unhandleable: bail from emulation and return error to caller. */ +#define X86EMUL_UNHANDLEABLE 1 +/* Terminate emulation but return success to the caller. */ +#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ +#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */ +#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */ +#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */ + +struct x86_emulate_ops { + /* + * read_std: Read bytes of standard (non-emulated/special) memory. + * Used for descriptor reading. + * @addr: [IN ] Linear address from which to read. + * @val: [OUT] Value read from memory, zero-extended to 'u_long'. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*read_std)(unsigned long addr, void *val, + unsigned int bytes, struct kvm_vcpu *vcpu, + struct x86_exception *fault); + + /* + * write_std: Write bytes of standard (non-emulated/special) memory. + * Used for descriptor writing. + * @addr: [IN ] Linear address to which to write. + * @val: [OUT] Value write to memory, zero-extended to 'u_long'. + * @bytes: [IN ] Number of bytes to write to memory. + */ + int (*write_std)(unsigned long addr, void *val, + unsigned int bytes, struct kvm_vcpu *vcpu, + struct x86_exception *fault); + /* + * fetch: Read bytes of standard (non-emulated/special) memory. + * Used for instruction fetch. + * @addr: [IN ] Linear address from which to read. + * @val: [OUT] Value read from memory, zero-extended to 'u_long'. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*fetch)(unsigned long addr, void *val, + unsigned int bytes, struct kvm_vcpu *vcpu, + struct x86_exception *fault); + + /* + * read_emulated: Read bytes from emulated/special memory area. + * @addr: [IN ] Linear address from which to read. + * @val: [OUT] Value read from memory, zero-extended to 'u_long'. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*read_emulated)(unsigned long addr, + void *val, + unsigned int bytes, + struct x86_exception *fault, + struct kvm_vcpu *vcpu); + + /* + * write_emulated: Write bytes to emulated/special memory area. + * @addr: [IN ] Linear address to which to write. + * @val: [IN ] Value to write to memory (low-order bytes used as + * required). + * @bytes: [IN ] Number of bytes to write to memory. + */ + int (*write_emulated)(unsigned long addr, + const void *val, + unsigned int bytes, + struct x86_exception *fault, + struct kvm_vcpu *vcpu); + + /* + * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an + * emulated/special memory area. + * @addr: [IN ] Linear address to access. + * @old: [IN ] Value expected to be current at @addr. + * @new: [IN ] Value to write to @addr. + * @bytes: [IN ] Number of bytes to access using CMPXCHG. + */ + int (*cmpxchg_emulated)(unsigned long addr, + const void *old, + const void *new, + unsigned int bytes, + struct x86_exception *fault, + struct kvm_vcpu *vcpu); + + int (*pio_in_emulated)(int size, unsigned short port, void *val, + unsigned int count, struct kvm_vcpu *vcpu); + + int (*pio_out_emulated)(int size, unsigned short port, const void *val, + unsigned int count, struct kvm_vcpu *vcpu); + + bool (*get_cached_descriptor)(struct kvm_desc_struct *desc, + int seg, struct kvm_vcpu *vcpu); + void (*set_cached_descriptor)(struct kvm_desc_struct *desc, + int seg, struct kvm_vcpu *vcpu); + u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); + void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); + unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); + void (*get_gdt)(struct kvm_desc_ptr *dt, struct kvm_vcpu *vcpu); + void (*get_idt)(struct kvm_desc_ptr *dt, struct kvm_vcpu *vcpu); + ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); + int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); + int (*cpl)(struct kvm_vcpu *vcpu); + int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu); + int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu); + int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); + int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); +}; + +/* Type, address-of, and value of an instruction's operand. */ +struct operand { + enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; + unsigned int bytes; + union { + unsigned long orig_val; + u64 orig_val64; + }; + union { + unsigned long *reg; + struct segmented_address { + ulong ea; + unsigned seg; + } mem; + } addr; + union { + unsigned long val; + u64 val64; + char valptr[sizeof(unsigned long) + 2]; + }; +}; + +struct fetch_cache { + u8 data[15]; + unsigned long start; + unsigned long end; +}; + +struct read_cache { + u8 data[1024]; + unsigned long pos; + unsigned long end; +}; + +struct decode_cache { + u8 twobyte; + u8 b; + u8 lock_prefix; + u8 rep_prefix; + u8 op_bytes; + u8 ad_bytes; + u8 rex_prefix; + struct operand src; + struct operand src2; + struct operand dst; + bool has_seg_override; + u8 seg_override; + unsigned int d; + int (*execute)(struct x86_emulate_ctxt *ctxt); + unsigned long regs[NR_VCPU_REGS]; + unsigned long eip; + /* modrm */ + u8 modrm; + u8 modrm_mod; + u8 modrm_reg; + u8 modrm_rm; + u8 modrm_seg; + bool rip_relative; + struct fetch_cache fetch; + struct read_cache io_read; + struct read_cache mem_read; +}; + +struct x86_emulate_ctxt { + struct x86_emulate_ops *ops; + + /* Register state before/after emulation. */ + struct kvm_vcpu *vcpu; + + unsigned long eflags; + unsigned long eip; /* eip before instruction emulation */ + /* Emulated execution mode, represented by an X86EMUL_MODE value. */ + int mode; + u32 cs_base; + + /* interruptibility state, as a result of execution of STI or MOV SS */ + int interruptibility; + + bool perm_ok; /* do not check permissions if true */ + + bool have_exception; + struct x86_exception exception; + + /* decode cache */ + struct decode_cache decode; +}; + +/* Repeat String Operation Prefix */ +#define REPE_PREFIX 1 +#define REPNE_PREFIX 2 + +/* Execution mode, passed to the emulator. */ +#define X86EMUL_MODE_REAL 0 /* Real mode. */ +#define X86EMUL_MODE_VM86 1 /* Virtual 8086 mode. */ +#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ +#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ +#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ + +/* Host execution mode. */ +#if defined(CONFIG_X86_32) +#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 +#elif defined(CONFIG_X86_64) +#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 +#endif + +int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len); +#define EMULATION_FAILED -1 +#define EMULATION_OK 0 +#define EMULATION_RESTART 1 +int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); +int emulator_task_switch(struct x86_emulate_ctxt *ctxt, + u16 tss_selector, int reason, + bool has_error_code, u32 error_code); +int emulate_int_real(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int irq); +#endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/linux/include/asm-x86/kvm_host.h b/linux/include/asm-x86/kvm_host.h new file mode 100644 index 0000000..0c5e1f7 --- /dev/null +++ b/linux/include/asm-x86/kvm_host.h @@ -0,0 +1,888 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +/* + * Kernel-based Virtual Machine driver for Linux + * + * This header defines architecture specific interfaces, x86 version + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef _ASM_X86_KVM_HOST_H +#define _ASM_X86_KVM_HOST_H + +#include <asm/types.h> +#include <linux/mm.h> +#include <linux/mmu_notifier.h> +#include <linux/tracepoint.h> +#include <linux/cpumask.h> + +#include <linux/kvm.h> +#include <linux/kvm_para.h> +#include <linux/kvm_types.h> + +#include <asm/pvclock-abi.h> +#include <asm/desc.h> +#include <asm/mtrr.h> +#include <asm/msr-index.h> + +#define KVM_MAX_VCPUS 64 +#define KVM_MEMORY_SLOTS 32 +/* memory slots that does not exposed to userspace */ +#define KVM_PRIVATE_MEM_SLOTS 4 + +#define KVM_PIO_PAGE_OFFSET 1 +#define KVM_COALESCED_MMIO_PAGE_OFFSET 2 + +#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) +#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) +#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ + 0xFFFFFF0000000000ULL) + +#define INVALID_PAGE (~(hpa_t)0) +#define VALID_PAGE(x) ((x) != INVALID_PAGE) + +#define UNMAPPED_GVA (~(gpa_t)0) + +/* KVM Hugepage definitions for x86 */ +#define KVM_NR_PAGE_SIZES 3 +#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9) +#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x)) +#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) +#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) +#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) + +#define DE_VECTOR 0 +#define DB_VECTOR 1 +#define BP_VECTOR 3 +#define OF_VECTOR 4 +#define BR_VECTOR 5 +#define UD_VECTOR 6 +#define NM_VECTOR 7 +#define DF_VECTOR 8 +#define TS_VECTOR 10 +#define NP_VECTOR 11 +#define SS_VECTOR 12 +#define GP_VECTOR 13 +#define PF_VECTOR 14 +#define MF_VECTOR 16 +#define MC_VECTOR 18 + +#define SELECTOR_TI_MASK (1 << 2) +#define SELECTOR_RPL_MASK 0x03 + +#define IOPL_SHIFT 12 + +#define KVM_PERMILLE_MMU_PAGES 20 +#define KVM_MIN_ALLOC_MMU_PAGES 64 +#define KVM_MMU_HASH_SHIFT 10 +#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) +#define KVM_MIN_FREE_MMU_PAGES 5 +#define KVM_REFILL_PAGES 25 +#define KVM_MAX_CPUID_ENTRIES 80 +#define KVM_NR_FIXED_MTRR_REGION 88 +#define KVM_NR_VAR_MTRR 8 + +#define ASYNC_PF_PER_VCPU 64 + +extern spinlock_t kvm_lock; +extern struct list_head vm_list; + +struct kvm_vcpu; +struct kvm; +struct kvm_async_pf; + +enum kvm_reg { + VCPU_REGS_RAX = 0, + VCPU_REGS_RCX = 1, + VCPU_REGS_RDX = 2, + VCPU_REGS_RBX = 3, + VCPU_REGS_RSP = 4, + VCPU_REGS_RBP = 5, + VCPU_REGS_RSI = 6, + VCPU_REGS_RDI = 7, +#ifdef CONFIG_X86_64 + VCPU_REGS_R8 = 8, + VCPU_REGS_R9 = 9, + VCPU_REGS_R10 = 10, + VCPU_REGS_R11 = 11, + VCPU_REGS_R12 = 12, + VCPU_REGS_R13 = 13, + VCPU_REGS_R14 = 14, + VCPU_REGS_R15 = 15, +#endif + VCPU_REGS_RIP, + NR_VCPU_REGS +}; + +enum kvm_reg_ex { + VCPU_EXREG_PDPTR = NR_VCPU_REGS, + VCPU_EXREG_CR3, +}; + +enum { + VCPU_SREG_ES, + VCPU_SREG_CS, + VCPU_SREG_SS, + VCPU_SREG_DS, + VCPU_SREG_FS, + VCPU_SREG_GS, + VCPU_SREG_TR, + VCPU_SREG_LDTR, +}; + +#include <asm/kvm_emulate.h> + +#define KVM_NR_MEM_OBJS 40 + +#define KVM_NR_DB_REGS 4 + +#define DR6_BD (1 << 13) +#define DR6_BS (1 << 14) +#define DR6_FIXED_1 0xffff0ff0 +#define DR6_VOLATILE 0x0000e00f + +#define DR7_BP_EN_MASK 0x000000ff +#define DR7_GE (1 << 9) +#define DR7_GD (1 << 13) +#define DR7_FIXED_1 0x00000400 +#define DR7_VOLATILE 0xffff23ff + +/* + * We don't want allocation failures within the mmu code, so we preallocate + * enough memory for a single page fault in a cache. + */ +struct kvm_mmu_memory_cache { + int nobjs; + void *objects[KVM_NR_MEM_OBJS]; +}; + +#define NR_PTE_CHAIN_ENTRIES 5 + +struct kvm_pte_chain { + u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES]; + struct hlist_node link; +}; + +/* + * kvm_mmu_page_role, below, is defined as: + * + * bits 0:3 - total guest paging levels (2-4, or zero for real mode) + * bits 4:7 - page table level for this shadow (1-4) + * bits 8:9 - page table quadrant for 2-level guests + * bit 16 - direct mapping of virtual to physical mapping at gfn + * used for real mode and two-dimensional paging + * bits 17:19 - common access permissions for all ptes in this shadow page + */ +union kvm_mmu_page_role { + unsigned word; + struct { + unsigned level:4; + unsigned cr4_pae:1; + unsigned quadrant:2; + unsigned pad_for_nice_hex_output:6; + unsigned direct:1; + unsigned access:3; + unsigned invalid:1; + unsigned nxe:1; + unsigned cr0_wp:1; + }; +}; + +struct kvm_mmu_page { + struct list_head link; + struct hlist_node hash_link; + + /* + * The following two entries are used to key the shadow page in the + * hash table. + */ + gfn_t gfn; + union kvm_mmu_page_role role; + + u64 *spt; + /* hold the gfn of each spte inside spt */ + gfn_t *gfns; + /* + * One bit set per slot which has memory + * in this shadow page. + */ + DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); + bool multimapped; /* More than one parent_pte? */ + bool unsync; + int root_count; /* Currently serving as active root */ + unsigned int unsync_children; + union { + u64 *parent_pte; /* !multimapped */ + struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ + }; + DECLARE_BITMAP(unsync_child_bitmap, 512); +}; + +struct kvm_pv_mmu_op_buffer { + void *ptr; + unsigned len; + unsigned processed; + char buf[512] __aligned(sizeof(long)); +}; + +struct kvm_pio_request { + unsigned long count; + int in; + int port; + int size; +}; + +/* + * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level + * 32-bit). The kvm_mmu structure abstracts the details of the current mmu + * mode. + */ +struct kvm_mmu { + void (*new_cr3)(struct kvm_vcpu *vcpu); + void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); + unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); + int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err, + bool prefault); + void (*inject_page_fault)(struct kvm_vcpu *vcpu, + struct x86_exception *fault); + void (*free)(struct kvm_vcpu *vcpu); + gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, + struct x86_exception *exception); + gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); + void (*prefetch_page)(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page); + int (*sync_page)(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp); + void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); + hpa_t root_hpa; + int root_level; + int shadow_root_level; + union kvm_mmu_page_role base_role; + bool direct_map; + + u64 *pae_root; + u64 *lm_root; + u64 rsvd_bits_mask[2][4]; + + bool nx; + + u64 pdptrs[4]; /* pae */ +}; + +struct kvm_vcpu_arch { + /* + * rip and regs accesses must go through + * kvm_{register,rip}_{read,write} functions. + */ + unsigned long regs[NR_VCPU_REGS]; + u32 regs_avail; + u32 regs_dirty; + + unsigned long cr0; + unsigned long cr0_guest_owned_bits; + unsigned long cr2; + unsigned long cr3; + unsigned long cr4; + unsigned long cr4_guest_owned_bits; + unsigned long cr8; + u32 hflags; + u64 efer; + u64 apic_base; + struct kvm_lapic *apic; /* kernel irqchip context */ + int32_t apic_arb_prio; + int mp_state; + int sipi_vector; + u64 ia32_misc_enable_msr; + bool tpr_access_reporting; + + /* + * Paging state of the vcpu + * + * If the vcpu runs in guest mode with two level paging this still saves + * the paging mode of the l1 guest. This context is always used to + * handle faults. + */ + struct kvm_mmu mmu; + + /* + * Paging state of an L2 guest (used for nested npt) + * + * This context will save all necessary information to walk page tables + * of the an L2 guest. This context is only initialized for page table + * walking and not for faulting since we never handle l2 page faults on + * the host. + */ + struct kvm_mmu nested_mmu; + + /* + * Pointer to the mmu context currently used for + * gva_to_gpa translations. + */ + struct kvm_mmu *walk_mmu; + + /* only needed in kvm_pv_mmu_op() path, but it's hot so + * put it here to avoid allocation */ + struct kvm_pv_mmu_op_buffer mmu_op_buffer; + + struct kvm_mmu_memory_cache mmu_pte_chain_cache; + struct kvm_mmu_memory_cache mmu_rmap_desc_cache; + struct kvm_mmu_memory_cache mmu_page_cache; + struct kvm_mmu_memory_cache mmu_page_header_cache; + + gfn_t last_pt_write_gfn; + int last_pt_write_count; + u64 *last_pte_updated; + gfn_t last_pte_gfn; + + struct { + gfn_t gfn; /* presumed gfn during guest pte update */ + pfn_t pfn; /* pfn corresponding to that gfn */ + unsigned long mmu_seq; + } update_pte; + + struct kvm_compat_fpu guest_fpu; + u64 xcr0; + + gva_t mmio_fault_cr2; + struct kvm_pio_request pio; + void *pio_data; + + u8 event_exit_inst_len; + + struct kvm_queued_exception { + bool pending; + bool has_error_code; + bool reinject; + u8 nr; + u32 error_code; + } exception; + + struct kvm_queued_interrupt { + bool pending; + bool soft; + u8 nr; + } interrupt; + + int halt_request; /* real mode on Intel only */ + + int cpuid_nent; + struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; + /* emulate context */ + + struct x86_emulate_ctxt emulate_ctxt; + + gpa_t time; + struct kvm_pvclock_vcpu_time_info hv_clock; + unsigned int hw_tsc_khz; + unsigned int time_offset; + struct page *time_page; + u64 last_host_tsc; + u64 last_guest_tsc; + u64 last_kernel_ns; + u64 last_tsc_nsec; + u64 last_tsc_write; + bool tsc_catchup; + + bool nmi_pending; + bool nmi_injected; + + struct mtrr_state_type mtrr_state; + u32 pat; + + int switch_db_regs; + unsigned long db[KVM_NR_DB_REGS]; + unsigned long dr6; + unsigned long dr7; + unsigned long eff_db[KVM_NR_DB_REGS]; + + u64 mcg_cap; + u64 mcg_status; + u64 mcg_ctl; + u64 *mce_banks; + + /* used for guest single stepping over the given code position */ + unsigned long singlestep_rip; + + /* fields used by HYPER-V emulation */ + u64 hv_vapic; + + cpumask_var_t wbinvd_dirty_mask; + + struct { + bool halted; + gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)]; + struct gfn_to_hva_cache data; + u64 msr_val; + u32 id; + bool send_user_only; + } apf; +}; + +struct kvm_arch { + unsigned int n_used_mmu_pages; + unsigned int n_requested_mmu_pages; + unsigned int n_max_mmu_pages; + atomic_t invlpg_counter; + struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; + /* + * Hash table of struct kvm_mmu_page. + */ + struct list_head active_mmu_pages; + struct list_head assigned_dev_head; + struct iommu_domain *iommu_domain; + int iommu_flags; + struct kvm_pic *vpic; + struct kvm_ioapic *vioapic; + struct kvm_pit *vpit; + int vapics_in_nmi_mode; + + unsigned int tss_addr; + struct page *apic_access_page; + + gpa_t wall_clock; + + struct page *ept_identity_pagetable; + bool ept_identity_pagetable_done; + gpa_t ept_identity_map_addr; + + unsigned long irq_sources_bitmap; + s64 kvmclock_offset; + spinlock_t tsc_write_lock; + u64 last_tsc_nsec; + u64 last_tsc_offset; + u64 last_tsc_write; + u32 virtual_tsc_khz; + u32 virtual_tsc_mult; + s8 virtual_tsc_shift; + + struct kvm_xen_hvm_config xen_hvm_config; + + /* fields used by HYPER-V emulation */ + u64 hv_guest_os_id; + u64 hv_hypercall; + + #ifdef CONFIG_KVM_MMU_AUDIT + int audit_point; + #endif +}; + +struct kvm_vm_stat { + u32 mmu_shadow_zapped; + u32 mmu_pte_write; + u32 mmu_pte_updated; + u32 mmu_pde_zapped; + u32 mmu_flooded; + u32 mmu_recycled; + u32 mmu_cache_miss; + u32 mmu_unsync; + u32 remote_tlb_flush; + u32 lpages; +}; + +struct kvm_vcpu_stat { + u32 pf_fixed; + u32 pf_guest; + u32 tlb_flush; + u32 invlpg; + + u32 exits; + u32 io_exits; + u32 mmio_exits; + u32 signal_exits; + u32 irq_window_exits; + u32 nmi_window_exits; + u32 halt_exits; + u32 halt_wakeup; + u32 request_irq_exits; + u32 irq_exits; + u32 host_state_reload; + u32 efer_reload; + u32 fpu_reload; + u32 insn_emulation; + u32 insn_emulation_fail; + u32 hypercalls; + u32 irq_injections; + u32 nmi_injections; +}; + +struct kvm_x86_ops { + int (*cpu_has_kvm_support)(void); /* __init */ + int (*disabled_by_bios)(void); /* __init */ + int (*hardware_enable)(void *dummy); + void (*hardware_disable)(void *dummy); + void (*check_processor_compatibility)(void *rtn); + int (*hardware_setup)(void); /* __init */ + void (*hardware_unsetup)(void); /* __exit */ + bool (*cpu_has_accelerated_tpr)(void); + void (*cpuid_update)(struct kvm_vcpu *vcpu); + + /* Create, but do not attach this VCPU */ + struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); + void (*vcpu_free)(struct kvm_vcpu *vcpu); + int (*vcpu_reset)(struct kvm_vcpu *vcpu); + + void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); + void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); + void (*vcpu_put)(struct kvm_vcpu *vcpu); + + void (*set_guest_debug)(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg); + int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); + int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); + u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); + void (*get_segment)(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); + int (*get_cpl)(struct kvm_vcpu *vcpu); + void (*set_segment)(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); + void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); + void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); + void (*decache_cr3)(struct kvm_vcpu *vcpu); + void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); + void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); + void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); + void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); + void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); + void (*get_idt)(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt); + void (*set_idt)(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt); + void (*get_gdt)(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt); + void (*set_gdt)(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt); + void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); + void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); + unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); + void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + void (*fpu_activate)(struct kvm_vcpu *vcpu); + void (*fpu_deactivate)(struct kvm_vcpu *vcpu); + + void (*tlb_flush)(struct kvm_vcpu *vcpu); + + void (*run)(struct kvm_vcpu *vcpu); + int (*handle_exit)(struct kvm_vcpu *vcpu); + void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); + void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); + u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); + void (*patch_hypercall)(struct kvm_vcpu *vcpu, + unsigned char *hypercall_addr); + void (*set_irq)(struct kvm_vcpu *vcpu); + void (*set_nmi)(struct kvm_vcpu *vcpu); + void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, + bool has_error_code, u32 error_code, + bool reinject); + void (*cancel_injection)(struct kvm_vcpu *vcpu); + int (*interrupt_allowed)(struct kvm_vcpu *vcpu); + int (*nmi_allowed)(struct kvm_vcpu *vcpu); + bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); + void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); + void (*enable_nmi_window)(struct kvm_vcpu *vcpu); + void (*enable_irq_window)(struct kvm_vcpu *vcpu); + void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); + int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); + int (*get_tdp_level)(void); + u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); + int (*get_lpage_level)(void); + bool (*rdtscp_supported)(void); + void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment); + + void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); + + void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); + + bool (*has_wbinvd_exit)(void); + + void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); + + void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); + const struct trace_print_flags *exit_reasons_str; +}; + +struct kvm_arch_async_pf { + u32 token; + gfn_t gfn; + unsigned long cr3; + bool direct_map; +}; + +extern struct kvm_x86_ops *kvm_x86_ops; + +int kvm_mmu_module_init(void); +void kvm_mmu_module_exit(void); + +void kvm_mmu_destroy(struct kvm_vcpu *vcpu); +int kvm_mmu_create(struct kvm_vcpu *vcpu); +int kvm_mmu_setup(struct kvm_vcpu *vcpu); +void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); +void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, + u64 dirty_mask, u64 nx_mask, u64 x_mask); + +int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); +void kvm_mmu_zap_all(struct kvm *kvm); +unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); +void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); + +int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); + +int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, + const void *val, int bytes); +int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, + gpa_t addr, unsigned long *ret); +u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); + +extern bool tdp_enabled; + +enum emulation_result { + EMULATE_DONE, /* no further processing */ + EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ + EMULATE_FAIL, /* can't emulate this instruction */ +}; + +#define EMULTYPE_NO_DECODE (1 << 0) +#define EMULTYPE_TRAP_UD (1 << 1) +#define EMULTYPE_SKIP (1 << 2) +int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, + int emulation_type, void *insn, int insn_len); + +static inline int emulate_instruction(struct kvm_vcpu *vcpu, + int emulation_type) +{ + return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); +} + +void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); +void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); + +void kvm_enable_efer_bits(u64); +int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); +int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); + +struct x86_emulate_ctxt; + +int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port); +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); +int kvm_emulate_halt(struct kvm_vcpu *vcpu); +int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); +int emulate_clts(struct kvm_vcpu *vcpu); +int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); + +void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); + +int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, + bool has_error_code, u32 error_code); + +int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); +int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); +int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); +int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); +int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); +unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); +void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); +void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); +int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); + +int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); +int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); + +unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); +void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); + +void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); +void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); +void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); +void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); +void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); +int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gfn_t gfn, void *data, int offset, int len, + u32 access); +void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); +bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); + +int kvm_pic_set_irq(void *opaque, int irq, int level); + +void kvm_inject_nmi(struct kvm_vcpu *vcpu); + +int fx_init(struct kvm_vcpu *vcpu); + +void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); +void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *new, int bytes, + bool guest_initiated); +int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); +void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); +int kvm_mmu_load(struct kvm_vcpu *vcpu); +void kvm_mmu_unload(struct kvm_vcpu *vcpu); +void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, + struct x86_exception *exception); +gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, + struct x86_exception *exception); +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, + struct x86_exception *exception); +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, + struct x86_exception *exception); + +int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); + +int kvm_fix_hypercall(struct kvm_vcpu *vcpu); + +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, + void *insn, int insn_len); +void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); + +void kvm_enable_tdp(void); +void kvm_disable_tdp(void); + +int complete_pio(struct kvm_vcpu *vcpu); +bool kvm_check_iopl(struct kvm_vcpu *vcpu); + +static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) +{ + struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); + + return (struct kvm_mmu_page *)page_private(page); +} + +static inline u16 kvm_read_ldt(void) +{ + u16 ldt; + asm("sldt %0" : "=g"(ldt)); + return ldt; +} + +static inline void kvm_load_ldt(u16 sel) +{ + asm("lldt %0" : : "rm"(sel)); +} + +#ifdef CONFIG_X86_64 +static inline unsigned long read_msr(unsigned long msr) +{ + u64 value; + + rdmsrl(msr, value); + return value; +} +#endif + +static inline u32 get_rdx_init_val(void) +{ + return 0x600; /* P6 family */ +} + +static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) +{ + kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); +} + +#define TSS_IOPB_BASE_OFFSET 0x66 +#define TSS_BASE_SIZE 0x68 +#define TSS_IOPB_SIZE (65536 / 8) +#define TSS_REDIRECTION_SIZE (256 / 8) +#define RMODE_TSS_SIZE \ + (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1) + +enum { + TASK_SWITCH_CALL = 0, + TASK_SWITCH_IRET = 1, + TASK_SWITCH_JMP = 2, + TASK_SWITCH_GATE = 3, +}; + +#define HF_GIF_MASK (1 << 0) +#define HF_HIF_MASK (1 << 1) +#define HF_VINTR_MASK (1 << 2) +#define HF_NMI_MASK (1 << 3) +#define HF_IRET_MASK (1 << 4) +#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ + +/* + * Hardware virtualization extension instructions may fault if a + * reboot turns off virtualization while processes are running. + * Trap the fault and ignore the instruction if that happens. + */ +asmlinkage void kvm_spurious_fault(void); +extern bool kvm_rebooting; + +#define __kvm_handle_fault_on_reboot(insn) \ + "666: " insn "\n\t" \ + "668: \n\t" \ + ".pushsection .fixup, \"ax\" \n" \ + "667: \n\t" \ + "cmpb $0, kvm_rebooting \n\t" \ + "jne 668b \n\t" \ + __ASM_SIZE(push) " $666b \n\t" \ + "call kvm_spurious_fault \n\t" \ + ".popsection \n\t" \ + ".pushsection __ex_table, \"a\" \n\t" \ + _ASM_PTR " 666b, 667b \n\t" \ + ".popsection" + +#define KVM_ARCH_WANT_MMU_NOTIFIER +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); +int kvm_age_hva(struct kvm *kvm, unsigned long hva); +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); +int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); +int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); +int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); +int kvm_cpu_get_interrupt(struct kvm_vcpu *v); + +void kvm_define_shared_msr(unsigned index, u32 msr); +void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); + +bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); + +void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work); +void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work); +void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work); +bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu); +extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); + +void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); + +#endif /* _ASM_X86_KVM_HOST_H */ diff --git a/linux/include/asm-x86/kvm_para.h b/linux/include/asm-x86/kvm_para.h new file mode 100644 index 0000000..5f575a4 --- /dev/null +++ b/linux/include/asm-x86/kvm_para.h @@ -0,0 +1,233 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +#ifndef _ASM_X86_KVM_PARA_H +#define _ASM_X86_KVM_PARA_H + +#include <linux/types.h> +#include <asm/hyperv.h> + +/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It + * should be used to determine that a VM is running under KVM. + */ +#define KVM_CPUID_SIGNATURE 0x40000000 + +/* This CPUID returns a feature bitmap in eax. Before enabling a particular + * paravirtualization, the appropriate feature bit should be checked. + */ +#define KVM_CPUID_FEATURES 0x40000001 +#define KVM_FEATURE_CLOCKSOURCE 0 +#define KVM_FEATURE_NOP_IO_DELAY 1 +#define KVM_FEATURE_MMU_OP 2 +/* This indicates that the new set of kvmclock msrs + * are available. The use of 0x11 and 0x12 is deprecated + */ +#define KVM_FEATURE_CLOCKSOURCE2 3 +#define KVM_FEATURE_ASYNC_PF 4 + +/* The last 8 bits are used to indicate how to interpret the flags field + * in pvclock structure. If no bits are set, all flags are ignored. + */ +#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24 + +#define MSR_KVM_WALL_CLOCK 0x11 +#define MSR_KVM_SYSTEM_TIME 0x12 + +/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ +#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 +#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 +#define MSR_KVM_ASYNC_PF_EN 0x4b564d02 + +#define KVM_MAX_MMU_OP_BATCH 32 + +#define KVM_ASYNC_PF_ENABLED (1 << 0) +#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1) + +/* Operations for KVM_HC_MMU_OP */ +#define KVM_MMU_OP_WRITE_PTE 1 +#define KVM_MMU_OP_FLUSH_TLB 2 +#define KVM_MMU_OP_RELEASE_PT 3 + +/* Payload for KVM_HC_MMU_OP */ +struct kvm_mmu_op_header { + __u32 op; + __u32 pad; +}; + +struct kvm_mmu_op_write_pte { + struct kvm_mmu_op_header header; + __u64 pte_phys; + __u64 pte_val; +}; + +struct kvm_mmu_op_flush_tlb { + struct kvm_mmu_op_header header; +}; + +struct kvm_mmu_op_release_pt { + struct kvm_mmu_op_header header; + __u64 pt_phys; +}; + +#define KVM_PV_REASON_PAGE_NOT_PRESENT 1 +#define KVM_PV_REASON_PAGE_READY 2 + +struct kvm_vcpu_pv_apf_data { + __u32 reason; + __u8 pad[60]; + __u32 enabled; +}; + +#ifdef __KERNEL__ +#include <asm/processor.h> + +extern void kvmclock_init(void); +extern int kvm_register_clock(char *txt); + + +/* This instruction is vmcall. On non-VT architectures, it will generate a + * trap that we will then rewrite to the appropriate instruction. + */ +#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1" + +/* For KVM hypercalls, a three-byte sequence of either the vmrun or the vmmrun + * instruction. The hypervisor may replace it with something else but only the + * instructions are guaranteed to be supported. + * + * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively. + * The hypercall number should be placed in rax and the return value will be + * placed in rax. No other registers will be clobbered unless explicited + * noted by the particular hypercall. + */ + +static inline long kvm_hypercall0(unsigned int nr) +{ + long ret; + asm volatile(KVM_HYPERCALL + : "=a"(ret) + : "a"(nr) + : "memory"); + return ret; +} + +static inline long kvm_hypercall1(unsigned int nr, unsigned long p1) +{ + long ret; + asm volatile(KVM_HYPERCALL + : "=a"(ret) + : "a"(nr), "b"(p1) + : "memory"); + return ret; +} + +static inline long kvm_hypercall2(unsigned int nr, unsigned long p1, + unsigned long p2) +{ + long ret; + asm volatile(KVM_HYPERCALL + : "=a"(ret) + : "a"(nr), "b"(p1), "c"(p2) + : "memory"); + return ret; +} + +static inline long kvm_hypercall3(unsigned int nr, unsigned long p1, + unsigned long p2, unsigned long p3) +{ + long ret; + asm volatile(KVM_HYPERCALL + : "=a"(ret) + : "a"(nr), "b"(p1), "c"(p2), "d"(p3) + : "memory"); + return ret; +} + +static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, + unsigned long p2, unsigned long p3, + unsigned long p4) +{ + long ret; + asm volatile(KVM_HYPERCALL + : "=a"(ret) + : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4) + : "memory"); + return ret; +} + +static inline int kvm_para_available(void) +{ + unsigned int eax, ebx, ecx, edx; + char signature[13]; + + cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); + memcpy(signature + 0, &ebx, 4); + memcpy(signature + 4, &ecx, 4); + memcpy(signature + 8, &edx, 4); + signature[12] = 0; + + if (strcmp(signature, "KVMKVMKVM") == 0) + return 1; + + return 0; +} + +static inline unsigned int kvm_arch_para_features(void) +{ + return cpuid_eax(KVM_CPUID_FEATURES); +} + +#ifdef CONFIG_KVM_GUEST +void __init kvm_guest_init(void); +void kvm_async_pf_task_wait(u32 token); +void kvm_async_pf_task_wake(u32 token); +u32 kvm_read_and_reset_pf_reason(void); +#else +#define kvm_guest_init() do { } while (0) +#define kvm_async_pf_task_wait(T) do {} while(0) +#define kvm_async_pf_task_wake(T) do {} while(0) +static inline u32 kvm_read_and_reset_pf_reason(void) +{ + return 0; +} +#endif + +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_KVM_PARA_H */ diff --git a/linux/include/asm-x86/svm.h b/linux/include/asm-x86/svm.h new file mode 100644 index 0000000..5529318 --- /dev/null +++ b/linux/include/asm-x86/svm.h @@ -0,0 +1,392 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +#ifndef __SVM_H +#define __SVM_H + +enum { + INTERCEPT_INTR, + INTERCEPT_NMI, + INTERCEPT_SMI, + INTERCEPT_INIT, + INTERCEPT_VINTR, + INTERCEPT_SELECTIVE_CR0, + INTERCEPT_STORE_IDTR, + INTERCEPT_STORE_GDTR, + INTERCEPT_STORE_LDTR, + INTERCEPT_STORE_TR, + INTERCEPT_LOAD_IDTR, + INTERCEPT_LOAD_GDTR, + INTERCEPT_LOAD_LDTR, + INTERCEPT_LOAD_TR, + INTERCEPT_RDTSC, + INTERCEPT_RDPMC, + INTERCEPT_PUSHF, + INTERCEPT_POPF, + INTERCEPT_CPUID, + INTERCEPT_RSM, + INTERCEPT_IRET, + INTERCEPT_INTn, + INTERCEPT_INVD, + INTERCEPT_PAUSE, + INTERCEPT_HLT, + INTERCEPT_INVLPG, + INTERCEPT_INVLPGA, + INTERCEPT_IOIO_PROT, + INTERCEPT_MSR_PROT, + INTERCEPT_TASK_SWITCH, + INTERCEPT_FERR_FREEZE, + INTERCEPT_SHUTDOWN, + INTERCEPT_VMRUN, + INTERCEPT_VMMCALL, + INTERCEPT_VMLOAD, + INTERCEPT_VMSAVE, + INTERCEPT_STGI, + INTERCEPT_CLGI, + INTERCEPT_SKINIT, + INTERCEPT_RDTSCP, + INTERCEPT_ICEBP, + INTERCEPT_WBINVD, + INTERCEPT_MONITOR, + INTERCEPT_MWAIT, + INTERCEPT_MWAIT_COND, + INTERCEPT_XSETBV, +}; + + +struct __attribute__ ((__packed__)) vmcb_control_area { + u32 intercept_cr; + u32 intercept_dr; + u32 intercept_exceptions; + u64 intercept; + u8 reserved_1[42]; + u16 pause_filter_count; + u64 iopm_base_pa; + u64 msrpm_base_pa; + u64 tsc_offset; + u32 asid; + u8 tlb_ctl; + u8 reserved_2[3]; + u32 int_ctl; + u32 int_vector; + u32 int_state; + u8 reserved_3[4]; + u32 exit_code; + u32 exit_code_hi; + u64 exit_info_1; + u64 exit_info_2; + u32 exit_int_info; + u32 exit_int_info_err; + u64 nested_ctl; + u8 reserved_4[16]; + u32 event_inj; + u32 event_inj_err; + u64 nested_cr3; + u64 lbr_ctl; + u32 clean; + u32 reserved_5; + u64 next_rip; + u8 insn_len; + u8 insn_bytes[15]; + u8 reserved_6[800]; +}; + + +#define TLB_CONTROL_DO_NOTHING 0 +#define TLB_CONTROL_FLUSH_ALL_ASID 1 +#define TLB_CONTROL_FLUSH_ASID 3 +#define TLB_CONTROL_FLUSH_ASID_LOCAL 7 + +#define V_TPR_MASK 0x0f + +#define V_IRQ_SHIFT 8 +#define V_IRQ_MASK (1 << V_IRQ_SHIFT) + +#define V_INTR_PRIO_SHIFT 16 +#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT) + +#define V_IGN_TPR_SHIFT 20 +#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT) + +#define V_INTR_MASKING_SHIFT 24 +#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT) + +#define SVM_INTERRUPT_SHADOW_MASK 1 + +#define SVM_IOIO_STR_SHIFT 2 +#define SVM_IOIO_REP_SHIFT 3 +#define SVM_IOIO_SIZE_SHIFT 4 +#define SVM_IOIO_ASIZE_SHIFT 7 + +#define SVM_IOIO_TYPE_MASK 1 +#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT) +#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT) +#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) +#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) + +#define SVM_VM_CR_VALID_MASK 0x001fULL +#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL +#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL + +struct __attribute__ ((__packed__)) vmcb_seg { + u16 selector; + u16 attrib; + u32 limit; + u64 base; +}; + +struct __attribute__ ((__packed__)) vmcb_save_area { + struct vmcb_seg es; + struct vmcb_seg cs; + struct vmcb_seg ss; + struct vmcb_seg ds; + struct vmcb_seg fs; + struct vmcb_seg gs; + struct vmcb_seg gdtr; + struct vmcb_seg ldtr; + struct vmcb_seg idtr; + struct vmcb_seg tr; + u8 reserved_1[43]; + u8 cpl; + u8 reserved_2[4]; + u64 efer; + u8 reserved_3[112]; + u64 cr4; + u64 cr3; + u64 cr0; + u64 dr7; + u64 dr6; + u64 rflags; + u64 rip; + u8 reserved_4[88]; + u64 rsp; + u8 reserved_5[24]; + u64 rax; + u64 star; + u64 lstar; + u64 cstar; + u64 sfmask; + u64 kernel_gs_base; + u64 sysenter_cs; + u64 sysenter_esp; + u64 sysenter_eip; + u64 cr2; + u8 reserved_6[32]; + u64 g_pat; + u64 dbgctl; + u64 br_from; + u64 br_to; + u64 last_excp_from; + u64 last_excp_to; +}; + +struct __attribute__ ((__packed__)) vmcb { + struct vmcb_control_area control; + struct vmcb_save_area save; +}; + +#define SVM_CPUID_FEATURE_SHIFT 2 +#define SVM_CPUID_FUNC 0x8000000a + +#define SVM_VM_CR_SVM_DISABLE 4 + +#define SVM_SELECTOR_S_SHIFT 4 +#define SVM_SELECTOR_DPL_SHIFT 5 +#define SVM_SELECTOR_P_SHIFT 7 +#define SVM_SELECTOR_AVL_SHIFT 8 +#define SVM_SELECTOR_L_SHIFT 9 +#define SVM_SELECTOR_DB_SHIFT 10 +#define SVM_SELECTOR_G_SHIFT 11 + +#define SVM_SELECTOR_TYPE_MASK (0xf) +#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT) +#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT) +#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT) +#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT) +#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT) +#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT) +#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT) + +#define SVM_SELECTOR_WRITE_MASK (1 << 1) +#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK +#define SVM_SELECTOR_CODE_MASK (1 << 3) + +#define INTERCEPT_CR0_READ 0 +#define INTERCEPT_CR3_READ 3 +#define INTERCEPT_CR4_READ 4 +#define INTERCEPT_CR8_READ 8 +#define INTERCEPT_CR0_WRITE (16 + 0) +#define INTERCEPT_CR3_WRITE (16 + 3) +#define INTERCEPT_CR4_WRITE (16 + 4) +#define INTERCEPT_CR8_WRITE (16 + 8) + +#define INTERCEPT_DR0_READ 0 +#define INTERCEPT_DR1_READ 1 +#define INTERCEPT_DR2_READ 2 +#define INTERCEPT_DR3_READ 3 +#define INTERCEPT_DR4_READ 4 +#define INTERCEPT_DR5_READ 5 +#define INTERCEPT_DR6_READ 6 +#define INTERCEPT_DR7_READ 7 +#define INTERCEPT_DR0_WRITE (16 + 0) +#define INTERCEPT_DR1_WRITE (16 + 1) +#define INTERCEPT_DR2_WRITE (16 + 2) +#define INTERCEPT_DR3_WRITE (16 + 3) +#define INTERCEPT_DR4_WRITE (16 + 4) +#define INTERCEPT_DR5_WRITE (16 + 5) +#define INTERCEPT_DR6_WRITE (16 + 6) +#define INTERCEPT_DR7_WRITE (16 + 7) + +#define SVM_EVTINJ_VEC_MASK 0xff + +#define SVM_EVTINJ_TYPE_SHIFT 8 +#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT) + +#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT) + +#define SVM_EVTINJ_VALID (1 << 31) +#define SVM_EVTINJ_VALID_ERR (1 << 11) + +#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK +#define SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK + +#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR +#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI +#define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT +#define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT + +#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID +#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR + +#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36 +#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38 +#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44 + +#define SVM_EXITINFO_REG_MASK 0x0F + +#define SVM_EXIT_READ_CR0 0x000 +#define SVM_EXIT_READ_CR3 0x003 +#define SVM_EXIT_READ_CR4 0x004 +#define SVM_EXIT_READ_CR8 0x008 +#define SVM_EXIT_WRITE_CR0 0x010 +#define SVM_EXIT_WRITE_CR3 0x013 +#define SVM_EXIT_WRITE_CR4 0x014 +#define SVM_EXIT_WRITE_CR8 0x018 +#define SVM_EXIT_READ_DR0 0x020 +#define SVM_EXIT_READ_DR1 0x021 +#define SVM_EXIT_READ_DR2 0x022 +#define SVM_EXIT_READ_DR3 0x023 +#define SVM_EXIT_READ_DR4 0x024 +#define SVM_EXIT_READ_DR5 0x025 +#define SVM_EXIT_READ_DR6 0x026 +#define SVM_EXIT_READ_DR7 0x027 +#define SVM_EXIT_WRITE_DR0 0x030 +#define SVM_EXIT_WRITE_DR1 0x031 +#define SVM_EXIT_WRITE_DR2 0x032 +#define SVM_EXIT_WRITE_DR3 0x033 +#define SVM_EXIT_WRITE_DR4 0x034 +#define SVM_EXIT_WRITE_DR5 0x035 +#define SVM_EXIT_WRITE_DR6 0x036 +#define SVM_EXIT_WRITE_DR7 0x037 +#define SVM_EXIT_EXCP_BASE 0x040 +#define SVM_EXIT_INTR 0x060 +#define SVM_EXIT_NMI 0x061 +#define SVM_EXIT_SMI 0x062 +#define SVM_EXIT_INIT 0x063 +#define SVM_EXIT_VINTR 0x064 +#define SVM_EXIT_CR0_SEL_WRITE 0x065 +#define SVM_EXIT_IDTR_READ 0x066 +#define SVM_EXIT_GDTR_READ 0x067 +#define SVM_EXIT_LDTR_READ 0x068 +#define SVM_EXIT_TR_READ 0x069 +#define SVM_EXIT_IDTR_WRITE 0x06a +#define SVM_EXIT_GDTR_WRITE 0x06b +#define SVM_EXIT_LDTR_WRITE 0x06c +#define SVM_EXIT_TR_WRITE 0x06d +#define SVM_EXIT_RDTSC 0x06e +#define SVM_EXIT_RDPMC 0x06f +#define SVM_EXIT_PUSHF 0x070 +#define SVM_EXIT_POPF 0x071 +#define SVM_EXIT_CPUID 0x072 +#define SVM_EXIT_RSM 0x073 +#define SVM_EXIT_IRET 0x074 +#define SVM_EXIT_SWINT 0x075 +#define SVM_EXIT_INVD 0x076 +#define SVM_EXIT_PAUSE 0x077 +#define SVM_EXIT_HLT 0x078 +#define SVM_EXIT_INVLPG 0x079 +#define SVM_EXIT_INVLPGA 0x07a +#define SVM_EXIT_IOIO 0x07b +#define SVM_EXIT_MSR 0x07c +#define SVM_EXIT_TASK_SWITCH 0x07d +#define SVM_EXIT_FERR_FREEZE 0x07e +#define SVM_EXIT_SHUTDOWN 0x07f +#define SVM_EXIT_VMRUN 0x080 +#define SVM_EXIT_VMMCALL 0x081 +#define SVM_EXIT_VMLOAD 0x082 +#define SVM_EXIT_VMSAVE 0x083 +#define SVM_EXIT_STGI 0x084 +#define SVM_EXIT_CLGI 0x085 +#define SVM_EXIT_SKINIT 0x086 +#define SVM_EXIT_RDTSCP 0x087 +#define SVM_EXIT_ICEBP 0x088 +#define SVM_EXIT_WBINVD 0x089 +#define SVM_EXIT_MONITOR 0x08a +#define SVM_EXIT_MWAIT 0x08b +#define SVM_EXIT_MWAIT_COND 0x08c +#define SVM_EXIT_XSETBV 0x08d +#define SVM_EXIT_NPF 0x400 + +#define SVM_EXIT_ERR -1 + +#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP) + +#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" +#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" +#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb" +#define SVM_CLGI ".byte 0x0f, 0x01, 0xdd" +#define SVM_STGI ".byte 0x0f, 0x01, 0xdc" +#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf" + +#endif + diff --git a/linux/include/asm-x86/virtext.h b/linux/include/asm-x86/virtext.h new file mode 100644 index 0000000..e6f17fb --- /dev/null +++ b/linux/include/asm-x86/virtext.h @@ -0,0 +1,172 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +/* CPU virtualization extensions handling + * + * This should carry the code for handling CPU virtualization extensions + * that needs to live in the kernel core. + * + * Author: Eduardo Habkost <ehabkost@redhat.com> + * + * Copyright (C) 2008, Red Hat Inc. + * + * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ +#ifndef _ASM_X86_VIRTEX_H +#define _ASM_X86_VIRTEX_H + +#include <asm/processor.h> +#include <asm/system.h> + +#include <asm/vmx.h> +#include <asm/svm.h> + +/* + * VMX functions: + */ + +static inline int cpu_has_vmx(void) +{ + unsigned long ecx = cpuid_ecx(1); + return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ +} + + +/** Disable VMX on the current CPU + * + * vmxoff causes a undefined-opcode exception if vmxon was not run + * on the CPU previously. Only call this function if you know VMX + * is enabled. + */ +static inline void cpu_vmxoff(void) +{ + asm volatile (ASM_VMX_VMXOFF : : : "cc"); + write_cr4(read_cr4() & ~X86_CR4_VMXE); +} + +static inline int cpu_vmx_enabled(void) +{ + return read_cr4() & X86_CR4_VMXE; +} + +/** Disable VMX if it is enabled on the current CPU + * + * You shouldn't call this if cpu_has_vmx() returns 0. + */ +static inline void __cpu_emergency_vmxoff(void) +{ + if (cpu_vmx_enabled()) + cpu_vmxoff(); +} + +/** Disable VMX if it is supported and enabled on the current CPU + */ +static inline void cpu_emergency_vmxoff(void) +{ + if (cpu_has_vmx()) + __cpu_emergency_vmxoff(); +} + + + + +/* + * SVM functions: + */ + +/** Check if the CPU has SVM support + * + * You can use the 'msg' arg to get a message describing the problem, + * if the function returns zero. Simply pass NULL if you are not interested + * on the messages; gcc should take care of not generating code for + * the messages on this case. + */ +static inline int cpu_has_svm(const char **msg) +{ + uint32_t eax, ebx, ecx, edx; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { + if (msg) + *msg = "not amd"; + return 0; + } + + cpuid(0x80000000, &eax, &ebx, &ecx, &edx); + if (eax < SVM_CPUID_FUNC) { + if (msg) + *msg = "can't execute cpuid_8000000a"; + return 0; + } + + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) { + if (msg) + *msg = "svm not available"; + return 0; + } + return 1; +} + + +/** Disable SVM on the current CPU + * + * You should call this only if cpu_has_svm() returned true. + */ +static inline void cpu_svm_disable(void) +{ + uint64_t efer; + + wrmsrl(MSR_VM_HSAVE_PA, 0); + rdmsrl(MSR_EFER, efer); + wrmsrl(MSR_EFER, efer & ~EFER_SVME); +} + +/** Makes sure SVM is disabled, if it is supported on the CPU + */ +static inline void cpu_emergency_svm_disable(void) +{ + if (cpu_has_svm(NULL)) + cpu_svm_disable(); +} + +#endif /* _ASM_X86_VIRTEX_H */ diff --git a/linux/include/asm-x86/vmx.h b/linux/include/asm-x86/vmx.h new file mode 100644 index 0000000..0174fa5 --- /dev/null +++ b/linux/include/asm-x86/vmx.h @@ -0,0 +1,469 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +#ifndef VMX_H +#define VMX_H + +/* + * vmx.h: VMX Architecture related definitions + * Copyright (c) 2004, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * A few random additions are: + * Copyright (C) 2006 Qumranet + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * + */ + +#include <linux/types.h> + +/* + * Definitions of Primary Processor-Based VM-Execution Controls. + */ +#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 +#define CPU_BASED_USE_TSC_OFFSETING 0x00000008 +#define CPU_BASED_HLT_EXITING 0x00000080 +#define CPU_BASED_INVLPG_EXITING 0x00000200 +#define CPU_BASED_MWAIT_EXITING 0x00000400 +#define CPU_BASED_RDPMC_EXITING 0x00000800 +#define CPU_BASED_RDTSC_EXITING 0x00001000 +#define CPU_BASED_CR3_LOAD_EXITING 0x00008000 +#define CPU_BASED_CR3_STORE_EXITING 0x00010000 +#define CPU_BASED_CR8_LOAD_EXITING 0x00080000 +#define CPU_BASED_CR8_STORE_EXITING 0x00100000 +#define CPU_BASED_TPR_SHADOW 0x00200000 +#define CPU_BASED_VIRTUAL_NMI_PENDING 0x00400000 +#define CPU_BASED_MOV_DR_EXITING 0x00800000 +#define CPU_BASED_UNCOND_IO_EXITING 0x01000000 +#define CPU_BASED_USE_IO_BITMAPS 0x02000000 +#define CPU_BASED_USE_MSR_BITMAPS 0x10000000 +#define CPU_BASED_MONITOR_EXITING 0x20000000 +#define CPU_BASED_PAUSE_EXITING 0x40000000 +#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 +/* + * Definitions of Secondary Processor-Based VM-Execution Controls. + */ +#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 +#define SECONDARY_EXEC_ENABLE_EPT 0x00000002 +#define SECONDARY_EXEC_RDTSCP 0x00000008 +#define SECONDARY_EXEC_ENABLE_VPID 0x00000020 +#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 +#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 +#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 + + +#define PIN_BASED_EXT_INTR_MASK 0x00000001 +#define PIN_BASED_NMI_EXITING 0x00000008 +#define PIN_BASED_VIRTUAL_NMIS 0x00000020 + +#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002 +#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 +#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000 +#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 +#define VM_EXIT_SAVE_IA32_PAT 0x00040000 +#define VM_EXIT_LOAD_IA32_PAT 0x00080000 +#define VM_EXIT_SAVE_IA32_EFER 0x00100000 +#define VM_EXIT_LOAD_IA32_EFER 0x00200000 +#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000 + +#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002 +#define VM_ENTRY_IA32E_MODE 0x00000200 +#define VM_ENTRY_SMM 0x00000400 +#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 +#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000 +#define VM_ENTRY_LOAD_IA32_PAT 0x00004000 +#define VM_ENTRY_LOAD_IA32_EFER 0x00008000 + +/* VMCS Encodings */ +enum vmcs_field { + VIRTUAL_PROCESSOR_ID = 0x00000000, + GUEST_ES_SELECTOR = 0x00000800, + GUEST_CS_SELECTOR = 0x00000802, + GUEST_SS_SELECTOR = 0x00000804, + GUEST_DS_SELECTOR = 0x00000806, + GUEST_FS_SELECTOR = 0x00000808, + GUEST_GS_SELECTOR = 0x0000080a, + GUEST_LDTR_SELECTOR = 0x0000080c, + GUEST_TR_SELECTOR = 0x0000080e, + HOST_ES_SELECTOR = 0x00000c00, + HOST_CS_SELECTOR = 0x00000c02, + HOST_SS_SELECTOR = 0x00000c04, + HOST_DS_SELECTOR = 0x00000c06, + HOST_FS_SELECTOR = 0x00000c08, + HOST_GS_SELECTOR = 0x00000c0a, + HOST_TR_SELECTOR = 0x00000c0c, + IO_BITMAP_A = 0x00002000, + IO_BITMAP_A_HIGH = 0x00002001, + IO_BITMAP_B = 0x00002002, + IO_BITMAP_B_HIGH = 0x00002003, + MSR_BITMAP = 0x00002004, + MSR_BITMAP_HIGH = 0x00002005, + VM_EXIT_MSR_STORE_ADDR = 0x00002006, + VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007, + VM_EXIT_MSR_LOAD_ADDR = 0x00002008, + VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009, + VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a, + VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b, + TSC_OFFSET = 0x00002010, + TSC_OFFSET_HIGH = 0x00002011, + VIRTUAL_APIC_PAGE_ADDR = 0x00002012, + VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, + APIC_ACCESS_ADDR = 0x00002014, + APIC_ACCESS_ADDR_HIGH = 0x00002015, + EPT_POINTER = 0x0000201a, + EPT_POINTER_HIGH = 0x0000201b, + GUEST_PHYSICAL_ADDRESS = 0x00002400, + GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, + VMCS_LINK_POINTER = 0x00002800, + VMCS_LINK_POINTER_HIGH = 0x00002801, + GUEST_IA32_DEBUGCTL = 0x00002802, + GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, + GUEST_IA32_PAT = 0x00002804, + GUEST_IA32_PAT_HIGH = 0x00002805, + GUEST_IA32_EFER = 0x00002806, + GUEST_IA32_EFER_HIGH = 0x00002807, + GUEST_PDPTR0 = 0x0000280a, + GUEST_PDPTR0_HIGH = 0x0000280b, + GUEST_PDPTR1 = 0x0000280c, + GUEST_PDPTR1_HIGH = 0x0000280d, + GUEST_PDPTR2 = 0x0000280e, + GUEST_PDPTR2_HIGH = 0x0000280f, + GUEST_PDPTR3 = 0x00002810, + GUEST_PDPTR3_HIGH = 0x00002811, + HOST_IA32_PAT = 0x00002c00, + HOST_IA32_PAT_HIGH = 0x00002c01, + HOST_IA32_EFER = 0x00002c02, + HOST_IA32_EFER_HIGH = 0x00002c03, + PIN_BASED_VM_EXEC_CONTROL = 0x00004000, + CPU_BASED_VM_EXEC_CONTROL = 0x00004002, + EXCEPTION_BITMAP = 0x00004004, + PAGE_FAULT_ERROR_CODE_MASK = 0x00004006, + PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008, + CR3_TARGET_COUNT = 0x0000400a, + VM_EXIT_CONTROLS = 0x0000400c, + VM_EXIT_MSR_STORE_COUNT = 0x0000400e, + VM_EXIT_MSR_LOAD_COUNT = 0x00004010, + VM_ENTRY_CONTROLS = 0x00004012, + VM_ENTRY_MSR_LOAD_COUNT = 0x00004014, + VM_ENTRY_INTR_INFO_FIELD = 0x00004016, + VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018, + VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, + TPR_THRESHOLD = 0x0000401c, + SECONDARY_VM_EXEC_CONTROL = 0x0000401e, + PLE_GAP = 0x00004020, + PLE_WINDOW = 0x00004022, + VM_INSTRUCTION_ERROR = 0x00004400, + VM_EXIT_REASON = 0x00004402, + VM_EXIT_INTR_INFO = 0x00004404, + VM_EXIT_INTR_ERROR_CODE = 0x00004406, + IDT_VECTORING_INFO_FIELD = 0x00004408, + IDT_VECTORING_ERROR_CODE = 0x0000440a, + VM_EXIT_INSTRUCTION_LEN = 0x0000440c, + VMX_INSTRUCTION_INFO = 0x0000440e, + GUEST_ES_LIMIT = 0x00004800, + GUEST_CS_LIMIT = 0x00004802, + GUEST_SS_LIMIT = 0x00004804, + GUEST_DS_LIMIT = 0x00004806, + GUEST_FS_LIMIT = 0x00004808, + GUEST_GS_LIMIT = 0x0000480a, + GUEST_LDTR_LIMIT = 0x0000480c, + GUEST_TR_LIMIT = 0x0000480e, + GUEST_GDTR_LIMIT = 0x00004810, + GUEST_IDTR_LIMIT = 0x00004812, + GUEST_ES_AR_BYTES = 0x00004814, + GUEST_CS_AR_BYTES = 0x00004816, + GUEST_SS_AR_BYTES = 0x00004818, + GUEST_DS_AR_BYTES = 0x0000481a, + GUEST_FS_AR_BYTES = 0x0000481c, + GUEST_GS_AR_BYTES = 0x0000481e, + GUEST_LDTR_AR_BYTES = 0x00004820, + GUEST_TR_AR_BYTES = 0x00004822, + GUEST_INTERRUPTIBILITY_INFO = 0x00004824, + GUEST_ACTIVITY_STATE = 0X00004826, + GUEST_SYSENTER_CS = 0x0000482A, + HOST_IA32_SYSENTER_CS = 0x00004c00, + CR0_GUEST_HOST_MASK = 0x00006000, + CR4_GUEST_HOST_MASK = 0x00006002, + CR0_READ_SHADOW = 0x00006004, + CR4_READ_SHADOW = 0x00006006, + CR3_TARGET_VALUE0 = 0x00006008, + CR3_TARGET_VALUE1 = 0x0000600a, + CR3_TARGET_VALUE2 = 0x0000600c, + CR3_TARGET_VALUE3 = 0x0000600e, + EXIT_QUALIFICATION = 0x00006400, + GUEST_LINEAR_ADDRESS = 0x0000640a, + GUEST_CR0 = 0x00006800, + GUEST_CR3 = 0x00006802, + GUEST_CR4 = 0x00006804, + GUEST_ES_BASE = 0x00006806, + GUEST_CS_BASE = 0x00006808, + GUEST_SS_BASE = 0x0000680a, + GUEST_DS_BASE = 0x0000680c, + GUEST_FS_BASE = 0x0000680e, + GUEST_GS_BASE = 0x00006810, + GUEST_LDTR_BASE = 0x00006812, + GUEST_TR_BASE = 0x00006814, + GUEST_GDTR_BASE = 0x00006816, + GUEST_IDTR_BASE = 0x00006818, + GUEST_DR7 = 0x0000681a, + GUEST_RSP = 0x0000681c, + GUEST_RIP = 0x0000681e, + GUEST_RFLAGS = 0x00006820, + GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822, + GUEST_SYSENTER_ESP = 0x00006824, + GUEST_SYSENTER_EIP = 0x00006826, + HOST_CR0 = 0x00006c00, + HOST_CR3 = 0x00006c02, + HOST_CR4 = 0x00006c04, + HOST_FS_BASE = 0x00006c06, + HOST_GS_BASE = 0x00006c08, + HOST_TR_BASE = 0x00006c0a, + HOST_GDTR_BASE = 0x00006c0c, + HOST_IDTR_BASE = 0x00006c0e, + HOST_IA32_SYSENTER_ESP = 0x00006c10, + HOST_IA32_SYSENTER_EIP = 0x00006c12, + HOST_RSP = 0x00006c14, + HOST_RIP = 0x00006c16, +}; + +#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 + +#define EXIT_REASON_EXCEPTION_NMI 0 +#define EXIT_REASON_EXTERNAL_INTERRUPT 1 +#define EXIT_REASON_TRIPLE_FAULT 2 + +#define EXIT_REASON_PENDING_INTERRUPT 7 +#define EXIT_REASON_NMI_WINDOW 8 +#define EXIT_REASON_TASK_SWITCH 9 +#define EXIT_REASON_CPUID 10 +#define EXIT_REASON_HLT 12 +#define EXIT_REASON_INVD 13 +#define EXIT_REASON_INVLPG 14 +#define EXIT_REASON_RDPMC 15 +#define EXIT_REASON_RDTSC 16 +#define EXIT_REASON_VMCALL 18 +#define EXIT_REASON_VMCLEAR 19 +#define EXIT_REASON_VMLAUNCH 20 +#define EXIT_REASON_VMPTRLD 21 +#define EXIT_REASON_VMPTRST 22 +#define EXIT_REASON_VMREAD 23 +#define EXIT_REASON_VMRESUME 24 +#define EXIT_REASON_VMWRITE 25 +#define EXIT_REASON_VMOFF 26 +#define EXIT_REASON_VMON 27 +#define EXIT_REASON_CR_ACCESS 28 +#define EXIT_REASON_DR_ACCESS 29 +#define EXIT_REASON_IO_INSTRUCTION 30 +#define EXIT_REASON_MSR_READ 31 +#define EXIT_REASON_MSR_WRITE 32 +#define EXIT_REASON_INVALID_STATE 33 +#define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_MONITOR_INSTRUCTION 39 +#define EXIT_REASON_PAUSE_INSTRUCTION 40 +#define EXIT_REASON_MCE_DURING_VMENTRY 41 +#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 +#define EXIT_REASON_APIC_ACCESS 44 +#define EXIT_REASON_EPT_VIOLATION 48 +#define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_WBINVD 54 +#define EXIT_REASON_XSETBV 55 + +/* + * Interruption-information format + */ +#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ +#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ +#define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */ +#define INTR_INFO_UNBLOCK_NMI 0x1000 /* 12 */ +#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ +#define INTR_INFO_RESVD_BITS_MASK 0x7ffff000 + +#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK +#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK +#define VECTORING_INFO_DELIVER_CODE_MASK INTR_INFO_DELIVER_CODE_MASK +#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK + +#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ +#define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */ +#define INTR_TYPE_HARD_EXCEPTION (3 << 8) /* processor exception */ +#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ +#define INTR_TYPE_SOFT_EXCEPTION (6 << 8) /* software exception */ + +/* GUEST_INTERRUPTIBILITY_INFO flags. */ +#define GUEST_INTR_STATE_STI 0x00000001 +#define GUEST_INTR_STATE_MOV_SS 0x00000002 +#define GUEST_INTR_STATE_SMI 0x00000004 +#define GUEST_INTR_STATE_NMI 0x00000008 + +/* GUEST_ACTIVITY_STATE flags */ +#define GUEST_ACTIVITY_ACTIVE 0 +#define GUEST_ACTIVITY_HLT 1 +#define GUEST_ACTIVITY_SHUTDOWN 2 +#define GUEST_ACTIVITY_WAIT_SIPI 3 + +/* + * Exit Qualifications for MOV for Control Register Access + */ +#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control reg.*/ +#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */ +#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose reg. */ +#define LMSW_SOURCE_DATA_SHIFT 16 +#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */ +#define REG_EAX (0 << 8) +#define REG_ECX (1 << 8) +#define REG_EDX (2 << 8) +#define REG_EBX (3 << 8) +#define REG_ESP (4 << 8) +#define REG_EBP (5 << 8) +#define REG_ESI (6 << 8) +#define REG_EDI (7 << 8) +#define REG_R8 (8 << 8) +#define REG_R9 (9 << 8) +#define REG_R10 (10 << 8) +#define REG_R11 (11 << 8) +#define REG_R12 (12 << 8) +#define REG_R13 (13 << 8) +#define REG_R14 (14 << 8) +#define REG_R15 (15 << 8) + +/* + * Exit Qualifications for MOV for Debug Register Access + */ +#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug reg. */ +#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */ +#define TYPE_MOV_TO_DR (0 << 4) +#define TYPE_MOV_FROM_DR (1 << 4) +#define DEBUG_REG_ACCESS_REG(eq) (((eq) >> 8) & 0xf) /* 11:8, general purpose reg. */ + + +/* segment AR */ +#define SEGMENT_AR_L_MASK (1 << 13) + +#define AR_TYPE_ACCESSES_MASK 1 +#define AR_TYPE_READABLE_MASK (1 << 1) +#define AR_TYPE_WRITEABLE_MASK (1 << 2) +#define AR_TYPE_CODE_MASK (1 << 3) +#define AR_TYPE_MASK 0x0f +#define AR_TYPE_BUSY_64_TSS 11 +#define AR_TYPE_BUSY_32_TSS 11 +#define AR_TYPE_BUSY_16_TSS 3 +#define AR_TYPE_LDT 2 + +#define AR_UNUSABLE_MASK (1 << 16) +#define AR_S_MASK (1 << 4) +#define AR_P_MASK (1 << 7) +#define AR_L_MASK (1 << 13) +#define AR_DB_MASK (1 << 14) +#define AR_G_MASK (1 << 15) +#define AR_DPL_SHIFT 5 +#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3) + +#define AR_RESERVD_MASK 0xfffe0f00 + +#define TSS_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 0) +#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 1) +#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 2) + +#define VMX_NR_VPIDS (1 << 16) +#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 +#define VMX_VPID_EXTENT_ALL_CONTEXT 2 + +#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 +#define VMX_EPT_EXTENT_CONTEXT 1 +#define VMX_EPT_EXTENT_GLOBAL 2 + +#define VMX_EPT_EXECUTE_ONLY_BIT (1ull) +#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6) +#define VMX_EPTP_UC_BIT (1ull << 8) +#define VMX_EPTP_WB_BIT (1ull << 14) +#define VMX_EPT_2MB_PAGE_BIT (1ull << 16) +#define VMX_EPT_1GB_PAGE_BIT (1ull << 17) +#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) +#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) +#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) + +#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ +#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ + +#define VMX_EPT_DEFAULT_GAW 3 +#define VMX_EPT_MAX_GAW 0x4 +#define VMX_EPT_MT_EPTE_SHIFT 3 +#define VMX_EPT_GAW_EPTP_SHIFT 3 +#define VMX_EPT_DEFAULT_MT 0x6ull +#define VMX_EPT_READABLE_MASK 0x1ull +#define VMX_EPT_WRITABLE_MASK 0x2ull +#define VMX_EPT_EXECUTABLE_MASK 0x4ull +#define VMX_EPT_IPAT_BIT (1ull << 6) + +#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul + + +#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" +#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" +#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" +#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30" +#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0" +#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0" +#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4" +#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4" +#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30" +#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08" +#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08" + +struct vmx_msr_entry { + u32 index; + u32 reserved; + u64 value; +} __aligned(16); + +#endif diff --git a/linux/include/linux/kvm.h b/linux/include/linux/kvm.h new file mode 100644 index 0000000..88fccc0 --- /dev/null +++ b/linux/include/linux/kvm.h @@ -0,0 +1,838 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +#ifndef __LINUX_KVM_H +#define __LINUX_KVM_H + +/* + * Userspace interface for /dev/kvm - kernel based virtual machine + * + * Note: you must update KVM_API_VERSION if you change this interface. + */ + +#include <asm/types.h> + +#include <linux/ioctl.h> +#include <asm/kvm.h> + +#define KVM_API_VERSION 12 + +/* *** Deprecated interfaces *** */ + +#define KVM_TRC_SHIFT 16 + +#define KVM_TRC_ENTRYEXIT (1 << KVM_TRC_SHIFT) +#define KVM_TRC_HANDLER (1 << (KVM_TRC_SHIFT + 1)) + +#define KVM_TRC_VMENTRY (KVM_TRC_ENTRYEXIT + 0x01) +#define KVM_TRC_VMEXIT (KVM_TRC_ENTRYEXIT + 0x02) +#define KVM_TRC_PAGE_FAULT (KVM_TRC_HANDLER + 0x01) + +#define KVM_TRC_HEAD_SIZE 12 +#define KVM_TRC_CYCLE_SIZE 8 +#define KVM_TRC_EXTRA_MAX 7 + +#define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02) +#define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03) +#define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04) +#define KVM_TRC_IO_READ (KVM_TRC_HANDLER + 0x05) +#define KVM_TRC_IO_WRITE (KVM_TRC_HANDLER + 0x06) +#define KVM_TRC_CR_READ (KVM_TRC_HANDLER + 0x07) +#define KVM_TRC_CR_WRITE (KVM_TRC_HANDLER + 0x08) +#define KVM_TRC_DR_READ (KVM_TRC_HANDLER + 0x09) +#define KVM_TRC_DR_WRITE (KVM_TRC_HANDLER + 0x0A) +#define KVM_TRC_MSR_READ (KVM_TRC_HANDLER + 0x0B) +#define KVM_TRC_MSR_WRITE (KVM_TRC_HANDLER + 0x0C) +#define KVM_TRC_CPUID (KVM_TRC_HANDLER + 0x0D) +#define KVM_TRC_INTR (KVM_TRC_HANDLER + 0x0E) +#define KVM_TRC_NMI (KVM_TRC_HANDLER + 0x0F) +#define KVM_TRC_VMMCALL (KVM_TRC_HANDLER + 0x10) +#define KVM_TRC_HLT (KVM_TRC_HANDLER + 0x11) +#define KVM_TRC_CLTS (KVM_TRC_HANDLER + 0x12) +#define KVM_TRC_LMSW (KVM_TRC_HANDLER + 0x13) +#define KVM_TRC_APIC_ACCESS (KVM_TRC_HANDLER + 0x14) +#define KVM_TRC_TDP_FAULT (KVM_TRC_HANDLER + 0x15) +#define KVM_TRC_GTLB_WRITE (KVM_TRC_HANDLER + 0x16) +#define KVM_TRC_STLB_WRITE (KVM_TRC_HANDLER + 0x17) +#define KVM_TRC_STLB_INVAL (KVM_TRC_HANDLER + 0x18) +#define KVM_TRC_PPC_INSTR (KVM_TRC_HANDLER + 0x19) + +struct kvm_user_trace_setup { + __u32 buf_size; + __u32 buf_nr; +}; + +#define __KVM_DEPRECATED_MAIN_W_0x06 \ + _IOW(KVMIO, 0x06, struct kvm_user_trace_setup) +#define __KVM_DEPRECATED_MAIN_0x07 _IO(KVMIO, 0x07) +#define __KVM_DEPRECATED_MAIN_0x08 _IO(KVMIO, 0x08) + +#define __KVM_DEPRECATED_VM_R_0x70 _IOR(KVMIO, 0x70, struct kvm_assigned_irq) + +struct kvm_breakpoint { + __u32 enabled; + __u32 padding; + __u64 address; +}; + +struct kvm_debug_guest { + __u32 enabled; + __u32 pad; + struct kvm_breakpoint breakpoints[4]; + __u32 singlestep; +}; + +#define __KVM_DEPRECATED_VCPU_W_0x87 _IOW(KVMIO, 0x87, struct kvm_debug_guest) + +/* *** End of deprecated interfaces *** */ + + +/* for KVM_CREATE_MEMORY_REGION */ +struct kvm_memory_region { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ +}; + +/* for KVM_SET_USER_MEMORY_REGION */ +struct kvm_userspace_memory_region { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ + __u64 userspace_addr; /* start of the userspace allocated memory */ +}; + +/* for kvm_memory_region::flags */ +#define KVM_MEM_LOG_DIRTY_PAGES 1UL +#define KVM_MEMSLOT_INVALID (1UL << 1) + +/* for KVM_IRQ_LINE */ +struct kvm_irq_level { + /* + * ACPI gsi notion of irq. + * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47.. + * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23.. + */ + union { + __u32 irq; + __s32 status; + }; + __u32 level; +}; + + +struct kvm_irqchip { + __u32 chip_id; + __u32 pad; + union { + char dummy[512]; /* reserving space */ +#ifdef __KVM_HAVE_PIT + struct kvm_pic_state pic; +#endif +#ifdef __KVM_HAVE_IOAPIC + struct kvm_ioapic_state ioapic; +#endif + } chip; +}; + +/* for KVM_CREATE_PIT2 */ +struct kvm_pit_config { + __u32 flags; + __u32 pad[15]; +}; + +#define KVM_PIT_SPEAKER_DUMMY 1 + +#define KVM_EXIT_UNKNOWN 0 +#define KVM_EXIT_EXCEPTION 1 +#define KVM_EXIT_IO 2 +#define KVM_EXIT_HYPERCALL 3 +#define KVM_EXIT_DEBUG 4 +#define KVM_EXIT_HLT 5 +#define KVM_EXIT_MMIO 6 +#define KVM_EXIT_IRQ_WINDOW_OPEN 7 +#define KVM_EXIT_SHUTDOWN 8 +#define KVM_EXIT_FAIL_ENTRY 9 +#define KVM_EXIT_INTR 10 +#define KVM_EXIT_SET_TPR 11 +#define KVM_EXIT_TPR_ACCESS 12 +#define KVM_EXIT_S390_SIEIC 13 +#define KVM_EXIT_S390_RESET 14 +#define KVM_EXIT_DCR 15 +#define KVM_EXIT_NMI 16 +#define KVM_EXIT_INTERNAL_ERROR 17 +#define KVM_EXIT_OSI 18 + +/* For KVM_EXIT_INTERNAL_ERROR */ +#define KVM_INTERNAL_ERROR_EMULATION 1 +#define KVM_INTERNAL_ERROR_SIMUL_EX 2 + +/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ +struct kvm_run { + /* in */ + __u8 request_interrupt_window; + __u8 padding1[7]; + + /* out */ + __u32 exit_reason; + __u8 ready_for_interrupt_injection; + __u8 if_flag; + __u8 padding2[2]; + + /* in (pre_kvm_run), out (post_kvm_run) */ + __u64 cr8; + __u64 apic_base; + +#ifdef __KVM_S390 + /* the processor status word for s390 */ + __u64 psw_mask; /* psw upper half */ + __u64 psw_addr; /* psw lower half */ +#endif + union { + /* KVM_EXIT_UNKNOWN */ + struct { + __u64 hardware_exit_reason; + } hw; + /* KVM_EXIT_FAIL_ENTRY */ + struct { + __u64 hardware_entry_failure_reason; + } fail_entry; + /* KVM_EXIT_EXCEPTION */ + struct { + __u32 exception; + __u32 error_code; + } ex; + /* KVM_EXIT_IO */ + struct { +#define KVM_EXIT_IO_IN 0 +#define KVM_EXIT_IO_OUT 1 + __u8 direction; + __u8 size; /* bytes */ + __u16 port; + __u32 count; + __u64 data_offset; /* relative to kvm_run start */ + } io; + struct { + struct kvm_debug_exit_arch arch; + } debug; + /* KVM_EXIT_MMIO */ + struct { + __u64 phys_addr; + __u8 data[8]; + __u32 len; + __u8 is_write; + } mmio; + /* KVM_EXIT_HYPERCALL */ + struct { + __u64 nr; + __u64 args[6]; + __u64 ret; + __u32 longmode; + __u32 pad; + } hypercall; + /* KVM_EXIT_TPR_ACCESS */ + struct { + __u64 rip; + __u32 is_write; + __u32 pad; + } tpr_access; + /* KVM_EXIT_S390_SIEIC */ + struct { + __u8 icptcode; + __u16 ipa; + __u32 ipb; + } s390_sieic; + /* KVM_EXIT_S390_RESET */ +#define KVM_S390_RESET_POR 1 +#define KVM_S390_RESET_CLEAR 2 +#define KVM_S390_RESET_SUBSYSTEM 4 +#define KVM_S390_RESET_CPU_INIT 8 +#define KVM_S390_RESET_IPL 16 + __u64 s390_reset_flags; + /* KVM_EXIT_DCR */ + struct { + __u32 dcrn; + __u32 data; + __u8 is_write; + } dcr; + struct { + __u32 suberror; + /* Available with KVM_CAP_INTERNAL_ERROR_DATA: */ + __u32 ndata; + __u64 data[16]; + } internal; + /* KVM_EXIT_OSI */ + struct { + __u64 gprs[32]; + } osi; + /* Fix the size of the union. */ + char padding[256]; + }; +}; + +/* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */ + +struct kvm_coalesced_mmio_zone { + __u64 addr; + __u32 size; + __u32 pad; +}; + +struct kvm_coalesced_mmio { + __u64 phys_addr; + __u32 len; + __u32 pad; + __u8 data[8]; +}; + +struct kvm_coalesced_mmio_ring { + __u32 first, last; + struct kvm_coalesced_mmio coalesced_mmio[0]; +}; + +#define KVM_COALESCED_MMIO_MAX \ + ((PAGE_SIZE - sizeof(struct kvm_coalesced_mmio_ring)) / \ + sizeof(struct kvm_coalesced_mmio)) + +/* for KVM_TRANSLATE */ +struct kvm_translation { + /* in */ + __u64 linear_address; + + /* out */ + __u64 physical_address; + __u8 valid; + __u8 writeable; + __u8 usermode; + __u8 pad[5]; +}; + +/* for KVM_INTERRUPT */ +struct kvm_interrupt { + /* in */ + __u32 irq; +}; + +/* for KVM_GET_DIRTY_LOG */ +struct kvm_dirty_log { + __u32 slot; + __u32 padding1; + union { + void *dirty_bitmap; /* one bit per page */ + __u64 padding2; + }; +}; + +/* for KVM_SET_SIGNAL_MASK */ +struct kvm_signal_mask { + __u32 len; + __u8 sigset[0]; +}; + +/* for KVM_TPR_ACCESS_REPORTING */ +struct kvm_tpr_access_ctl { + __u32 enabled; + __u32 flags; + __u32 reserved[8]; +}; + +/* for KVM_SET_VAPIC_ADDR */ +struct kvm_vapic_addr { + __u64 vapic_addr; +}; + +/* for KVM_SET_MPSTATE */ + +#define KVM_MP_STATE_RUNNABLE 0 +#define KVM_MP_STATE_UNINITIALIZED 1 +#define KVM_MP_STATE_INIT_RECEIVED 2 +#define KVM_MP_STATE_HALTED 3 +#define KVM_MP_STATE_SIPI_RECEIVED 4 + +struct kvm_mp_state { + __u32 mp_state; +}; + +struct kvm_s390_psw { + __u64 mask; + __u64 addr; +}; + +/* valid values for type in kvm_s390_interrupt */ +#define KVM_S390_SIGP_STOP 0xfffe0000u +#define KVM_S390_PROGRAM_INT 0xfffe0001u +#define KVM_S390_SIGP_SET_PREFIX 0xfffe0002u +#define KVM_S390_RESTART 0xfffe0003u +#define KVM_S390_INT_VIRTIO 0xffff2603u +#define KVM_S390_INT_SERVICE 0xffff2401u +#define KVM_S390_INT_EMERGENCY 0xffff1201u + +struct kvm_s390_interrupt { + __u32 type; + __u32 parm; + __u64 parm64; +}; + +/* for KVM_SET_GUEST_DEBUG */ + +#define KVM_GUESTDBG_ENABLE 0x00000001 +#define KVM_GUESTDBG_SINGLESTEP 0x00000002 + +struct kvm_guest_debug { + __u32 control; + __u32 pad; + struct kvm_guest_debug_arch arch; +}; + +enum { + kvm_ioeventfd_flag_nr_datamatch, + kvm_ioeventfd_flag_nr_pio, + kvm_ioeventfd_flag_nr_deassign, + kvm_ioeventfd_flag_nr_max, +}; + +#define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch) +#define KVM_IOEVENTFD_FLAG_PIO (1 << kvm_ioeventfd_flag_nr_pio) +#define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign) + +#define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1) + +struct kvm_ioeventfd { + __u64 datamatch; + __u64 addr; /* legal pio/mmio address */ + __u32 len; /* 1, 2, 4, or 8 bytes */ + __s32 fd; + __u32 flags; + __u8 pad[36]; +}; + +/* for KVM_ENABLE_CAP */ +struct kvm_enable_cap { + /* in */ + __u32 cap; + __u32 flags; + __u64 args[4]; + __u8 pad[64]; +}; + +/* for KVM_PPC_GET_PVINFO */ +struct kvm_ppc_pvinfo { + /* out */ + __u32 flags; + __u32 hcall[4]; + __u8 pad[108]; +}; + +#define KVMIO 0xAE + +/* + * ioctls for /dev/kvm fds: + */ +#define KVM_GET_API_VERSION _IO(KVMIO, 0x00) +#define KVM_CREATE_VM _IO(KVMIO, 0x01) /* returns a VM fd */ +#define KVM_GET_MSR_INDEX_LIST _IOWR(KVMIO, 0x02, struct kvm_msr_list) + +#define KVM_S390_ENABLE_SIE _IO(KVMIO, 0x06) +/* + * Check if a kvm extension is available. Argument is extension number, + * return is 1 (yes) or 0 (no, sorry). + */ +#define KVM_CHECK_EXTENSION _IO(KVMIO, 0x03) +/* + * Get size for mmap(vcpu_fd) + */ +#define KVM_GET_VCPU_MMAP_SIZE _IO(KVMIO, 0x04) /* in bytes */ +#define KVM_GET_SUPPORTED_CPUID _IOWR(KVMIO, 0x05, struct kvm_cpuid2) +#define KVM_TRACE_ENABLE __KVM_DEPRECATED_MAIN_W_0x06 +#define KVM_TRACE_PAUSE __KVM_DEPRECATED_MAIN_0x07 +#define KVM_TRACE_DISABLE __KVM_DEPRECATED_MAIN_0x08 + +/* + * Extension capability list. + */ +#define KVM_CAP_IRQCHIP 0 +#define KVM_CAP_HLT 1 +#define KVM_CAP_MMU_SHADOW_CACHE_CONTROL 2 +#define KVM_CAP_USER_MEMORY 3 +#define KVM_CAP_SET_TSS_ADDR 4 +#define KVM_CAP_VAPIC 6 +#define KVM_CAP_EXT_CPUID 7 +#define KVM_CAP_CLOCKSOURCE 8 +#define KVM_CAP_NR_VCPUS 9 /* returns max vcpus per vm */ +#define KVM_CAP_NR_MEMSLOTS 10 /* returns max memory slots per vm */ +#define KVM_CAP_PIT 11 +#define KVM_CAP_NOP_IO_DELAY 12 +#define KVM_CAP_PV_MMU 13 +#define KVM_CAP_MP_STATE 14 +#define KVM_CAP_COALESCED_MMIO 15 +#define KVM_CAP_SYNC_MMU 16 /* Changes to host mmap are reflected in guest */ +#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT +#define KVM_CAP_DEVICE_ASSIGNMENT 17 +#endif +#define KVM_CAP_IOMMU 18 +#ifdef __KVM_HAVE_MSI +#define KVM_CAP_DEVICE_MSI 20 +#endif +/* Bug in KVM_SET_USER_MEMORY_REGION fixed: */ +#define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21 +#ifdef __KVM_HAVE_USER_NMI +#define KVM_CAP_USER_NMI 22 +#endif +#ifdef __KVM_HAVE_GUEST_DEBUG +#define KVM_CAP_SET_GUEST_DEBUG 23 +#endif +#ifdef __KVM_HAVE_PIT +#define KVM_CAP_REINJECT_CONTROL 24 +#endif +#ifdef __KVM_HAVE_IOAPIC +#define KVM_CAP_IRQ_ROUTING 25 +#endif +#define KVM_CAP_IRQ_INJECT_STATUS 26 +#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT +#define KVM_CAP_DEVICE_DEASSIGNMENT 27 +#endif +#ifdef __KVM_HAVE_MSIX +#define KVM_CAP_DEVICE_MSIX 28 +#endif +#define KVM_CAP_ASSIGN_DEV_IRQ 29 +/* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */ +#define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30 +#ifdef __KVM_HAVE_MCE +#define KVM_CAP_MCE 31 +#endif +#define KVM_CAP_IRQFD 32 +#ifdef __KVM_HAVE_PIT +#define KVM_CAP_PIT2 33 +#endif +#define KVM_CAP_SET_BOOT_CPU_ID 34 +#ifdef __KVM_HAVE_PIT_STATE2 +#define KVM_CAP_PIT_STATE2 35 +#endif +#define KVM_CAP_IOEVENTFD 36 +#define KVM_CAP_SET_IDENTITY_MAP_ADDR 37 +#ifdef __KVM_HAVE_XEN_HVM +#define KVM_CAP_XEN_HVM 38 +#endif +#define KVM_CAP_ADJUST_CLOCK 39 +#define KVM_CAP_INTERNAL_ERROR_DATA 40 +#ifdef __KVM_HAVE_VCPU_EVENTS +#define KVM_CAP_VCPU_EVENTS 41 +#endif +#define KVM_CAP_S390_PSW 42 +#define KVM_CAP_PPC_SEGSTATE 43 +#define KVM_CAP_HYPERV 44 +#define KVM_CAP_HYPERV_VAPIC 45 +#define KVM_CAP_HYPERV_SPIN 46 +#define KVM_CAP_PCI_SEGMENT 47 +#define KVM_CAP_PPC_PAIRED_SINGLES 48 +#define KVM_CAP_INTR_SHADOW 49 +#ifdef __KVM_HAVE_DEBUGREGS +#define KVM_CAP_DEBUGREGS 50 +#endif +#define KVM_CAP_X86_ROBUST_SINGLESTEP 51 +#define KVM_CAP_PPC_OSI 52 +#define KVM_CAP_PPC_UNSET_IRQ 53 +#define KVM_CAP_ENABLE_CAP 54 +#ifdef __KVM_HAVE_XSAVE +#define KVM_CAP_XSAVE 55 +#endif +#ifdef __KVM_HAVE_XCRS +#define KVM_CAP_XCRS 56 +#endif +#define KVM_CAP_PPC_GET_PVINFO 57 +#define KVM_CAP_PPC_IRQ_LEVEL 58 +#define KVM_CAP_ASYNC_PF 59 + +#ifdef KVM_CAP_IRQ_ROUTING + +struct kvm_irq_routing_irqchip { + __u32 irqchip; + __u32 pin; +}; + +struct kvm_irq_routing_msi { + __u32 address_lo; + __u32 address_hi; + __u32 data; + __u32 pad; +}; + +/* gsi routing entry types */ +#define KVM_IRQ_ROUTING_IRQCHIP 1 +#define KVM_IRQ_ROUTING_MSI 2 + +struct kvm_irq_routing_entry { + __u32 gsi; + __u32 type; + __u32 flags; + __u32 pad; + union { + struct kvm_irq_routing_irqchip irqchip; + struct kvm_irq_routing_msi msi; + __u32 pad[8]; + } u; +}; + +struct kvm_irq_routing { + __u32 nr; + __u32 flags; + struct kvm_irq_routing_entry entries[0]; +}; + +#endif + +#ifdef KVM_CAP_MCE +/* x86 MCE */ +struct kvm_x86_mce { + __u64 status; + __u64 addr; + __u64 misc; + __u64 mcg_status; + __u8 bank; + __u8 pad1[7]; + __u64 pad2[3]; +}; +#endif + +#ifdef KVM_CAP_XEN_HVM +struct kvm_xen_hvm_config { + __u32 flags; + __u32 msr; + __u64 blob_addr_32; + __u64 blob_addr_64; + __u8 blob_size_32; + __u8 blob_size_64; + __u8 pad2[30]; +}; +#endif + +#define KVM_IRQFD_FLAG_DEASSIGN (1 << 0) + +struct kvm_irqfd { + __u32 fd; + __u32 gsi; + __u32 flags; + __u8 pad[20]; +}; + +struct kvm_clock_data { + __u64 clock; + __u32 flags; + __u32 pad[9]; +}; + +/* + * ioctls for VM fds + */ +#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) +/* + * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns + * a vcpu fd. + */ +#define KVM_CREATE_VCPU _IO(KVMIO, 0x41) +#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) +/* KVM_SET_MEMORY_ALIAS is obsolete: */ +#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) +#define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) +#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) +#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46, \ + struct kvm_userspace_memory_region) +#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) +#define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64) +/* Device model IOC */ +#define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) +#define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level) +#define KVM_GET_IRQCHIP _IOWR(KVMIO, 0x62, struct kvm_irqchip) +#define KVM_SET_IRQCHIP _IOR(KVMIO, 0x63, struct kvm_irqchip) +#define KVM_CREATE_PIT _IO(KVMIO, 0x64) +#define KVM_GET_PIT _IOWR(KVMIO, 0x65, struct kvm_pit_state) +#define KVM_SET_PIT _IOR(KVMIO, 0x66, struct kvm_pit_state) +#define KVM_IRQ_LINE_STATUS _IOWR(KVMIO, 0x67, struct kvm_irq_level) +#define KVM_REGISTER_COALESCED_MMIO \ + _IOW(KVMIO, 0x67, struct kvm_coalesced_mmio_zone) +#define KVM_UNREGISTER_COALESCED_MMIO \ + _IOW(KVMIO, 0x68, struct kvm_coalesced_mmio_zone) +#define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \ + struct kvm_assigned_pci_dev) +#define KVM_SET_GSI_ROUTING _IOW(KVMIO, 0x6a, struct kvm_irq_routing) +/* deprecated, replaced by KVM_ASSIGN_DEV_IRQ */ +#define KVM_ASSIGN_IRQ __KVM_DEPRECATED_VM_R_0x70 +#define KVM_ASSIGN_DEV_IRQ _IOW(KVMIO, 0x70, struct kvm_assigned_irq) +#define KVM_REINJECT_CONTROL _IO(KVMIO, 0x71) +#define KVM_DEASSIGN_PCI_DEVICE _IOW(KVMIO, 0x72, \ + struct kvm_assigned_pci_dev) +#define KVM_ASSIGN_SET_MSIX_NR _IOW(KVMIO, 0x73, \ + struct kvm_assigned_msix_nr) +#define KVM_ASSIGN_SET_MSIX_ENTRY _IOW(KVMIO, 0x74, \ + struct kvm_assigned_msix_entry) +#define KVM_DEASSIGN_DEV_IRQ _IOW(KVMIO, 0x75, struct kvm_assigned_irq) +#define KVM_IRQFD _IOW(KVMIO, 0x76, struct kvm_irqfd) +#define KVM_CREATE_PIT2 _IOW(KVMIO, 0x77, struct kvm_pit_config) +#define KVM_SET_BOOT_CPU_ID _IO(KVMIO, 0x78) +#define KVM_IOEVENTFD _IOW(KVMIO, 0x79, struct kvm_ioeventfd) +#define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config) +#define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data) +#define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data) +/* Available with KVM_CAP_PIT_STATE2 */ +#define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) +#define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) +/* Available with KVM_CAP_PPC_GET_PVINFO */ +#define KVM_PPC_GET_PVINFO _IOW(KVMIO, 0xa1, struct kvm_ppc_pvinfo) + +/* + * ioctls for vcpu fds + */ +#define KVM_RUN _IO(KVMIO, 0x80) +#define KVM_GET_REGS _IOR(KVMIO, 0x81, struct kvm_regs) +#define KVM_SET_REGS _IOW(KVMIO, 0x82, struct kvm_regs) +#define KVM_GET_SREGS _IOR(KVMIO, 0x83, struct kvm_sregs) +#define KVM_SET_SREGS _IOW(KVMIO, 0x84, struct kvm_sregs) +#define KVM_TRANSLATE _IOWR(KVMIO, 0x85, struct kvm_translation) +#define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt) +/* KVM_DEBUG_GUEST is no longer supported, use KVM_SET_GUEST_DEBUG instead */ +#define KVM_DEBUG_GUEST __KVM_DEPRECATED_VCPU_W_0x87 +#define KVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs) +#define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs) +#define KVM_SET_CPUID _IOW(KVMIO, 0x8a, struct kvm_cpuid) +#define KVM_SET_SIGNAL_MASK _IOW(KVMIO, 0x8b, struct kvm_signal_mask) +#define KVM_GET_FPU _IOR(KVMIO, 0x8c, struct kvm_fpu) +#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu) +#define KVM_GET_LAPIC _IOR(KVMIO, 0x8e, struct kvm_lapic_state) +#define KVM_SET_LAPIC _IOW(KVMIO, 0x8f, struct kvm_lapic_state) +#define KVM_SET_CPUID2 _IOW(KVMIO, 0x90, struct kvm_cpuid2) +#define KVM_GET_CPUID2 _IOWR(KVMIO, 0x91, struct kvm_cpuid2) +/* Available with KVM_CAP_VAPIC */ +#define KVM_TPR_ACCESS_REPORTING _IOWR(KVMIO, 0x92, struct kvm_tpr_access_ctl) +/* Available with KVM_CAP_VAPIC */ +#define KVM_SET_VAPIC_ADDR _IOW(KVMIO, 0x93, struct kvm_vapic_addr) +/* valid for virtual machine (for floating interrupt)_and_ vcpu */ +#define KVM_S390_INTERRUPT _IOW(KVMIO, 0x94, struct kvm_s390_interrupt) +/* store status for s390 */ +#define KVM_S390_STORE_STATUS_NOADDR (-1ul) +#define KVM_S390_STORE_STATUS_PREFIXED (-2ul) +#define KVM_S390_STORE_STATUS _IOW(KVMIO, 0x95, unsigned long) +/* initial ipl psw for s390 */ +#define KVM_S390_SET_INITIAL_PSW _IOW(KVMIO, 0x96, struct kvm_s390_psw) +/* initial reset for s390 */ +#define KVM_S390_INITIAL_RESET _IO(KVMIO, 0x97) +#define KVM_GET_MP_STATE _IOR(KVMIO, 0x98, struct kvm_mp_state) +#define KVM_SET_MP_STATE _IOW(KVMIO, 0x99, struct kvm_mp_state) +/* Available with KVM_CAP_NMI */ +#define KVM_NMI _IO(KVMIO, 0x9a) +/* Available with KVM_CAP_SET_GUEST_DEBUG */ +#define KVM_SET_GUEST_DEBUG _IOW(KVMIO, 0x9b, struct kvm_guest_debug) +/* MCE for x86 */ +#define KVM_X86_SETUP_MCE _IOW(KVMIO, 0x9c, __u64) +#define KVM_X86_GET_MCE_CAP_SUPPORTED _IOR(KVMIO, 0x9d, __u64) +#define KVM_X86_SET_MCE _IOW(KVMIO, 0x9e, struct kvm_x86_mce) +/* IA64 stack access */ +#define KVM_IA64_VCPU_GET_STACK _IOR(KVMIO, 0x9a, void *) +#define KVM_IA64_VCPU_SET_STACK _IOW(KVMIO, 0x9b, void *) +/* Available with KVM_CAP_VCPU_EVENTS */ +#define KVM_GET_VCPU_EVENTS _IOR(KVMIO, 0x9f, struct kvm_vcpu_events) +#define KVM_SET_VCPU_EVENTS _IOW(KVMIO, 0xa0, struct kvm_vcpu_events) +/* Available with KVM_CAP_DEBUGREGS */ +#define KVM_GET_DEBUGREGS _IOR(KVMIO, 0xa1, struct kvm_debugregs) +#define KVM_SET_DEBUGREGS _IOW(KVMIO, 0xa2, struct kvm_debugregs) +#define KVM_ENABLE_CAP _IOW(KVMIO, 0xa3, struct kvm_enable_cap) +/* Available with KVM_CAP_XSAVE */ +#define KVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave) +#define KVM_SET_XSAVE _IOW(KVMIO, 0xa5, struct kvm_xsave) +/* Available with KVM_CAP_XCRS */ +#define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs) +#define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs) + +#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) + +struct kvm_assigned_pci_dev { + __u32 assigned_dev_id; + __u32 busnr; + __u32 devfn; + __u32 flags; + __u32 segnr; + union { + __u32 reserved[11]; + }; +}; + +#define KVM_DEV_IRQ_HOST_INTX (1 << 0) +#define KVM_DEV_IRQ_HOST_MSI (1 << 1) +#define KVM_DEV_IRQ_HOST_MSIX (1 << 2) + +#define KVM_DEV_IRQ_GUEST_INTX (1 << 8) +#define KVM_DEV_IRQ_GUEST_MSI (1 << 9) +#define KVM_DEV_IRQ_GUEST_MSIX (1 << 10) + +#define KVM_DEV_IRQ_HOST_MASK 0x00ff +#define KVM_DEV_IRQ_GUEST_MASK 0xff00 + +struct kvm_assigned_irq { + __u32 assigned_dev_id; + __u32 host_irq; + __u32 guest_irq; + __u32 flags; + union { + struct { + __u32 addr_lo; + __u32 addr_hi; + __u32 data; + } guest_msi; + __u32 reserved[12]; + }; +}; + + +struct kvm_assigned_msix_nr { + __u32 assigned_dev_id; + __u16 entry_nr; + __u16 padding; +}; + +#define KVM_MAX_MSIX_PER_DEV 256 +struct kvm_assigned_msix_entry { + __u32 assigned_dev_id; + __u32 gsi; + __u16 entry; /* The index of entry in the MSI-X table */ + __u16 padding[3]; +}; + +#endif /* __LINUX_KVM_H */ diff --git a/linux/include/linux/kvm_host.h b/linux/include/linux/kvm_host.h new file mode 100644 index 0000000..84546bd --- /dev/null +++ b/linux/include/linux/kvm_host.h @@ -0,0 +1,778 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +#ifndef __KVM_HOST_H +#define __KVM_HOST_H + +/* + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/types.h> +#include <linux/hardirq.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/spinlock.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/preempt.h> +#include <linux/msi.h> +#include <linux/slab.h> +#include <linux/rcupdate.h> +#include <asm/signal.h> + +#include <linux/kvm.h> +#include <linux/kvm_para.h> + +#include <linux/kvm_types.h> + +#include <asm/kvm_host.h> + +/* + * vcpu->requests bit members + */ +#define KVM_REQ_TLB_FLUSH 0 +#define KVM_REQ_MIGRATE_TIMER 1 +#define KVM_REQ_REPORT_TPR_ACCESS 2 +#define KVM_REQ_MMU_RELOAD 3 +#define KVM_REQ_TRIPLE_FAULT 4 +#define KVM_REQ_PENDING_TIMER 5 +#define KVM_REQ_UNHALT 6 +#define KVM_REQ_MMU_SYNC 7 +#define KVM_REQ_CLOCK_UPDATE 8 +#define KVM_REQ_KICK 9 +#define KVM_REQ_DEACTIVATE_FPU 10 +#define KVM_REQ_EVENT 11 +#define KVM_REQ_APF_HALT 12 + +#define KVM_USERSPACE_IRQ_SOURCE_ID 0 + +struct kvm; +struct kvm_vcpu; +extern struct kmem_cache *kvm_vcpu_cache; + +/* + * It would be nice to use something smarter than a linear search, TBD... + * Thankfully we dont expect many devices to register (famous last words :), + * so until then it will suffice. At least its abstracted so we can change + * in one place. + */ +struct kvm_io_bus { + int dev_count; +#define NR_IOBUS_DEVS 200 + struct kvm_io_device *devs[NR_IOBUS_DEVS]; +}; + +enum kvm_bus { + KVM_MMIO_BUS, + KVM_PIO_BUS, + KVM_NR_BUSES +}; + +int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, + int len, const void *val); +int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, + void *val); +int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev); +int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev); + +#ifdef CONFIG_KVM_ASYNC_PF +struct kvm_async_pf { + struct work_struct work; + struct list_head link; + struct list_head queue; + struct kvm_vcpu *vcpu; + struct mm_struct *mm; + gva_t gva; + unsigned long addr; + struct kvm_arch_async_pf arch; + struct page *page; + bool done; +}; + +void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); +void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu); +int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, + struct kvm_arch_async_pf *arch); +int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); +#endif + +struct kvm_vcpu { + struct kvm *kvm; +#ifdef CONFIG_PREEMPT_NOTIFIERS + struct preempt_notifier preempt_notifier; +#endif + int vcpu_id; + struct mutex mutex; + int cpu; + atomic_t guest_mode; + struct kvm_run *run; + unsigned long requests; + unsigned long guest_debug; + int srcu_idx; + + int fpu_active; + int guest_fpu_loaded, guest_xcr0_loaded; + wait_queue_head_t wq; + int sigset_active; + sigset_t sigset; + struct kvm_vcpu_stat stat; + +#ifdef CONFIG_HAS_IOMEM + int mmio_needed; + int mmio_read_completed; + int mmio_is_write; + int mmio_size; + unsigned char mmio_data[8]; + gpa_t mmio_phys_addr; +#endif + +#ifdef CONFIG_KVM_ASYNC_PF + struct { + u32 queued; + struct list_head queue; + struct list_head done; + spinlock_t lock; + } async_pf; +#endif + + struct kvm_vcpu_arch arch; +}; + +/* + * Some of the bitops functions do not support too long bitmaps. + * This number must be determined not to exceed such limits. + */ +#define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1) + +struct kvm_lpage_info { + unsigned long rmap_pde; + int write_count; +}; + +struct kvm_memory_slot { + gfn_t base_gfn; + unsigned long npages; + unsigned long flags; + unsigned long *rmap; + unsigned long *dirty_bitmap; + unsigned long *dirty_bitmap_head; + struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; + unsigned long userspace_addr; + int user_alloc; + int id; +}; + +static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot) +{ + return ALIGN(memslot->npages, BITS_PER_LONG) / 8; +} + +struct kvm_kernel_irq_routing_entry { + u32 gsi; + u32 type; + int (*set)(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level); + union { + struct { + unsigned irqchip; + unsigned pin; + } irqchip; + struct msi_msg msi; + }; + struct hlist_node link; +}; + +#ifdef __KVM_HAVE_IOAPIC + +struct kvm_irq_routing_table { + int chip[KVM_NR_IRQCHIPS][KVM_IOAPIC_NUM_PINS]; + struct kvm_kernel_irq_routing_entry *rt_entries; + u32 nr_rt_entries; + /* + * Array indexed by gsi. Each entry contains list of irq chips + * the gsi is connected to. + */ + struct hlist_head map[0]; +}; + +#else + +struct kvm_irq_routing_table {}; + +#endif + +struct kvm_memslots { + int nmemslots; + u64 generation; + struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS + + KVM_PRIVATE_MEM_SLOTS]; +}; + +struct kvm { + spinlock_t mmu_lock; + raw_spinlock_t requests_lock; + struct mutex slots_lock; + struct mm_struct *mm; /* userspace tied to this vm */ + struct kvm_memslots *memslots; + struct srcu_struct srcu; +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + u32 bsp_vcpu_id; + struct kvm_vcpu *bsp_vcpu; +#endif + struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + atomic_t online_vcpus; + struct list_head vm_list; + struct mutex lock; + struct kvm_io_bus *buses[KVM_NR_BUSES]; +#ifdef CONFIG_HAVE_KVM_EVENTFD + struct { + spinlock_t lock; + struct list_head items; + } irqfds; + struct list_head ioeventfds; +#endif + struct kvm_vm_stat stat; + struct kvm_arch arch; + atomic_t users_count; +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET + struct kvm_coalesced_mmio_dev *coalesced_mmio_dev; + struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; +#endif + + struct mutex irq_lock; +#ifdef CONFIG_HAVE_KVM_IRQCHIP + /* + * Update side is protected by irq_lock and, + * if configured, irqfds.lock. + */ + struct kvm_irq_routing_table __rcu *irq_routing; + struct hlist_head mask_notifier_list; + struct hlist_head irq_ack_notifier_list; +#endif + +#ifdef KVM_ARCH_WANT_MMU_NOTIFIER + struct mmu_notifier mmu_notifier; + unsigned long mmu_notifier_seq; + long mmu_notifier_count; +#endif + long tlbs_dirty; +}; + +/* The guest did something we don't support. */ +#define pr_unimpl(vcpu, fmt, ...) \ + do { \ + if (printk_ratelimit()) \ + printk(KERN_ERR "kvm: %i: cpu%i " fmt, \ + current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \ + } while (0) + +#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) +#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) + +static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) +{ + smp_rmb(); + return kvm->vcpus[i]; +} + +#define kvm_for_each_vcpu(idx, vcpup, kvm) \ + for (idx = 0, vcpup = kvm_get_vcpu(kvm, idx); \ + idx < atomic_read(&kvm->online_vcpus) && vcpup; \ + vcpup = kvm_get_vcpu(kvm, ++idx)) + +int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); +void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); + +void vcpu_load(struct kvm_vcpu *vcpu); +void vcpu_put(struct kvm_vcpu *vcpu); + +int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, + struct module *module); +void kvm_exit(void); + +void kvm_get_kvm(struct kvm *kvm); +void kvm_put_kvm(struct kvm *kvm); + +static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm) +{ + return rcu_dereference_check(kvm->memslots, + srcu_read_lock_held(&kvm->srcu) + || lockdep_is_held(&kvm->slots_lock)); +} + +#define HPA_MSB ((sizeof(hpa_t) * 8) - 1) +#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) +static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } + +extern struct page *bad_page; +extern pfn_t bad_pfn; + +int is_error_page(struct page *page); +int is_error_pfn(pfn_t pfn); +int is_hwpoison_pfn(pfn_t pfn); +int is_fault_pfn(pfn_t pfn); +int kvm_is_error_hva(unsigned long addr); +int kvm_set_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + int user_alloc); +int __kvm_set_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + int user_alloc); +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, + int user_alloc); +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc); +void kvm_disable_largepages(void); +void kvm_arch_flush_shadow(struct kvm *kvm); + +int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, + int nr_pages); + +struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); +unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); +void kvm_release_page_clean(struct page *page); +void kvm_release_page_dirty(struct page *page); +void kvm_set_page_dirty(struct page *page); +void kvm_set_page_accessed(struct page *page); + +pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr); +pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); +pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, + bool write_fault, bool *writable); +pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); +pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, + bool *writable); +pfn_t gfn_to_pfn_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn); +int memslot_id(struct kvm *kvm, gfn_t gfn); +void kvm_release_pfn_dirty(pfn_t); +void kvm_release_pfn_clean(pfn_t pfn); +void kvm_set_pfn_dirty(pfn_t pfn); +void kvm_set_pfn_accessed(pfn_t pfn); +void kvm_get_pfn(pfn_t pfn); + +int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, + int len); +int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, + unsigned long len); +int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); +int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, + int offset, int len); +int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, + unsigned long len); +int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len); +int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + gpa_t gpa); +int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); +int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); +int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); +unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn); +void mark_page_dirty(struct kvm *kvm, gfn_t gfn); +void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, + gfn_t gfn); + +void kvm_vcpu_block(struct kvm_vcpu *vcpu); +void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); +void kvm_resched(struct kvm_vcpu *vcpu); +void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); +void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); + +void kvm_flush_remote_tlbs(struct kvm *kvm); +void kvm_reload_remote_mmus(struct kvm *kvm); + +long kvm_arch_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg); +long kvm_arch_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg); + +int kvm_dev_ioctl_check_extension(long ext); + +int kvm_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log, int *is_dirty); +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log); + +int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, + struct + kvm_userspace_memory_region *mem, + int user_alloc); +long kvm_arch_vm_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg); + +int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); +int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); + +int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, + struct kvm_translation *tr); + +int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); +int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs); +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs); +int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state); +int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state); +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg); +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); + +int kvm_arch_init(void *opaque); +void kvm_arch_exit(void); + +int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu); + +void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); +struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); +int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); + +int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu); +int kvm_arch_hardware_enable(void *garbage); +void kvm_arch_hardware_disable(void *garbage); +int kvm_arch_hardware_setup(void); +void kvm_arch_hardware_unsetup(void); +void kvm_arch_check_processor_compat(void *rtn); +int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); + +void kvm_free_physmem(struct kvm *kvm); + +#ifndef __KVM_HAVE_ARCH_VM_ALLOC +static inline struct kvm *kvm_arch_alloc_vm(void) +{ + return kzalloc(sizeof(struct kvm), GFP_KERNEL); +} + +static inline void kvm_arch_free_vm(struct kvm *kvm) +{ + kfree(kvm); +} +#endif + +int kvm_arch_init_vm(struct kvm *kvm); +void kvm_arch_destroy_vm(struct kvm *kvm); +void kvm_free_all_assigned_devices(struct kvm *kvm); +void kvm_arch_sync_events(struct kvm *kvm); + +int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); +void kvm_vcpu_kick(struct kvm_vcpu *vcpu); + +int kvm_is_mmio_pfn(pfn_t pfn); + +struct kvm_irq_ack_notifier { + struct hlist_node link; + unsigned gsi; + void (*irq_acked)(struct kvm_irq_ack_notifier *kian); +}; + +struct kvm_assigned_dev_kernel { + struct kvm_irq_ack_notifier ack_notifier; + struct list_head list; + int assigned_dev_id; + int host_segnr; + int host_busnr; + int host_devfn; + unsigned int entries_nr; + int host_irq; + bool host_irq_disabled; + struct msix_entry *host_msix_entries; + int guest_irq; + struct msix_entry *guest_msix_entries; + unsigned long irq_requested_type; + int irq_source_id; + int flags; + struct pci_dev *dev; + struct kvm *kvm; + spinlock_t intx_lock; + char irq_name[32]; +}; + +struct kvm_irq_mask_notifier { + void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked); + int irq; + struct hlist_node link; +}; + +void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn); +void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn); +void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, + bool mask); + +#ifdef __KVM_HAVE_IOAPIC +void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, + union kvm_ioapic_redirect_entry *entry, + unsigned long *deliver_bitmask); +#endif +int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level); +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, + int irq_source_id, int level); +void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); +void kvm_register_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian); +void kvm_unregister_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian); +int kvm_request_irq_source_id(struct kvm *kvm); +void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); + +/* For vcpu->arch.iommu_flags */ +#define KVM_IOMMU_CACHE_COHERENCY 0x1 + +#ifdef CONFIG_IOMMU_API +int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot); +int kvm_iommu_map_guest(struct kvm *kvm); +int kvm_iommu_unmap_guest(struct kvm *kvm); +int kvm_assign_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev); +int kvm_deassign_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev); +#else /* CONFIG_IOMMU_API */ +static inline int kvm_iommu_map_pages(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + return 0; +} + +static inline int kvm_iommu_map_guest(struct kvm *kvm) +{ + return -ENODEV; +} + +static inline int kvm_iommu_unmap_guest(struct kvm *kvm) +{ + return 0; +} + +static inline int kvm_assign_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + return 0; +} + +static inline int kvm_deassign_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + return 0; +} +#endif /* CONFIG_IOMMU_API */ + +static inline void kvm_guest_enter(void) +{ + account_system_vtime(current); + current->flags |= PF_VCPU; +} + +static inline void kvm_guest_exit(void) +{ + account_system_vtime(current); + current->flags &= ~PF_VCPU; +} + +static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, + gfn_t gfn) +{ + return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE; +} + +static inline gpa_t gfn_to_gpa(gfn_t gfn) +{ + return (gpa_t)gfn << PAGE_SHIFT; +} + +static inline gfn_t gpa_to_gfn(gpa_t gpa) +{ + return (gfn_t)(gpa >> PAGE_SHIFT); +} + +static inline hpa_t pfn_to_hpa(pfn_t pfn) +{ + return (hpa_t)pfn << PAGE_SHIFT; +} + +static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu) +{ + set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); +} + +enum kvm_stat_kind { + KVM_STAT_VM, + KVM_STAT_VCPU, +}; + +struct kvm_stats_debugfs_item { + const char *name; + int offset; + enum kvm_stat_kind kind; + struct dentry *dentry; +}; +extern struct kvm_stats_debugfs_item debugfs_entries[]; +extern struct dentry *kvm_debugfs_dir; + +#ifdef KVM_ARCH_WANT_MMU_NOTIFIER +static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq) +{ + if (unlikely(vcpu->kvm->mmu_notifier_count)) + return 1; + /* + * Both reads happen under the mmu_lock and both values are + * modified under mmu_lock, so there's no need of smb_rmb() + * here in between, otherwise mmu_notifier_count should be + * read before mmu_notifier_seq, see + * mmu_notifier_invalidate_range_end write side. + */ + if (vcpu->kvm->mmu_notifier_seq != mmu_seq) + return 1; + return 0; +} +#endif + +#ifdef CONFIG_HAVE_KVM_IRQCHIP + +#define KVM_MAX_IRQ_ROUTES 1024 + +int kvm_setup_default_irq_routing(struct kvm *kvm); +int kvm_set_irq_routing(struct kvm *kvm, + const struct kvm_irq_routing_entry *entries, + unsigned nr, + unsigned flags); +void kvm_free_irq_routing(struct kvm *kvm); + +#else + +static inline void kvm_free_irq_routing(struct kvm *kvm) {} + +#endif + +#ifdef CONFIG_HAVE_KVM_EVENTFD + +void kvm_eventfd_init(struct kvm *kvm); +int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags); +void kvm_irqfd_release(struct kvm *kvm); +void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *); +int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); + +#else + +static inline void kvm_eventfd_init(struct kvm *kvm) {} + +static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) +{ + return -EINVAL; +} + +static inline void kvm_irqfd_release(struct kvm *kvm) {} + +#ifdef CONFIG_HAVE_KVM_IRQCHIP +static inline void kvm_irq_routing_update(struct kvm *kvm, + struct kvm_irq_routing_table *irq_rt) +{ + rcu_assign_pointer(kvm->irq_routing, irq_rt); +} +#endif + +static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) +{ + return -ENOSYS; +} + +#endif /* CONFIG_HAVE_KVM_EVENTFD */ + +#ifdef CONFIG_KVM_APIC_ARCHITECTURE +static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id; +} +#endif + +#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT + +long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, + unsigned long arg); + +#else + +static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, + unsigned long arg) +{ + return -ENOTTY; +} + +#endif + +static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) +{ + set_bit(req, &vcpu->requests); +} + +static inline bool kvm_make_check_request(int req, struct kvm_vcpu *vcpu) +{ + return test_and_set_bit(req, &vcpu->requests); +} + +static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) +{ + if (test_bit(req, &vcpu->requests)) { + clear_bit(req, &vcpu->requests); + return true; + } else { + return false; + } +} + +#endif + diff --git a/linux/include/linux/kvm_para.h b/linux/include/linux/kvm_para.h new file mode 100644 index 0000000..e13e53d --- /dev/null +++ b/linux/include/linux/kvm_para.h @@ -0,0 +1,78 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +#ifndef __LINUX_KVM_PARA_H +#define __LINUX_KVM_PARA_H + +/* + * This header file provides a method for making a hypercall to the host + * Architectures should define: + * - kvm_hypercall0, kvm_hypercall1... + * - kvm_arch_para_features + * - kvm_para_available + */ + +/* Return values for hypercalls */ +#define KVM_ENOSYS 1000 +#define KVM_EFAULT EFAULT +#define KVM_E2BIG E2BIG +#define KVM_EPERM EPERM + +#define KVM_HC_VAPIC_POLL_IRQ 1 +#define KVM_HC_MMU_OP 2 +#define KVM_HC_FEATURES 3 +#define KVM_HC_PPC_MAP_MAGIC_PAGE 4 + +/* + * hypercalls use architecture specific + */ +#include <asm/kvm_para.h> + +#ifdef __KERNEL__ + +static inline int kvm_para_has_feature(unsigned int feature) +{ + if (kvm_arch_para_features() & (1UL << feature)) + return 1; + return 0; +} +#endif /* __KERNEL__ */ +#endif /* __LINUX_KVM_PARA_H */ + diff --git a/linux/include/linux/kvm_types.h b/linux/include/linux/kvm_types.h new file mode 100644 index 0000000..e35297a --- /dev/null +++ b/linux/include/linux/kvm_types.h @@ -0,0 +1,117 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + */ + +#ifndef __KVM_TYPES_H__ +#define __KVM_TYPES_H__ + +#include <asm/types.h> + +/* + * Address types: + * + * gva - guest virtual address + * gpa - guest physical address + * gfn - guest frame number + * hva - host virtual address + * hpa - host physical address + * hfn - host frame number + */ + +typedef unsigned long gva_t; +typedef u64 gpa_t; +typedef u64 gfn_t; + +typedef unsigned long hva_t; +typedef u64 hpa_t; +typedef u64 hfn_t; + +typedef hfn_t pfn_t; + +union kvm_ioapic_redirect_entry { + u64 bits; + struct { + u8 vector; + u8 delivery_mode:3; + u8 dest_mode:1; + u8 delivery_status:1; + u8 polarity:1; + u8 remote_irr:1; + u8 trig_mode:1; + u8 mask:1; + u8 reserve:7; + u8 reserved[4]; + u8 dest_id; + } fields; +}; + +struct kvm_lapic_irq { + u32 vector; + u32 delivery_mode; + u32 dest_mode; + u32 level; + u32 trig_mode; + u32 shorthand; + u32 dest_id; +}; + +struct gfn_to_hva_cache { + u64 generation; + gpa_t gpa; + unsigned long hva; + struct kvm_memory_slot *memslot; +}; + +#endif /* __KVM_TYPES_H__ */ diff --git a/linux/include/trace/events/kvm.h b/linux/include/trace/events/kvm.h new file mode 100644 index 0000000..1c1c1f7 --- /dev/null +++ b/linux/include/trace/events/kvm.h @@ -0,0 +1,352 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +#if !defined(_TRACE_KVM_MAIN_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_MAIN_H + +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm + +#define ERSN(x) { KVM_EXIT_##x, "KVM_EXIT_" #x } + +#define kvm_trace_exit_reason \ + ERSN(UNKNOWN), ERSN(EXCEPTION), ERSN(IO), ERSN(HYPERCALL), \ + ERSN(DEBUG), ERSN(HLT), ERSN(MMIO), ERSN(IRQ_WINDOW_OPEN), \ + ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR), \ + ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\ + ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI) + +TRACE_EVENT(kvm_userspace_exit, + TP_PROTO(__u32 reason, int errno), + TP_ARGS(reason, errno), + + TP_STRUCT__entry( + __field( __u32, reason ) + __field( int, errno ) + ), + + TP_fast_assign( + __entry->reason = reason; + __entry->errno = errno; + ), + + TP_printk("reason %s (%d)", + __entry->errno < 0 ? + (__entry->errno == -EINTR ? "restart" : "error") : + __print_symbolic(__entry->reason, kvm_trace_exit_reason), + __entry->errno < 0 ? -__entry->errno : __entry->reason) +); + +#if defined(__KVM_HAVE_IOAPIC) +TRACE_EVENT(kvm_set_irq, + TP_PROTO(unsigned int gsi, int level, int irq_source_id), + TP_ARGS(gsi, level, irq_source_id), + + TP_STRUCT__entry( + __field( unsigned int, gsi ) + __field( int, level ) + __field( int, irq_source_id ) + ), + + TP_fast_assign( + __entry->gsi = gsi; + __entry->level = level; + __entry->irq_source_id = irq_source_id; + ), + + TP_printk("gsi %u level %d source %d", + __entry->gsi, __entry->level, __entry->irq_source_id) +); + +#define kvm_deliver_mode \ + {0x0, "Fixed"}, \ + {0x1, "LowPrio"}, \ + {0x2, "SMI"}, \ + {0x3, "Res3"}, \ + {0x4, "NMI"}, \ + {0x5, "INIT"}, \ + {0x6, "SIPI"}, \ + {0x7, "ExtINT"} + +TRACE_EVENT(kvm_ioapic_set_irq, + TP_PROTO(__u64 e, int pin, bool coalesced), + TP_ARGS(e, pin, coalesced), + + TP_STRUCT__entry( + __field( __u64, e ) + __field( int, pin ) + __field( bool, coalesced ) + ), + + TP_fast_assign( + __entry->e = e; + __entry->pin = pin; + __entry->coalesced = coalesced; + ), + + TP_printk("pin %u dst %x vec=%u (%s|%s|%s%s)%s", + __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e, + __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode), + (__entry->e & (1<<11)) ? "logical" : "physical", + (__entry->e & (1<<15)) ? "level" : "edge", + (__entry->e & (1<<16)) ? "|masked" : "", + __entry->coalesced ? " (coalesced)" : "") +); + +TRACE_EVENT(kvm_msi_set_irq, + TP_PROTO(__u64 address, __u64 data), + TP_ARGS(address, data), + + TP_STRUCT__entry( + __field( __u64, address ) + __field( __u64, data ) + ), + + TP_fast_assign( + __entry->address = address; + __entry->data = data; + ), + + TP_printk("dst %u vec %x (%s|%s|%s%s)", + (u8)(__entry->address >> 12), (u8)__entry->data, + __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode), + (__entry->address & (1<<2)) ? "logical" : "physical", + (__entry->data & (1<<15)) ? "level" : "edge", + (__entry->address & (1<<3)) ? "|rh" : "") +); + +#define kvm_irqchips \ + {KVM_IRQCHIP_PIC_MASTER, "PIC master"}, \ + {KVM_IRQCHIP_PIC_SLAVE, "PIC slave"}, \ + {KVM_IRQCHIP_IOAPIC, "IOAPIC"} + +TRACE_EVENT(kvm_ack_irq, + TP_PROTO(unsigned int irqchip, unsigned int pin), + TP_ARGS(irqchip, pin), + + TP_STRUCT__entry( + __field( unsigned int, irqchip ) + __field( unsigned int, pin ) + ), + + TP_fast_assign( + __entry->irqchip = irqchip; + __entry->pin = pin; + ), + + TP_printk("irqchip %s pin %u", + __print_symbolic(__entry->irqchip, kvm_irqchips), + __entry->pin) +); + + + +#endif /* defined(__KVM_HAVE_IOAPIC) */ + +#define KVM_TRACE_MMIO_READ_UNSATISFIED 0 +#define KVM_TRACE_MMIO_READ 1 +#define KVM_TRACE_MMIO_WRITE 2 + +#define kvm_trace_symbol_mmio \ + { KVM_TRACE_MMIO_READ_UNSATISFIED, "unsatisfied-read" }, \ + { KVM_TRACE_MMIO_READ, "read" }, \ + { KVM_TRACE_MMIO_WRITE, "write" } + +TRACE_EVENT(kvm_mmio, + TP_PROTO(int type, int len, u64 gpa, u64 val), + TP_ARGS(type, len, gpa, val), + + TP_STRUCT__entry( + __field( u32, type ) + __field( u32, len ) + __field( u64, gpa ) + __field( u64, val ) + ), + + TP_fast_assign( + __entry->type = type; + __entry->len = len; + __entry->gpa = gpa; + __entry->val = val; + ), + + TP_printk("mmio %s len %u gpa 0x%llx val 0x%llx", + __print_symbolic(__entry->type, kvm_trace_symbol_mmio), + __entry->len, __entry->gpa, __entry->val) +); + +#define kvm_fpu_load_symbol \ + {0, "unload"}, \ + {1, "load"} + +TRACE_EVENT(kvm_fpu, + TP_PROTO(int load), + TP_ARGS(load), + + TP_STRUCT__entry( + __field( u32, load ) + ), + + TP_fast_assign( + __entry->load = load; + ), + + TP_printk("%s", __print_symbolic(__entry->load, kvm_fpu_load_symbol)) +); + +TRACE_EVENT(kvm_age_page, + TP_PROTO(ulong hva, struct kvm_memory_slot *slot, int ref), + TP_ARGS(hva, slot, ref), + + TP_STRUCT__entry( + __field( u64, hva ) + __field( u64, gfn ) + __field( u8, referenced ) + ), + + TP_fast_assign( + __entry->hva = hva; + __entry->gfn = + slot->base_gfn + ((hva - slot->userspace_addr) >> PAGE_SHIFT); + __entry->referenced = ref; + ), + + TP_printk("hva %llx gfn %llx %s", + __entry->hva, __entry->gfn, + __entry->referenced ? "YOUNG" : "OLD") +); + +#ifdef CONFIG_KVM_ASYNC_PF +DECLARE_EVENT_CLASS(kvm_async_get_page_class, + + TP_PROTO(u64 gva, u64 gfn), + + TP_ARGS(gva, gfn), + + TP_STRUCT__entry( + __field(__u64, gva) + __field(u64, gfn) + ), + + TP_fast_assign( + __entry->gva = gva; + __entry->gfn = gfn; + ), + + TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn) +); + +DEFINE_EVENT(kvm_async_get_page_class, kvm_try_async_get_page, + + TP_PROTO(u64 gva, u64 gfn), + + TP_ARGS(gva, gfn) +); + +DEFINE_EVENT(kvm_async_get_page_class, kvm_async_pf_doublefault, + + TP_PROTO(u64 gva, u64 gfn), + + TP_ARGS(gva, gfn) +); + +DECLARE_EVENT_CLASS(kvm_async_pf_nopresent_ready, + + TP_PROTO(u64 token, u64 gva), + + TP_ARGS(token, gva), + + TP_STRUCT__entry( + __field(__u64, token) + __field(__u64, gva) + ), + + TP_fast_assign( + __entry->token = token; + __entry->gva = gva; + ), + + TP_printk("token %#llx gva %#llx", __entry->token, __entry->gva) + +); + +DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_not_present, + + TP_PROTO(u64 token, u64 gva), + + TP_ARGS(token, gva) +); + +DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_ready, + + TP_PROTO(u64 token, u64 gva), + + TP_ARGS(token, gva) +); + +TRACE_EVENT( + kvm_async_pf_completed, + TP_PROTO(unsigned long address, struct page *page, u64 gva), + TP_ARGS(address, page, gva), + + TP_STRUCT__entry( + __field(unsigned long, address) + __field(pfn_t, pfn) + __field(u64, gva) + ), + + TP_fast_assign( + __entry->address = address; + __entry->pfn = page ? page_to_pfn(page) : 0; + __entry->gva = gva; + ), + + TP_printk("gva %#llx address %#lx pfn %#llx", __entry->gva, + __entry->address, __entry->pfn) +); + +#endif + +#endif /* _TRACE_KVM_MAIN_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/linux/usr/include/asm-ia64/kvm.h b/linux/usr/include/asm-ia64/kvm.h new file mode 100644 index 0000000..bc90c75 --- /dev/null +++ b/linux/usr/include/asm-ia64/kvm.h @@ -0,0 +1,264 @@ +#ifndef __ASM_IA64_KVM_H +#define __ASM_IA64_KVM_H + +/* + * kvm structure definitions for ia64 + * + * Copyright (C) 2007 Xiantao Zhang <xiantao.zhang@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + */ + +#include <linux/types.h> +#include <linux/ioctl.h> + +/* Select x86 specific features in <linux/kvm.h> */ +#define __KVM_HAVE_IOAPIC +#define __KVM_HAVE_DEVICE_ASSIGNMENT + +/* Architectural interrupt line count. */ +#define KVM_NR_INTERRUPTS 256 + +#define KVM_IOAPIC_NUM_PINS 48 + +struct kvm_ioapic_state { + __u64 base_address; + __u32 ioregsel; + __u32 id; + __u32 irr; + __u32 pad; + union { + __u64 bits; + struct { + __u8 vector; + __u8 delivery_mode:3; + __u8 dest_mode:1; + __u8 delivery_status:1; + __u8 polarity:1; + __u8 remote_irr:1; + __u8 trig_mode:1; + __u8 mask:1; + __u8 reserve:7; + __u8 reserved[4]; + __u8 dest_id; + } fields; + } redirtbl[KVM_IOAPIC_NUM_PINS]; +}; + +#define KVM_IRQCHIP_PIC_MASTER 0 +#define KVM_IRQCHIP_PIC_SLAVE 1 +#define KVM_IRQCHIP_IOAPIC 2 +#define KVM_NR_IRQCHIPS 3 + +#define KVM_CONTEXT_SIZE 8*1024 + +struct kvm_fpreg { + union { + unsigned long bits[2]; + long double __dummy; /* force 16-byte alignment */ + } u; +}; + +union context { + /* 8K size */ + char dummy[KVM_CONTEXT_SIZE]; + struct { + unsigned long psr; + unsigned long pr; + unsigned long caller_unat; + unsigned long pad; + unsigned long gr[32]; + unsigned long ar[128]; + unsigned long br[8]; + unsigned long cr[128]; + unsigned long rr[8]; + unsigned long ibr[8]; + unsigned long dbr[8]; + unsigned long pkr[8]; + struct kvm_fpreg fr[128]; + }; +}; + +struct thash_data { + union { + struct { + unsigned long p : 1; /* 0 */ + unsigned long rv1 : 1; /* 1 */ + unsigned long ma : 3; /* 2-4 */ + unsigned long a : 1; /* 5 */ + unsigned long d : 1; /* 6 */ + unsigned long pl : 2; /* 7-8 */ + unsigned long ar : 3; /* 9-11 */ + unsigned long ppn : 38; /* 12-49 */ + unsigned long rv2 : 2; /* 50-51 */ + unsigned long ed : 1; /* 52 */ + unsigned long ig1 : 11; /* 53-63 */ + }; + struct { + unsigned long __rv1 : 53; /* 0-52 */ + unsigned long contiguous : 1; /*53 */ + unsigned long tc : 1; /* 54 TR or TC */ + unsigned long cl : 1; + /* 55 I side or D side cache line */ + unsigned long len : 4; /* 56-59 */ + unsigned long io : 1; /* 60 entry is for io or not */ + unsigned long nomap : 1; + /* 61 entry cann't be inserted into machine TLB.*/ + unsigned long checked : 1; + /* 62 for VTLB/VHPT sanity check */ + unsigned long invalid : 1; + /* 63 invalid entry */ + }; + unsigned long page_flags; + }; /* same for VHPT and TLB */ + + union { + struct { + unsigned long rv3 : 2; + unsigned long ps : 6; + unsigned long key : 24; + unsigned long rv4 : 32; + }; + unsigned long itir; + }; + union { + struct { + unsigned long ig2 : 12; + unsigned long vpn : 49; + unsigned long vrn : 3; + }; + unsigned long ifa; + unsigned long vadr; + struct { + unsigned long tag : 63; + unsigned long ti : 1; + }; + unsigned long etag; + }; + union { + struct thash_data *next; + unsigned long rid; + unsigned long gpaddr; + }; +}; + +#define NITRS 8 +#define NDTRS 8 + +struct saved_vpd { + unsigned long vhpi; + unsigned long vgr[16]; + unsigned long vbgr[16]; + unsigned long vnat; + unsigned long vbnat; + unsigned long vcpuid[5]; + unsigned long vpsr; + unsigned long vpr; + union { + unsigned long vcr[128]; + struct { + unsigned long dcr; + unsigned long itm; + unsigned long iva; + unsigned long rsv1[5]; + unsigned long pta; + unsigned long rsv2[7]; + unsigned long ipsr; + unsigned long isr; + unsigned long rsv3; + unsigned long iip; + unsigned long ifa; + unsigned long itir; + unsigned long iipa; + unsigned long ifs; + unsigned long iim; + unsigned long iha; + unsigned long rsv4[38]; + unsigned long lid; + unsigned long ivr; + unsigned long tpr; + unsigned long eoi; + unsigned long irr[4]; + unsigned long itv; + unsigned long pmv; + unsigned long cmcv; + unsigned long rsv5[5]; + unsigned long lrr0; + unsigned long lrr1; + unsigned long rsv6[46]; + }; + }; +}; + +struct kvm_regs { + struct saved_vpd vpd; + /*Arch-regs*/ + int mp_state; + unsigned long vmm_rr; + /* TR and TC. */ + struct thash_data itrs[NITRS]; + struct thash_data dtrs[NDTRS]; + /* Bit is set if there is a tr/tc for the region. */ + unsigned char itr_regions; + unsigned char dtr_regions; + unsigned char tc_regions; + + char irq_check; + unsigned long saved_itc; + unsigned long itc_check; + unsigned long timer_check; + unsigned long timer_pending; + unsigned long last_itc; + + unsigned long vrr[8]; + unsigned long ibr[8]; + unsigned long dbr[8]; + unsigned long insvc[4]; /* Interrupt in service. */ + unsigned long xtp; + + unsigned long metaphysical_rr0; /* from kvm_arch (so is pinned) */ + unsigned long metaphysical_rr4; /* from kvm_arch (so is pinned) */ + unsigned long metaphysical_saved_rr0; /* from kvm_arch */ + unsigned long metaphysical_saved_rr4; /* from kvm_arch */ + unsigned long fp_psr; /*used for lazy float register */ + unsigned long saved_gp; + /*for phycial emulation */ + + union context saved_guest; + + unsigned long reserved[64]; /* for future use */ +}; + +struct kvm_sregs { +}; + +struct kvm_fpu { +}; + +#define KVM_IA64_VCPU_STACK_SHIFT 16 +#define KVM_IA64_VCPU_STACK_SIZE (1UL << KVM_IA64_VCPU_STACK_SHIFT) + +struct kvm_ia64_vcpu_stack { + unsigned char stack[KVM_IA64_VCPU_STACK_SIZE]; +}; + +struct kvm_debug_exit_arch { +}; + +/* for KVM_SET_GUEST_DEBUG */ +struct kvm_guest_debug_arch { +}; + +#endif diff --git a/linux/usr/include/asm-ia64/kvm_para.h b/linux/usr/include/asm-ia64/kvm_para.h new file mode 100644 index 0000000..0310eb4 --- /dev/null +++ b/linux/usr/include/asm-ia64/kvm_para.h @@ -0,0 +1,23 @@ +#ifndef __IA64_KVM_PARA_H +#define __IA64_KVM_PARA_H + +/* + * Copyright (C) 2007 Xiantao Zhang <xiantao.zhang@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + */ + + +#endif diff --git a/linux/usr/include/asm-x86/hyperv.h b/linux/usr/include/asm-x86/hyperv.h new file mode 100644 index 0000000..5df477a --- /dev/null +++ b/linux/usr/include/asm-x86/hyperv.h @@ -0,0 +1,193 @@ +#ifndef _ASM_X86_HYPERV_H +#define _ASM_X86_HYPERV_H + +#include <linux/types.h> + +/* + * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent + * is set by CPUID(HvCpuIdFunctionVersionAndFeatures). + */ +#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000 +#define HYPERV_CPUID_INTERFACE 0x40000001 +#define HYPERV_CPUID_VERSION 0x40000002 +#define HYPERV_CPUID_FEATURES 0x40000003 +#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004 +#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 + +#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000 +#define HYPERV_CPUID_MIN 0x40000005 +#define HYPERV_CPUID_MAX 0x4000ffff + +/* + * Feature identification. EAX indicates which features are available + * to the partition based upon the current partition privileges. + */ + +/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */ +#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0) +/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ +#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) +/* + * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM + * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available + */ +#define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2) +/* + * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through + * HV_X64_MSR_STIMER3_COUNT) available + */ +#define HV_X64_MSR_SYNTIMER_AVAILABLE (1 << 3) +/* + * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR) + * are available + */ +#define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4) +/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/ +#define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5) +/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/ +#define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6) +/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/ +#define HV_X64_MSR_RESET_AVAILABLE (1 << 7) + /* + * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE, + * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE, + * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available + */ +#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8) + +/* + * Feature identification: EBX indicates which flags were specified at + * partition creation. The format is the same as the partition creation + * flag structure defined in section Partition Creation Flags. + */ +#define HV_X64_CREATE_PARTITIONS (1 << 0) +#define HV_X64_ACCESS_PARTITION_ID (1 << 1) +#define HV_X64_ACCESS_MEMORY_POOL (1 << 2) +#define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3) +#define HV_X64_POST_MESSAGES (1 << 4) +#define HV_X64_SIGNAL_EVENTS (1 << 5) +#define HV_X64_CREATE_PORT (1 << 6) +#define HV_X64_CONNECT_PORT (1 << 7) +#define HV_X64_ACCESS_STATS (1 << 8) +#define HV_X64_DEBUGGING (1 << 11) +#define HV_X64_CPU_POWER_MANAGEMENT (1 << 12) +#define HV_X64_CONFIGURE_PROFILER (1 << 13) + +/* + * Feature identification. EDX indicates which miscellaneous features + * are available to the partition. + */ +/* The MWAIT instruction is available (per section MONITOR / MWAIT) */ +#define HV_X64_MWAIT_AVAILABLE (1 << 0) +/* Guest debugging support is available */ +#define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1) +/* Performance Monitor support is available*/ +#define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2) +/* Support for physical CPU dynamic partitioning events is available*/ +#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3) +/* + * Support for passing hypercall input parameter block via XMM + * registers is available + */ +#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4) +/* Support for a virtual guest idle state is available */ +#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5) + +/* + * Implementation recommendations. Indicates which behaviors the hypervisor + * recommends the OS implement for optimal performance. + */ + /* + * Recommend using hypercall for address space switches rather + * than MOV to CR3 instruction + */ +#define HV_X64_MWAIT_RECOMMENDED (1 << 0) +/* Recommend using hypercall for local TLB flushes rather + * than INVLPG or MOV to CR3 instructions */ +#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1) +/* + * Recommend using hypercall for remote TLB flushes rather + * than inter-processor interrupts + */ +#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2) +/* + * Recommend using MSRs for accessing APIC registers + * EOI, ICR and TPR rather than their memory-mapped counterparts + */ +#define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3) +/* Recommend using the hypervisor-provided MSR to initiate a system RESET */ +#define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4) +/* + * Recommend using relaxed timing for this partition. If used, + * the VM should disable any watchdog timeouts that rely on the + * timely delivery of external interrupts + */ +#define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5) + +/* MSR used to identify the guest OS. */ +#define HV_X64_MSR_GUEST_OS_ID 0x40000000 + +/* MSR used to setup pages used to communicate with the hypervisor. */ +#define HV_X64_MSR_HYPERCALL 0x40000001 + +/* MSR used to provide vcpu index */ +#define HV_X64_MSR_VP_INDEX 0x40000002 + +/* MSR used to read the per-partition time reference counter */ +#define HV_X64_MSR_TIME_REF_COUNT 0x40000020 + +/* Define the virtual APIC registers */ +#define HV_X64_MSR_EOI 0x40000070 +#define HV_X64_MSR_ICR 0x40000071 +#define HV_X64_MSR_TPR 0x40000072 +#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073 + +/* Define synthetic interrupt controller model specific registers. */ +#define HV_X64_MSR_SCONTROL 0x40000080 +#define HV_X64_MSR_SVERSION 0x40000081 +#define HV_X64_MSR_SIEFP 0x40000082 +#define HV_X64_MSR_SIMP 0x40000083 +#define HV_X64_MSR_EOM 0x40000084 +#define HV_X64_MSR_SINT0 0x40000090 +#define HV_X64_MSR_SINT1 0x40000091 +#define HV_X64_MSR_SINT2 0x40000092 +#define HV_X64_MSR_SINT3 0x40000093 +#define HV_X64_MSR_SINT4 0x40000094 +#define HV_X64_MSR_SINT5 0x40000095 +#define HV_X64_MSR_SINT6 0x40000096 +#define HV_X64_MSR_SINT7 0x40000097 +#define HV_X64_MSR_SINT8 0x40000098 +#define HV_X64_MSR_SINT9 0x40000099 +#define HV_X64_MSR_SINT10 0x4000009A +#define HV_X64_MSR_SINT11 0x4000009B +#define HV_X64_MSR_SINT12 0x4000009C +#define HV_X64_MSR_SINT13 0x4000009D +#define HV_X64_MSR_SINT14 0x4000009E +#define HV_X64_MSR_SINT15 0x4000009F + + +#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 +#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 +#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ + (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) + +/* Declare the various hypercall operations. */ +#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT 0x0008 + +#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001 +#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12 +#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \ + (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) + +#define HV_PROCESSOR_POWER_STATE_C0 0 +#define HV_PROCESSOR_POWER_STATE_C1 1 +#define HV_PROCESSOR_POWER_STATE_C2 2 +#define HV_PROCESSOR_POWER_STATE_C3 3 + +/* hypercall status code */ +#define HV_STATUS_SUCCESS 0 +#define HV_STATUS_INVALID_HYPERCALL_CODE 2 +#define HV_STATUS_INVALID_HYPERCALL_INPUT 3 +#define HV_STATUS_INVALID_ALIGNMENT 4 + +#endif diff --git a/linux/usr/include/asm-x86/kvm.h b/linux/usr/include/asm-x86/kvm.h new file mode 100644 index 0000000..4d8dcbd --- /dev/null +++ b/linux/usr/include/asm-x86/kvm.h @@ -0,0 +1,324 @@ +#ifndef _ASM_X86_KVM_H +#define _ASM_X86_KVM_H + +/* + * KVM x86 specific structures and definitions + * + */ + +#include <linux/types.h> +#include <linux/ioctl.h> + +/* Select x86 specific features in <linux/kvm.h> */ +#define __KVM_HAVE_PIT +#define __KVM_HAVE_IOAPIC +#define __KVM_HAVE_DEVICE_ASSIGNMENT +#define __KVM_HAVE_MSI +#define __KVM_HAVE_USER_NMI +#define __KVM_HAVE_GUEST_DEBUG +#define __KVM_HAVE_MSIX +#define __KVM_HAVE_MCE +#define __KVM_HAVE_PIT_STATE2 +#define __KVM_HAVE_XEN_HVM +#define __KVM_HAVE_VCPU_EVENTS +#define __KVM_HAVE_DEBUGREGS +#define __KVM_HAVE_XSAVE +#define __KVM_HAVE_XCRS + +/* Architectural interrupt line count. */ +#define KVM_NR_INTERRUPTS 256 + +struct kvm_memory_alias { + __u32 slot; /* this has a different namespace than memory slots */ + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; + __u64 target_phys_addr; +}; + +/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */ +struct kvm_pic_state { + __u8 last_irr; /* edge detection */ + __u8 irr; /* interrupt request register */ + __u8 imr; /* interrupt mask register */ + __u8 isr; /* interrupt service register */ + __u8 priority_add; /* highest irq priority */ + __u8 irq_base; + __u8 read_reg_select; + __u8 poll; + __u8 special_mask; + __u8 init_state; + __u8 auto_eoi; + __u8 rotate_on_auto_eoi; + __u8 special_fully_nested_mode; + __u8 init4; /* true if 4 byte init */ + __u8 elcr; /* PIIX edge/trigger selection */ + __u8 elcr_mask; +}; + +#define KVM_IOAPIC_NUM_PINS 24 +struct kvm_ioapic_state { + __u64 base_address; + __u32 ioregsel; + __u32 id; + __u32 irr; + __u32 pad; + union { + __u64 bits; + struct { + __u8 vector; + __u8 delivery_mode:3; + __u8 dest_mode:1; + __u8 delivery_status:1; + __u8 polarity:1; + __u8 remote_irr:1; + __u8 trig_mode:1; + __u8 mask:1; + __u8 reserve:7; + __u8 reserved[4]; + __u8 dest_id; + } fields; + } redirtbl[KVM_IOAPIC_NUM_PINS]; +}; + +#define KVM_IRQCHIP_PIC_MASTER 0 +#define KVM_IRQCHIP_PIC_SLAVE 1 +#define KVM_IRQCHIP_IOAPIC 2 +#define KVM_NR_IRQCHIPS 3 + +/* for KVM_GET_REGS and KVM_SET_REGS */ +struct kvm_regs { + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 rax, rbx, rcx, rdx; + __u64 rsi, rdi, rsp, rbp; + __u64 r8, r9, r10, r11; + __u64 r12, r13, r14, r15; + __u64 rip, rflags; +}; + +/* for KVM_GET_LAPIC and KVM_SET_LAPIC */ +#define KVM_APIC_REG_SIZE 0x400 +struct kvm_lapic_state { + char regs[KVM_APIC_REG_SIZE]; +}; + +struct kvm_segment { + __u64 base; + __u32 limit; + __u16 selector; + __u8 type; + __u8 present, dpl, db, s, l, g, avl; + __u8 unusable; + __u8 padding; +}; + +struct kvm_dtable { + __u64 base; + __u16 limit; + __u16 padding[3]; +}; + + +/* for KVM_GET_SREGS and KVM_SET_SREGS */ +struct kvm_sregs { + /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */ + struct kvm_segment cs, ds, es, fs, gs, ss; + struct kvm_segment tr, ldt; + struct kvm_dtable gdt, idt; + __u64 cr0, cr2, cr3, cr4, cr8; + __u64 efer; + __u64 apic_base; + __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; +}; + +/* for KVM_GET_FPU and KVM_SET_FPU */ +struct kvm_fpu { + __u8 fpr[8][16]; + __u16 fcw; + __u16 fsw; + __u8 ftwx; /* in fxsave format */ + __u8 pad1; + __u16 last_opcode; + __u64 last_ip; + __u64 last_dp; + __u8 xmm[16][16]; + __u32 mxcsr; + __u32 pad2; +}; + +struct kvm_msr_entry { + __u32 index; + __u32 reserved; + __u64 data; +}; + +/* for KVM_GET_MSRS and KVM_SET_MSRS */ +struct kvm_msrs { + __u32 nmsrs; /* number of msrs in entries */ + __u32 pad; + + struct kvm_msr_entry entries[0]; +}; + +/* for KVM_GET_MSR_INDEX_LIST */ +struct kvm_msr_list { + __u32 nmsrs; /* number of msrs in entries */ + __u32 indices[0]; +}; + + +struct kvm_cpuid_entry { + __u32 function; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding; +}; + +/* for KVM_SET_CPUID */ +struct kvm_cpuid { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry entries[0]; +}; + +struct kvm_cpuid_entry2 { + __u32 function; + __u32 index; + __u32 flags; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding[3]; +}; + +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1 +#define KVM_CPUID_FLAG_STATEFUL_FUNC 2 +#define KVM_CPUID_FLAG_STATE_READ_NEXT 4 + +/* for KVM_SET_CPUID2 */ +struct kvm_cpuid2 { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry2 entries[0]; +}; + +/* for KVM_GET_PIT and KVM_SET_PIT */ +struct kvm_pit_channel_state { + __u32 count; /* can be 65536 */ + __u16 latched_count; + __u8 count_latched; + __u8 status_latched; + __u8 status; + __u8 read_state; + __u8 write_state; + __u8 write_latch; + __u8 rw_mode; + __u8 mode; + __u8 bcd; + __u8 gate; + __s64 count_load_time; +}; + +struct kvm_debug_exit_arch { + __u32 exception; + __u32 pad; + __u64 pc; + __u64 dr6; + __u64 dr7; +}; + +#define KVM_GUESTDBG_USE_SW_BP 0x00010000 +#define KVM_GUESTDBG_USE_HW_BP 0x00020000 +#define KVM_GUESTDBG_INJECT_DB 0x00040000 +#define KVM_GUESTDBG_INJECT_BP 0x00080000 + +/* for KVM_SET_GUEST_DEBUG */ +struct kvm_guest_debug_arch { + __u64 debugreg[8]; +}; + +struct kvm_pit_state { + struct kvm_pit_channel_state channels[3]; +}; + +#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 + +struct kvm_pit_state2 { + struct kvm_pit_channel_state channels[3]; + __u32 flags; + __u32 reserved[9]; +}; + +struct kvm_reinject_control { + __u8 pit_reinject; + __u8 reserved[31]; +}; + +/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */ +#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001 +#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 +#define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 + +/* Interrupt shadow states */ +#define KVM_X86_SHADOW_INT_MOV_SS 0x01 +#define KVM_X86_SHADOW_INT_STI 0x02 + +/* for KVM_GET/SET_VCPU_EVENTS */ +struct kvm_vcpu_events { + struct { + __u8 injected; + __u8 nr; + __u8 has_error_code; + __u8 pad; + __u32 error_code; + } exception; + struct { + __u8 injected; + __u8 nr; + __u8 soft; + __u8 shadow; + } interrupt; + struct { + __u8 injected; + __u8 pending; + __u8 masked; + __u8 pad; + } nmi; + __u32 sipi_vector; + __u32 flags; + __u32 reserved[10]; +}; + +/* for KVM_GET/SET_DEBUGREGS */ +struct kvm_debugregs { + __u64 db[4]; + __u64 dr6; + __u64 dr7; + __u64 flags; + __u64 reserved[9]; +}; + +/* for KVM_CAP_XSAVE */ +struct kvm_xsave { + __u32 region[1024]; +}; + +#define KVM_MAX_XCRS 16 + +struct kvm_xcr { + __u32 xcr; + __u32 reserved; + __u64 value; +}; + +struct kvm_xcrs { + __u32 nr_xcrs; + __u32 flags; + struct kvm_xcr xcrs[KVM_MAX_XCRS]; + __u64 padding[16]; +}; + +#endif /* _ASM_X86_KVM_H */ diff --git a/linux/usr/include/asm-x86/kvm_para.h b/linux/usr/include/asm-x86/kvm_para.h new file mode 100644 index 0000000..834d71e --- /dev/null +++ b/linux/usr/include/asm-x86/kvm_para.h @@ -0,0 +1,79 @@ +#ifndef _ASM_X86_KVM_PARA_H +#define _ASM_X86_KVM_PARA_H + +#include <linux/types.h> +#include <asm/hyperv.h> + +/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It + * should be used to determine that a VM is running under KVM. + */ +#define KVM_CPUID_SIGNATURE 0x40000000 + +/* This CPUID returns a feature bitmap in eax. Before enabling a particular + * paravirtualization, the appropriate feature bit should be checked. + */ +#define KVM_CPUID_FEATURES 0x40000001 +#define KVM_FEATURE_CLOCKSOURCE 0 +#define KVM_FEATURE_NOP_IO_DELAY 1 +#define KVM_FEATURE_MMU_OP 2 +/* This indicates that the new set of kvmclock msrs + * are available. The use of 0x11 and 0x12 is deprecated + */ +#define KVM_FEATURE_CLOCKSOURCE2 3 +#define KVM_FEATURE_ASYNC_PF 4 + +/* The last 8 bits are used to indicate how to interpret the flags field + * in pvclock structure. If no bits are set, all flags are ignored. + */ +#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24 + +#define MSR_KVM_WALL_CLOCK 0x11 +#define MSR_KVM_SYSTEM_TIME 0x12 + +/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ +#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 +#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 +#define MSR_KVM_ASYNC_PF_EN 0x4b564d02 + +#define KVM_MAX_MMU_OP_BATCH 32 + +#define KVM_ASYNC_PF_ENABLED (1 << 0) +#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1) + +/* Operations for KVM_HC_MMU_OP */ +#define KVM_MMU_OP_WRITE_PTE 1 +#define KVM_MMU_OP_FLUSH_TLB 2 +#define KVM_MMU_OP_RELEASE_PT 3 + +/* Payload for KVM_HC_MMU_OP */ +struct kvm_mmu_op_header { + __u32 op; + __u32 pad; +}; + +struct kvm_mmu_op_write_pte { + struct kvm_mmu_op_header header; + __u64 pte_phys; + __u64 pte_val; +}; + +struct kvm_mmu_op_flush_tlb { + struct kvm_mmu_op_header header; +}; + +struct kvm_mmu_op_release_pt { + struct kvm_mmu_op_header header; + __u64 pt_phys; +}; + +#define KVM_PV_REASON_PAGE_NOT_PRESENT 1 +#define KVM_PV_REASON_PAGE_READY 2 + +struct kvm_vcpu_pv_apf_data { + __u32 reason; + __u8 pad[60]; + __u32 enabled; +}; + + +#endif /* _ASM_X86_KVM_PARA_H */ diff --git a/linux/usr/include/linux/kvm.h b/linux/usr/include/linux/kvm.h new file mode 100644 index 0000000..27efcfb --- /dev/null +++ b/linux/usr/include/linux/kvm.h @@ -0,0 +1,798 @@ +#ifndef __LINUX_KVM_H +#define __LINUX_KVM_H + +/* + * Userspace interface for /dev/kvm - kernel based virtual machine + * + * Note: you must update KVM_API_VERSION if you change this interface. + */ + +#include <linux/types.h> + +#include <linux/ioctl.h> +#include <asm/kvm.h> + +#define KVM_API_VERSION 12 + +/* *** Deprecated interfaces *** */ + +#define KVM_TRC_SHIFT 16 + +#define KVM_TRC_ENTRYEXIT (1 << KVM_TRC_SHIFT) +#define KVM_TRC_HANDLER (1 << (KVM_TRC_SHIFT + 1)) + +#define KVM_TRC_VMENTRY (KVM_TRC_ENTRYEXIT + 0x01) +#define KVM_TRC_VMEXIT (KVM_TRC_ENTRYEXIT + 0x02) +#define KVM_TRC_PAGE_FAULT (KVM_TRC_HANDLER + 0x01) + +#define KVM_TRC_HEAD_SIZE 12 +#define KVM_TRC_CYCLE_SIZE 8 +#define KVM_TRC_EXTRA_MAX 7 + +#define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02) +#define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03) +#define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04) +#define KVM_TRC_IO_READ (KVM_TRC_HANDLER + 0x05) +#define KVM_TRC_IO_WRITE (KVM_TRC_HANDLER + 0x06) +#define KVM_TRC_CR_READ (KVM_TRC_HANDLER + 0x07) +#define KVM_TRC_CR_WRITE (KVM_TRC_HANDLER + 0x08) +#define KVM_TRC_DR_READ (KVM_TRC_HANDLER + 0x09) +#define KVM_TRC_DR_WRITE (KVM_TRC_HANDLER + 0x0A) +#define KVM_TRC_MSR_READ (KVM_TRC_HANDLER + 0x0B) +#define KVM_TRC_MSR_WRITE (KVM_TRC_HANDLER + 0x0C) +#define KVM_TRC_CPUID (KVM_TRC_HANDLER + 0x0D) +#define KVM_TRC_INTR (KVM_TRC_HANDLER + 0x0E) +#define KVM_TRC_NMI (KVM_TRC_HANDLER + 0x0F) +#define KVM_TRC_VMMCALL (KVM_TRC_HANDLER + 0x10) +#define KVM_TRC_HLT (KVM_TRC_HANDLER + 0x11) +#define KVM_TRC_CLTS (KVM_TRC_HANDLER + 0x12) +#define KVM_TRC_LMSW (KVM_TRC_HANDLER + 0x13) +#define KVM_TRC_APIC_ACCESS (KVM_TRC_HANDLER + 0x14) +#define KVM_TRC_TDP_FAULT (KVM_TRC_HANDLER + 0x15) +#define KVM_TRC_GTLB_WRITE (KVM_TRC_HANDLER + 0x16) +#define KVM_TRC_STLB_WRITE (KVM_TRC_HANDLER + 0x17) +#define KVM_TRC_STLB_INVAL (KVM_TRC_HANDLER + 0x18) +#define KVM_TRC_PPC_INSTR (KVM_TRC_HANDLER + 0x19) + +struct kvm_user_trace_setup { + __u32 buf_size; + __u32 buf_nr; +}; + +#define __KVM_DEPRECATED_MAIN_W_0x06 \ + _IOW(KVMIO, 0x06, struct kvm_user_trace_setup) +#define __KVM_DEPRECATED_MAIN_0x07 _IO(KVMIO, 0x07) +#define __KVM_DEPRECATED_MAIN_0x08 _IO(KVMIO, 0x08) + +#define __KVM_DEPRECATED_VM_R_0x70 _IOR(KVMIO, 0x70, struct kvm_assigned_irq) + +struct kvm_breakpoint { + __u32 enabled; + __u32 padding; + __u64 address; +}; + +struct kvm_debug_guest { + __u32 enabled; + __u32 pad; + struct kvm_breakpoint breakpoints[4]; + __u32 singlestep; +}; + +#define __KVM_DEPRECATED_VCPU_W_0x87 _IOW(KVMIO, 0x87, struct kvm_debug_guest) + +/* *** End of deprecated interfaces *** */ + + +/* for KVM_CREATE_MEMORY_REGION */ +struct kvm_memory_region { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ +}; + +/* for KVM_SET_USER_MEMORY_REGION */ +struct kvm_userspace_memory_region { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ + __u64 userspace_addr; /* start of the userspace allocated memory */ +}; + +/* for kvm_memory_region::flags */ +#define KVM_MEM_LOG_DIRTY_PAGES 1UL +#define KVM_MEMSLOT_INVALID (1UL << 1) + +/* for KVM_IRQ_LINE */ +struct kvm_irq_level { + /* + * ACPI gsi notion of irq. + * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47.. + * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23.. + */ + union { + __u32 irq; + __s32 status; + }; + __u32 level; +}; + + +struct kvm_irqchip { + __u32 chip_id; + __u32 pad; + union { + char dummy[512]; /* reserving space */ +#ifdef __KVM_HAVE_PIT + struct kvm_pic_state pic; +#endif +#ifdef __KVM_HAVE_IOAPIC + struct kvm_ioapic_state ioapic; +#endif + } chip; +}; + +/* for KVM_CREATE_PIT2 */ +struct kvm_pit_config { + __u32 flags; + __u32 pad[15]; +}; + +#define KVM_PIT_SPEAKER_DUMMY 1 + +#define KVM_EXIT_UNKNOWN 0 +#define KVM_EXIT_EXCEPTION 1 +#define KVM_EXIT_IO 2 +#define KVM_EXIT_HYPERCALL 3 +#define KVM_EXIT_DEBUG 4 +#define KVM_EXIT_HLT 5 +#define KVM_EXIT_MMIO 6 +#define KVM_EXIT_IRQ_WINDOW_OPEN 7 +#define KVM_EXIT_SHUTDOWN 8 +#define KVM_EXIT_FAIL_ENTRY 9 +#define KVM_EXIT_INTR 10 +#define KVM_EXIT_SET_TPR 11 +#define KVM_EXIT_TPR_ACCESS 12 +#define KVM_EXIT_S390_SIEIC 13 +#define KVM_EXIT_S390_RESET 14 +#define KVM_EXIT_DCR 15 +#define KVM_EXIT_NMI 16 +#define KVM_EXIT_INTERNAL_ERROR 17 +#define KVM_EXIT_OSI 18 + +/* For KVM_EXIT_INTERNAL_ERROR */ +#define KVM_INTERNAL_ERROR_EMULATION 1 +#define KVM_INTERNAL_ERROR_SIMUL_EX 2 + +/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ +struct kvm_run { + /* in */ + __u8 request_interrupt_window; + __u8 padding1[7]; + + /* out */ + __u32 exit_reason; + __u8 ready_for_interrupt_injection; + __u8 if_flag; + __u8 padding2[2]; + + /* in (pre_kvm_run), out (post_kvm_run) */ + __u64 cr8; + __u64 apic_base; + +#ifdef __KVM_S390 + /* the processor status word for s390 */ + __u64 psw_mask; /* psw upper half */ + __u64 psw_addr; /* psw lower half */ +#endif + union { + /* KVM_EXIT_UNKNOWN */ + struct { + __u64 hardware_exit_reason; + } hw; + /* KVM_EXIT_FAIL_ENTRY */ + struct { + __u64 hardware_entry_failure_reason; + } fail_entry; + /* KVM_EXIT_EXCEPTION */ + struct { + __u32 exception; + __u32 error_code; + } ex; + /* KVM_EXIT_IO */ + struct { +#define KVM_EXIT_IO_IN 0 +#define KVM_EXIT_IO_OUT 1 + __u8 direction; + __u8 size; /* bytes */ + __u16 port; + __u32 count; + __u64 data_offset; /* relative to kvm_run start */ + } io; + struct { + struct kvm_debug_exit_arch arch; + } debug; + /* KVM_EXIT_MMIO */ + struct { + __u64 phys_addr; + __u8 data[8]; + __u32 len; + __u8 is_write; + } mmio; + /* KVM_EXIT_HYPERCALL */ + struct { + __u64 nr; + __u64 args[6]; + __u64 ret; + __u32 longmode; + __u32 pad; + } hypercall; + /* KVM_EXIT_TPR_ACCESS */ + struct { + __u64 rip; + __u32 is_write; + __u32 pad; + } tpr_access; + /* KVM_EXIT_S390_SIEIC */ + struct { + __u8 icptcode; + __u16 ipa; + __u32 ipb; + } s390_sieic; + /* KVM_EXIT_S390_RESET */ +#define KVM_S390_RESET_POR 1 +#define KVM_S390_RESET_CLEAR 2 +#define KVM_S390_RESET_SUBSYSTEM 4 +#define KVM_S390_RESET_CPU_INIT 8 +#define KVM_S390_RESET_IPL 16 + __u64 s390_reset_flags; + /* KVM_EXIT_DCR */ + struct { + __u32 dcrn; + __u32 data; + __u8 is_write; + } dcr; + struct { + __u32 suberror; + /* Available with KVM_CAP_INTERNAL_ERROR_DATA: */ + __u32 ndata; + __u64 data[16]; + } internal; + /* KVM_EXIT_OSI */ + struct { + __u64 gprs[32]; + } osi; + /* Fix the size of the union. */ + char padding[256]; + }; +}; + +/* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */ + +struct kvm_coalesced_mmio_zone { + __u64 addr; + __u32 size; + __u32 pad; +}; + +struct kvm_coalesced_mmio { + __u64 phys_addr; + __u32 len; + __u32 pad; + __u8 data[8]; +}; + +struct kvm_coalesced_mmio_ring { + __u32 first, last; + struct kvm_coalesced_mmio coalesced_mmio[0]; +}; + +#define KVM_COALESCED_MMIO_MAX \ + ((PAGE_SIZE - sizeof(struct kvm_coalesced_mmio_ring)) / \ + sizeof(struct kvm_coalesced_mmio)) + +/* for KVM_TRANSLATE */ +struct kvm_translation { + /* in */ + __u64 linear_address; + + /* out */ + __u64 physical_address; + __u8 valid; + __u8 writeable; + __u8 usermode; + __u8 pad[5]; +}; + +/* for KVM_INTERRUPT */ +struct kvm_interrupt { + /* in */ + __u32 irq; +}; + +/* for KVM_GET_DIRTY_LOG */ +struct kvm_dirty_log { + __u32 slot; + __u32 padding1; + union { + void *dirty_bitmap; /* one bit per page */ + __u64 padding2; + }; +}; + +/* for KVM_SET_SIGNAL_MASK */ +struct kvm_signal_mask { + __u32 len; + __u8 sigset[0]; +}; + +/* for KVM_TPR_ACCESS_REPORTING */ +struct kvm_tpr_access_ctl { + __u32 enabled; + __u32 flags; + __u32 reserved[8]; +}; + +/* for KVM_SET_VAPIC_ADDR */ +struct kvm_vapic_addr { + __u64 vapic_addr; +}; + +/* for KVM_SET_MPSTATE */ + +#define KVM_MP_STATE_RUNNABLE 0 +#define KVM_MP_STATE_UNINITIALIZED 1 +#define KVM_MP_STATE_INIT_RECEIVED 2 +#define KVM_MP_STATE_HALTED 3 +#define KVM_MP_STATE_SIPI_RECEIVED 4 + +struct kvm_mp_state { + __u32 mp_state; +}; + +struct kvm_s390_psw { + __u64 mask; + __u64 addr; +}; + +/* valid values for type in kvm_s390_interrupt */ +#define KVM_S390_SIGP_STOP 0xfffe0000u +#define KVM_S390_PROGRAM_INT 0xfffe0001u +#define KVM_S390_SIGP_SET_PREFIX 0xfffe0002u +#define KVM_S390_RESTART 0xfffe0003u +#define KVM_S390_INT_VIRTIO 0xffff2603u +#define KVM_S390_INT_SERVICE 0xffff2401u +#define KVM_S390_INT_EMERGENCY 0xffff1201u + +struct kvm_s390_interrupt { + __u32 type; + __u32 parm; + __u64 parm64; +}; + +/* for KVM_SET_GUEST_DEBUG */ + +#define KVM_GUESTDBG_ENABLE 0x00000001 +#define KVM_GUESTDBG_SINGLESTEP 0x00000002 + +struct kvm_guest_debug { + __u32 control; + __u32 pad; + struct kvm_guest_debug_arch arch; +}; + +enum { + kvm_ioeventfd_flag_nr_datamatch, + kvm_ioeventfd_flag_nr_pio, + kvm_ioeventfd_flag_nr_deassign, + kvm_ioeventfd_flag_nr_max, +}; + +#define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch) +#define KVM_IOEVENTFD_FLAG_PIO (1 << kvm_ioeventfd_flag_nr_pio) +#define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign) + +#define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1) + +struct kvm_ioeventfd { + __u64 datamatch; + __u64 addr; /* legal pio/mmio address */ + __u32 len; /* 1, 2, 4, or 8 bytes */ + __s32 fd; + __u32 flags; + __u8 pad[36]; +}; + +/* for KVM_ENABLE_CAP */ +struct kvm_enable_cap { + /* in */ + __u32 cap; + __u32 flags; + __u64 args[4]; + __u8 pad[64]; +}; + +/* for KVM_PPC_GET_PVINFO */ +struct kvm_ppc_pvinfo { + /* out */ + __u32 flags; + __u32 hcall[4]; + __u8 pad[108]; +}; + +#define KVMIO 0xAE + +/* + * ioctls for /dev/kvm fds: + */ +#define KVM_GET_API_VERSION _IO(KVMIO, 0x00) +#define KVM_CREATE_VM _IO(KVMIO, 0x01) /* returns a VM fd */ +#define KVM_GET_MSR_INDEX_LIST _IOWR(KVMIO, 0x02, struct kvm_msr_list) + +#define KVM_S390_ENABLE_SIE _IO(KVMIO, 0x06) +/* + * Check if a kvm extension is available. Argument is extension number, + * return is 1 (yes) or 0 (no, sorry). + */ +#define KVM_CHECK_EXTENSION _IO(KVMIO, 0x03) +/* + * Get size for mmap(vcpu_fd) + */ +#define KVM_GET_VCPU_MMAP_SIZE _IO(KVMIO, 0x04) /* in bytes */ +#define KVM_GET_SUPPORTED_CPUID _IOWR(KVMIO, 0x05, struct kvm_cpuid2) +#define KVM_TRACE_ENABLE __KVM_DEPRECATED_MAIN_W_0x06 +#define KVM_TRACE_PAUSE __KVM_DEPRECATED_MAIN_0x07 +#define KVM_TRACE_DISABLE __KVM_DEPRECATED_MAIN_0x08 + +/* + * Extension capability list. + */ +#define KVM_CAP_IRQCHIP 0 +#define KVM_CAP_HLT 1 +#define KVM_CAP_MMU_SHADOW_CACHE_CONTROL 2 +#define KVM_CAP_USER_MEMORY 3 +#define KVM_CAP_SET_TSS_ADDR 4 +#define KVM_CAP_VAPIC 6 +#define KVM_CAP_EXT_CPUID 7 +#define KVM_CAP_CLOCKSOURCE 8 +#define KVM_CAP_NR_VCPUS 9 /* returns max vcpus per vm */ +#define KVM_CAP_NR_MEMSLOTS 10 /* returns max memory slots per vm */ +#define KVM_CAP_PIT 11 +#define KVM_CAP_NOP_IO_DELAY 12 +#define KVM_CAP_PV_MMU 13 +#define KVM_CAP_MP_STATE 14 +#define KVM_CAP_COALESCED_MMIO 15 +#define KVM_CAP_SYNC_MMU 16 /* Changes to host mmap are reflected in guest */ +#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT +#define KVM_CAP_DEVICE_ASSIGNMENT 17 +#endif +#define KVM_CAP_IOMMU 18 +#ifdef __KVM_HAVE_MSI +#define KVM_CAP_DEVICE_MSI 20 +#endif +/* Bug in KVM_SET_USER_MEMORY_REGION fixed: */ +#define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21 +#ifdef __KVM_HAVE_USER_NMI +#define KVM_CAP_USER_NMI 22 +#endif +#ifdef __KVM_HAVE_GUEST_DEBUG +#define KVM_CAP_SET_GUEST_DEBUG 23 +#endif +#ifdef __KVM_HAVE_PIT +#define KVM_CAP_REINJECT_CONTROL 24 +#endif +#ifdef __KVM_HAVE_IOAPIC +#define KVM_CAP_IRQ_ROUTING 25 +#endif +#define KVM_CAP_IRQ_INJECT_STATUS 26 +#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT +#define KVM_CAP_DEVICE_DEASSIGNMENT 27 +#endif +#ifdef __KVM_HAVE_MSIX +#define KVM_CAP_DEVICE_MSIX 28 +#endif +#define KVM_CAP_ASSIGN_DEV_IRQ 29 +/* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */ +#define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30 +#ifdef __KVM_HAVE_MCE +#define KVM_CAP_MCE 31 +#endif +#define KVM_CAP_IRQFD 32 +#ifdef __KVM_HAVE_PIT +#define KVM_CAP_PIT2 33 +#endif +#define KVM_CAP_SET_BOOT_CPU_ID 34 +#ifdef __KVM_HAVE_PIT_STATE2 +#define KVM_CAP_PIT_STATE2 35 +#endif +#define KVM_CAP_IOEVENTFD 36 +#define KVM_CAP_SET_IDENTITY_MAP_ADDR 37 +#ifdef __KVM_HAVE_XEN_HVM +#define KVM_CAP_XEN_HVM 38 +#endif +#define KVM_CAP_ADJUST_CLOCK 39 +#define KVM_CAP_INTERNAL_ERROR_DATA 40 +#ifdef __KVM_HAVE_VCPU_EVENTS +#define KVM_CAP_VCPU_EVENTS 41 +#endif +#define KVM_CAP_S390_PSW 42 +#define KVM_CAP_PPC_SEGSTATE 43 +#define KVM_CAP_HYPERV 44 +#define KVM_CAP_HYPERV_VAPIC 45 +#define KVM_CAP_HYPERV_SPIN 46 +#define KVM_CAP_PCI_SEGMENT 47 +#define KVM_CAP_PPC_PAIRED_SINGLES 48 +#define KVM_CAP_INTR_SHADOW 49 +#ifdef __KVM_HAVE_DEBUGREGS +#define KVM_CAP_DEBUGREGS 50 +#endif +#define KVM_CAP_X86_ROBUST_SINGLESTEP 51 +#define KVM_CAP_PPC_OSI 52 +#define KVM_CAP_PPC_UNSET_IRQ 53 +#define KVM_CAP_ENABLE_CAP 54 +#ifdef __KVM_HAVE_XSAVE +#define KVM_CAP_XSAVE 55 +#endif +#ifdef __KVM_HAVE_XCRS +#define KVM_CAP_XCRS 56 +#endif +#define KVM_CAP_PPC_GET_PVINFO 57 +#define KVM_CAP_PPC_IRQ_LEVEL 58 +#define KVM_CAP_ASYNC_PF 59 + +#ifdef KVM_CAP_IRQ_ROUTING + +struct kvm_irq_routing_irqchip { + __u32 irqchip; + __u32 pin; +}; + +struct kvm_irq_routing_msi { + __u32 address_lo; + __u32 address_hi; + __u32 data; + __u32 pad; +}; + +/* gsi routing entry types */ +#define KVM_IRQ_ROUTING_IRQCHIP 1 +#define KVM_IRQ_ROUTING_MSI 2 + +struct kvm_irq_routing_entry { + __u32 gsi; + __u32 type; + __u32 flags; + __u32 pad; + union { + struct kvm_irq_routing_irqchip irqchip; + struct kvm_irq_routing_msi msi; + __u32 pad[8]; + } u; +}; + +struct kvm_irq_routing { + __u32 nr; + __u32 flags; + struct kvm_irq_routing_entry entries[0]; +}; + +#endif + +#ifdef KVM_CAP_MCE +/* x86 MCE */ +struct kvm_x86_mce { + __u64 status; + __u64 addr; + __u64 misc; + __u64 mcg_status; + __u8 bank; + __u8 pad1[7]; + __u64 pad2[3]; +}; +#endif + +#ifdef KVM_CAP_XEN_HVM +struct kvm_xen_hvm_config { + __u32 flags; + __u32 msr; + __u64 blob_addr_32; + __u64 blob_addr_64; + __u8 blob_size_32; + __u8 blob_size_64; + __u8 pad2[30]; +}; +#endif + +#define KVM_IRQFD_FLAG_DEASSIGN (1 << 0) + +struct kvm_irqfd { + __u32 fd; + __u32 gsi; + __u32 flags; + __u8 pad[20]; +}; + +struct kvm_clock_data { + __u64 clock; + __u32 flags; + __u32 pad[9]; +}; + +/* + * ioctls for VM fds + */ +#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) +/* + * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns + * a vcpu fd. + */ +#define KVM_CREATE_VCPU _IO(KVMIO, 0x41) +#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) +/* KVM_SET_MEMORY_ALIAS is obsolete: */ +#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) +#define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) +#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) +#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46, \ + struct kvm_userspace_memory_region) +#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) +#define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64) +/* Device model IOC */ +#define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) +#define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level) +#define KVM_GET_IRQCHIP _IOWR(KVMIO, 0x62, struct kvm_irqchip) +#define KVM_SET_IRQCHIP _IOR(KVMIO, 0x63, struct kvm_irqchip) +#define KVM_CREATE_PIT _IO(KVMIO, 0x64) +#define KVM_GET_PIT _IOWR(KVMIO, 0x65, struct kvm_pit_state) +#define KVM_SET_PIT _IOR(KVMIO, 0x66, struct kvm_pit_state) +#define KVM_IRQ_LINE_STATUS _IOWR(KVMIO, 0x67, struct kvm_irq_level) +#define KVM_REGISTER_COALESCED_MMIO \ + _IOW(KVMIO, 0x67, struct kvm_coalesced_mmio_zone) +#define KVM_UNREGISTER_COALESCED_MMIO \ + _IOW(KVMIO, 0x68, struct kvm_coalesced_mmio_zone) +#define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \ + struct kvm_assigned_pci_dev) +#define KVM_SET_GSI_ROUTING _IOW(KVMIO, 0x6a, struct kvm_irq_routing) +/* deprecated, replaced by KVM_ASSIGN_DEV_IRQ */ +#define KVM_ASSIGN_IRQ __KVM_DEPRECATED_VM_R_0x70 +#define KVM_ASSIGN_DEV_IRQ _IOW(KVMIO, 0x70, struct kvm_assigned_irq) +#define KVM_REINJECT_CONTROL _IO(KVMIO, 0x71) +#define KVM_DEASSIGN_PCI_DEVICE _IOW(KVMIO, 0x72, \ + struct kvm_assigned_pci_dev) +#define KVM_ASSIGN_SET_MSIX_NR _IOW(KVMIO, 0x73, \ + struct kvm_assigned_msix_nr) +#define KVM_ASSIGN_SET_MSIX_ENTRY _IOW(KVMIO, 0x74, \ + struct kvm_assigned_msix_entry) +#define KVM_DEASSIGN_DEV_IRQ _IOW(KVMIO, 0x75, struct kvm_assigned_irq) +#define KVM_IRQFD _IOW(KVMIO, 0x76, struct kvm_irqfd) +#define KVM_CREATE_PIT2 _IOW(KVMIO, 0x77, struct kvm_pit_config) +#define KVM_SET_BOOT_CPU_ID _IO(KVMIO, 0x78) +#define KVM_IOEVENTFD _IOW(KVMIO, 0x79, struct kvm_ioeventfd) +#define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config) +#define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data) +#define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data) +/* Available with KVM_CAP_PIT_STATE2 */ +#define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) +#define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) +/* Available with KVM_CAP_PPC_GET_PVINFO */ +#define KVM_PPC_GET_PVINFO _IOW(KVMIO, 0xa1, struct kvm_ppc_pvinfo) + +/* + * ioctls for vcpu fds + */ +#define KVM_RUN _IO(KVMIO, 0x80) +#define KVM_GET_REGS _IOR(KVMIO, 0x81, struct kvm_regs) +#define KVM_SET_REGS _IOW(KVMIO, 0x82, struct kvm_regs) +#define KVM_GET_SREGS _IOR(KVMIO, 0x83, struct kvm_sregs) +#define KVM_SET_SREGS _IOW(KVMIO, 0x84, struct kvm_sregs) +#define KVM_TRANSLATE _IOWR(KVMIO, 0x85, struct kvm_translation) +#define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt) +/* KVM_DEBUG_GUEST is no longer supported, use KVM_SET_GUEST_DEBUG instead */ +#define KVM_DEBUG_GUEST __KVM_DEPRECATED_VCPU_W_0x87 +#define KVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs) +#define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs) +#define KVM_SET_CPUID _IOW(KVMIO, 0x8a, struct kvm_cpuid) +#define KVM_SET_SIGNAL_MASK _IOW(KVMIO, 0x8b, struct kvm_signal_mask) +#define KVM_GET_FPU _IOR(KVMIO, 0x8c, struct kvm_fpu) +#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu) +#define KVM_GET_LAPIC _IOR(KVMIO, 0x8e, struct kvm_lapic_state) +#define KVM_SET_LAPIC _IOW(KVMIO, 0x8f, struct kvm_lapic_state) +#define KVM_SET_CPUID2 _IOW(KVMIO, 0x90, struct kvm_cpuid2) +#define KVM_GET_CPUID2 _IOWR(KVMIO, 0x91, struct kvm_cpuid2) +/* Available with KVM_CAP_VAPIC */ +#define KVM_TPR_ACCESS_REPORTING _IOWR(KVMIO, 0x92, struct kvm_tpr_access_ctl) +/* Available with KVM_CAP_VAPIC */ +#define KVM_SET_VAPIC_ADDR _IOW(KVMIO, 0x93, struct kvm_vapic_addr) +/* valid for virtual machine (for floating interrupt)_and_ vcpu */ +#define KVM_S390_INTERRUPT _IOW(KVMIO, 0x94, struct kvm_s390_interrupt) +/* store status for s390 */ +#define KVM_S390_STORE_STATUS_NOADDR (-1ul) +#define KVM_S390_STORE_STATUS_PREFIXED (-2ul) +#define KVM_S390_STORE_STATUS _IOW(KVMIO, 0x95, unsigned long) +/* initial ipl psw for s390 */ +#define KVM_S390_SET_INITIAL_PSW _IOW(KVMIO, 0x96, struct kvm_s390_psw) +/* initial reset for s390 */ +#define KVM_S390_INITIAL_RESET _IO(KVMIO, 0x97) +#define KVM_GET_MP_STATE _IOR(KVMIO, 0x98, struct kvm_mp_state) +#define KVM_SET_MP_STATE _IOW(KVMIO, 0x99, struct kvm_mp_state) +/* Available with KVM_CAP_NMI */ +#define KVM_NMI _IO(KVMIO, 0x9a) +/* Available with KVM_CAP_SET_GUEST_DEBUG */ +#define KVM_SET_GUEST_DEBUG _IOW(KVMIO, 0x9b, struct kvm_guest_debug) +/* MCE for x86 */ +#define KVM_X86_SETUP_MCE _IOW(KVMIO, 0x9c, __u64) +#define KVM_X86_GET_MCE_CAP_SUPPORTED _IOR(KVMIO, 0x9d, __u64) +#define KVM_X86_SET_MCE _IOW(KVMIO, 0x9e, struct kvm_x86_mce) +/* IA64 stack access */ +#define KVM_IA64_VCPU_GET_STACK _IOR(KVMIO, 0x9a, void *) +#define KVM_IA64_VCPU_SET_STACK _IOW(KVMIO, 0x9b, void *) +/* Available with KVM_CAP_VCPU_EVENTS */ +#define KVM_GET_VCPU_EVENTS _IOR(KVMIO, 0x9f, struct kvm_vcpu_events) +#define KVM_SET_VCPU_EVENTS _IOW(KVMIO, 0xa0, struct kvm_vcpu_events) +/* Available with KVM_CAP_DEBUGREGS */ +#define KVM_GET_DEBUGREGS _IOR(KVMIO, 0xa1, struct kvm_debugregs) +#define KVM_SET_DEBUGREGS _IOW(KVMIO, 0xa2, struct kvm_debugregs) +#define KVM_ENABLE_CAP _IOW(KVMIO, 0xa3, struct kvm_enable_cap) +/* Available with KVM_CAP_XSAVE */ +#define KVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave) +#define KVM_SET_XSAVE _IOW(KVMIO, 0xa5, struct kvm_xsave) +/* Available with KVM_CAP_XCRS */ +#define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs) +#define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs) + +#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) + +struct kvm_assigned_pci_dev { + __u32 assigned_dev_id; + __u32 busnr; + __u32 devfn; + __u32 flags; + __u32 segnr; + union { + __u32 reserved[11]; + }; +}; + +#define KVM_DEV_IRQ_HOST_INTX (1 << 0) +#define KVM_DEV_IRQ_HOST_MSI (1 << 1) +#define KVM_DEV_IRQ_HOST_MSIX (1 << 2) + +#define KVM_DEV_IRQ_GUEST_INTX (1 << 8) +#define KVM_DEV_IRQ_GUEST_MSI (1 << 9) +#define KVM_DEV_IRQ_GUEST_MSIX (1 << 10) + +#define KVM_DEV_IRQ_HOST_MASK 0x00ff +#define KVM_DEV_IRQ_GUEST_MASK 0xff00 + +struct kvm_assigned_irq { + __u32 assigned_dev_id; + __u32 host_irq; + __u32 guest_irq; + __u32 flags; + union { + struct { + __u32 addr_lo; + __u32 addr_hi; + __u32 data; + } guest_msi; + __u32 reserved[12]; + }; +}; + + +struct kvm_assigned_msix_nr { + __u32 assigned_dev_id; + __u16 entry_nr; + __u16 padding; +}; + +#define KVM_MAX_MSIX_PER_DEV 256 +struct kvm_assigned_msix_entry { + __u32 assigned_dev_id; + __u32 gsi; + __u16 entry; /* The index of entry in the MSI-X table */ + __u16 padding[3]; +}; + +#endif /* __LINUX_KVM_H */ diff --git a/linux/usr/include/linux/kvm_para.h b/linux/usr/include/linux/kvm_para.h new file mode 100644 index 0000000..b315e27 --- /dev/null +++ b/linux/usr/include/linux/kvm_para.h @@ -0,0 +1,29 @@ +#ifndef __LINUX_KVM_PARA_H +#define __LINUX_KVM_PARA_H + +/* + * This header file provides a method for making a hypercall to the host + * Architectures should define: + * - kvm_hypercall0, kvm_hypercall1... + * - kvm_arch_para_features + * - kvm_para_available + */ + +/* Return values for hypercalls */ +#define KVM_ENOSYS 1000 +#define KVM_EFAULT EFAULT +#define KVM_E2BIG E2BIG +#define KVM_EPERM EPERM + +#define KVM_HC_VAPIC_POLL_IRQ 1 +#define KVM_HC_MMU_OP 2 +#define KVM_HC_FEATURES 3 +#define KVM_HC_PPC_MAP_MAGIC_PAGE 4 + +/* + * hypercalls use architecture specific + */ +#include <asm/kvm_para.h> + +#endif /* __LINUX_KVM_PARA_H */ + diff --git a/linux/x86/assigned-dev.c b/linux/x86/assigned-dev.c new file mode 100644 index 0000000..86dac05 --- /dev/null +++ b/linux/x86/assigned-dev.c @@ -0,0 +1,827 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +/* + * Kernel-based Virtual Machine - device assignment support + * + * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <linux/errno.h> +#include <linux/spinlock.h> +#include <linux/pci.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include "irq.h" + +static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, + int assigned_dev_id) +{ + struct list_head *ptr; + struct kvm_assigned_dev_kernel *match; + + list_for_each(ptr, head) { + match = list_entry(ptr, struct kvm_assigned_dev_kernel, list); + if (match->assigned_dev_id == assigned_dev_id) + return match; + } + return NULL; +} + +static int find_index_from_host_irq(struct kvm_assigned_dev_kernel + *assigned_dev, int irq) +{ + int i, index; + struct msix_entry *host_msix_entries; + + host_msix_entries = assigned_dev->host_msix_entries; + + index = -1; + for (i = 0; i < assigned_dev->entries_nr; i++) + if (irq == host_msix_entries[i].vector) { + index = i; + break; + } + if (index < 0) { + printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); + return 0; + } + + return index; +} + +static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + u32 vector; + int index; + + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) { + spin_lock(&assigned_dev->intx_lock); + disable_irq_nosync(irq); + assigned_dev->host_irq_disabled = true; + spin_unlock(&assigned_dev->intx_lock); + } + + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { + index = find_index_from_host_irq(assigned_dev, irq); + if (index >= 0) { + vector = assigned_dev-> + guest_msix_entries[index].vector; + kvm_set_irq(assigned_dev->kvm, + assigned_dev->irq_source_id, vector, 1); + } + } else + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, + assigned_dev->guest_irq, 1); + + return IRQ_HANDLED; +} + +/* Ack the irq line for an assigned device */ +static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) +{ + struct kvm_assigned_dev_kernel *dev; + + if (kian->gsi == -1) + return; + + dev = container_of(kian, struct kvm_assigned_dev_kernel, + ack_notifier); + + kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0); + + /* The guest irq may be shared so this ack may be + * from another device. + */ + spin_lock(&dev->intx_lock); + if (dev->host_irq_disabled) { + enable_irq(dev->host_irq); + dev->host_irq_disabled = false; + } + spin_unlock(&dev->intx_lock); +} + +static void deassign_guest_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); + assigned_dev->ack_notifier.gsi = -1; + + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, + assigned_dev->guest_irq, 0); + + if (assigned_dev->irq_source_id != -1) + kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); + assigned_dev->irq_source_id = -1; + assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK); +} + +/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */ +static void deassign_host_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + /* + * We disable irq here to prevent further events. + * + * Notice this maybe result in nested disable if the interrupt type is + * INTx, but it's OK for we are going to free it. + * + * If this function is a part of VM destroy, please ensure that till + * now, the kvm state is still legal for probably we also have to wait + * on a currently running IRQ handler. + */ + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { + int i; + for (i = 0; i < assigned_dev->entries_nr; i++) + disable_irq(assigned_dev->host_msix_entries[i].vector); + + for (i = 0; i < assigned_dev->entries_nr; i++) + free_irq(assigned_dev->host_msix_entries[i].vector, + (void *)assigned_dev); + + assigned_dev->entries_nr = 0; + kfree(assigned_dev->host_msix_entries); + kfree(assigned_dev->guest_msix_entries); + pci_disable_msix(assigned_dev->dev); + } else { + /* Deal with MSI and INTx */ + disable_irq(assigned_dev->host_irq); + + free_irq(assigned_dev->host_irq, (void *)assigned_dev); + + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) + pci_disable_msi(assigned_dev->dev); + } + + assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK); +} + +static int kvm_deassign_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev, + unsigned long irq_requested_type) +{ + unsigned long guest_irq_type, host_irq_type; + + if (!irqchip_in_kernel(kvm)) + return -EINVAL; + /* no irq assignment to deassign */ + if (!assigned_dev->irq_requested_type) + return -ENXIO; + + host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK; + guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK; + + if (host_irq_type) + deassign_host_irq(kvm, assigned_dev); + if (guest_irq_type) + deassign_guest_irq(kvm, assigned_dev); + + return 0; +} + +static void kvm_free_assigned_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); +} + +static void kvm_free_assigned_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel + *assigned_dev) +{ + kvm_free_assigned_irq(kvm, assigned_dev); + + __pci_reset_function(assigned_dev->dev); + pci_restore_state(assigned_dev->dev); + + pci_release_regions(assigned_dev->dev); + pci_disable_device(assigned_dev->dev); + pci_dev_put(assigned_dev->dev); + + list_del(&assigned_dev->list); + kfree(assigned_dev); +} + +void kvm_free_all_assigned_devices(struct kvm *kvm) +{ + struct list_head *ptr, *ptr2; + struct kvm_assigned_dev_kernel *assigned_dev; + + list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { + assigned_dev = list_entry(ptr, + struct kvm_assigned_dev_kernel, + list); + + kvm_free_assigned_device(kvm, assigned_dev); + } +} + +static int assigned_device_enable_host_intx(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev) +{ + dev->host_irq = dev->dev->irq; + /* Even though this is PCI, we don't want to use shared + * interrupts. Sharing host devices with guest-assigned devices + * on the same interrupt line is not a happy situation: there + * are going to be long delays in accepting, acking, etc. + */ + if (kvm_request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, + IRQF_ONESHOT, dev->irq_name, (void *)dev)) + return -EIO; + return 0; +} + +#ifdef __KVM_HAVE_MSI +static int assigned_device_enable_host_msi(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev) +{ + int r; + + if (!dev->dev->msi_enabled) { + r = pci_enable_msi(dev->dev); + if (r) + return r; + } + + dev->host_irq = dev->dev->irq; + if (kvm_request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, + 0, dev->irq_name, (void *)dev)) { + pci_disable_msi(dev->dev); + return -EIO; + } + + return 0; +} +#endif + +#ifdef __KVM_HAVE_MSIX +static int assigned_device_enable_host_msix(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev) +{ + int i, r = -EINVAL; + + /* host_msix_entries and guest_msix_entries should have been + * initialized */ + if (dev->entries_nr == 0) + return r; + + r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr); + if (r) + return r; + + for (i = 0; i < dev->entries_nr; i++) { + r = kvm_request_threaded_irq(dev->host_msix_entries[i].vector, + NULL, kvm_assigned_dev_thread, + 0, dev->irq_name, (void *)dev); + if (r) + goto err; + } + + return 0; +err: + for (i -= 1; i >= 0; i--) + free_irq(dev->host_msix_entries[i].vector, (void *)dev); + pci_disable_msix(dev->dev); + return r; +} + +#endif + +static int assigned_device_enable_guest_intx(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq) +{ + dev->guest_irq = irq->guest_irq; + dev->ack_notifier.gsi = irq->guest_irq; + return 0; +} + +#ifdef __KVM_HAVE_MSI +static int assigned_device_enable_guest_msi(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq) +{ + dev->guest_irq = irq->guest_irq; + dev->ack_notifier.gsi = -1; + dev->host_irq_disabled = false; + return 0; +} +#endif + +#ifdef __KVM_HAVE_MSIX +static int assigned_device_enable_guest_msix(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq) +{ + dev->guest_irq = irq->guest_irq; + dev->ack_notifier.gsi = -1; + dev->host_irq_disabled = false; + return 0; +} +#endif + +static int assign_host_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + __u32 host_irq_type) +{ + int r = -EEXIST; + + if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) + return r; + + snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s", + pci_name(dev->dev)); + + switch (host_irq_type) { + case KVM_DEV_IRQ_HOST_INTX: + r = assigned_device_enable_host_intx(kvm, dev); + break; +#ifdef __KVM_HAVE_MSI + case KVM_DEV_IRQ_HOST_MSI: + r = assigned_device_enable_host_msi(kvm, dev); + break; +#endif +#ifdef __KVM_HAVE_MSIX + case KVM_DEV_IRQ_HOST_MSIX: + r = assigned_device_enable_host_msix(kvm, dev); + break; +#endif + default: + r = -EINVAL; + } + + if (!r) + dev->irq_requested_type |= host_irq_type; + + return r; +} + +static int assign_guest_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq, + unsigned long guest_irq_type) +{ + int id; + int r = -EEXIST; + + if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK) + return r; + + id = kvm_request_irq_source_id(kvm); + if (id < 0) + return id; + + dev->irq_source_id = id; + + switch (guest_irq_type) { + case KVM_DEV_IRQ_GUEST_INTX: + r = assigned_device_enable_guest_intx(kvm, dev, irq); + break; +#ifdef __KVM_HAVE_MSI + case KVM_DEV_IRQ_GUEST_MSI: + r = assigned_device_enable_guest_msi(kvm, dev, irq); + break; +#endif +#ifdef __KVM_HAVE_MSIX + case KVM_DEV_IRQ_GUEST_MSIX: + r = assigned_device_enable_guest_msix(kvm, dev, irq); + break; +#endif + default: + r = -EINVAL; + } + + if (!r) { + dev->irq_requested_type |= guest_irq_type; + kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); + } else + kvm_free_irq_source_id(kvm, dev->irq_source_id); + + return r; +} + +/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ +static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, + struct kvm_assigned_irq *assigned_irq) +{ + int r = -EINVAL; + struct kvm_assigned_dev_kernel *match; + unsigned long host_irq_type, guest_irq_type; + + if (!irqchip_in_kernel(kvm)) + return r; + + mutex_lock(&kvm->lock); + r = -ENODEV; + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_irq->assigned_dev_id); + if (!match) + goto out; + + host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK); + guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK); + + r = -EINVAL; + /* can only assign one type at a time */ + if (hweight_long(host_irq_type) > 1) + goto out; + if (hweight_long(guest_irq_type) > 1) + goto out; + if (host_irq_type == 0 && guest_irq_type == 0) + goto out; + + r = 0; + if (host_irq_type) + r = assign_host_irq(kvm, match, host_irq_type); + if (r) + goto out; + + if (guest_irq_type) + r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type); +out: + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, + struct kvm_assigned_irq + *assigned_irq) +{ + int r = -ENODEV; + struct kvm_assigned_dev_kernel *match; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_irq->assigned_dev_id); + if (!match) + goto out; + + r = kvm_deassign_irq(kvm, match, assigned_irq->flags); +out: + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_assign_device(struct kvm *kvm, + struct kvm_assigned_pci_dev *assigned_dev) +{ + int r = 0, idx; + struct kvm_assigned_dev_kernel *match; + struct pci_dev *dev; + + mutex_lock(&kvm->lock); + idx = srcu_read_lock(&kvm->srcu); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_dev->assigned_dev_id); + if (match) { + /* device already assigned */ + r = -EEXIST; + goto out; + } + + match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); + if (match == NULL) { + printk(KERN_INFO "%s: Couldn't allocate memory\n", + __func__); + r = -ENOMEM; + goto out; + } + dev = pci_get_domain_bus_and_slot(assigned_dev->segnr, + assigned_dev->busnr, + assigned_dev->devfn); + if (!dev) { + printk(KERN_INFO "%s: host device not found\n", __func__); + r = -EINVAL; + goto out_free; + } + if (pci_enable_device(dev)) { + printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); + r = -EBUSY; + goto out_put; + } + r = pci_request_regions(dev, "kvm_assigned_device"); + if (r) { + printk(KERN_INFO "%s: Could not get access to device regions\n", + __func__); + goto out_disable; + } + + pci_reset_function(dev); + pci_save_state(dev); + + match->assigned_dev_id = assigned_dev->assigned_dev_id; + match->host_segnr = assigned_dev->segnr; + match->host_busnr = assigned_dev->busnr; + match->host_devfn = assigned_dev->devfn; + match->flags = assigned_dev->flags; + match->dev = dev; + spin_lock_init(&match->intx_lock); + match->irq_source_id = -1; + match->kvm = kvm; + match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; + + list_add(&match->list, &kvm->arch.assigned_dev_head); + + if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { + if (!kvm->arch.iommu_domain) { + r = kvm_iommu_map_guest(kvm); + if (r) + goto out_list_del; + } + r = kvm_assign_device(kvm, match); + if (r) + goto out_list_del; + } + +out: + srcu_read_unlock(&kvm->srcu, idx); + mutex_unlock(&kvm->lock); + return r; +out_list_del: + pci_restore_state(dev); + list_del(&match->list); + pci_release_regions(dev); +out_disable: + pci_disable_device(dev); +out_put: + pci_dev_put(dev); +out_free: + kfree(match); + srcu_read_unlock(&kvm->srcu, idx); + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, + struct kvm_assigned_pci_dev *assigned_dev) +{ + int r = 0; + struct kvm_assigned_dev_kernel *match; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_dev->assigned_dev_id); + if (!match) { + printk(KERN_INFO "%s: device hasn't been assigned before, " + "so cannot be deassigned\n", __func__); + r = -EINVAL; + goto out; + } + + if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) + kvm_deassign_device(kvm, match); + + kvm_free_assigned_device(kvm, match); + +out: + mutex_unlock(&kvm->lock); + return r; +} + + +#ifdef __KVM_HAVE_MSIX +static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, + struct kvm_assigned_msix_nr *entry_nr) +{ + int r = 0; + struct kvm_assigned_dev_kernel *adev; + + mutex_lock(&kvm->lock); + + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + entry_nr->assigned_dev_id); + if (!adev) { + r = -EINVAL; + goto msix_nr_out; + } + + if (adev->entries_nr == 0) { + adev->entries_nr = entry_nr->entry_nr; + if (adev->entries_nr == 0 || + adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) { + r = -EINVAL; + goto msix_nr_out; + } + + adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) * + entry_nr->entry_nr, + GFP_KERNEL); + if (!adev->host_msix_entries) { + r = -ENOMEM; + goto msix_nr_out; + } + adev->guest_msix_entries = + kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr, + GFP_KERNEL); + if (!adev->guest_msix_entries) { + kfree(adev->host_msix_entries); + r = -ENOMEM; + goto msix_nr_out; + } + } else /* Not allowed set MSI-X number twice */ + r = -EINVAL; +msix_nr_out: + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, + struct kvm_assigned_msix_entry *entry) +{ + int r = 0, i; + struct kvm_assigned_dev_kernel *adev; + + mutex_lock(&kvm->lock); + + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + entry->assigned_dev_id); + + if (!adev) { + r = -EINVAL; + goto msix_entry_out; + } + + for (i = 0; i < adev->entries_nr; i++) + if (adev->guest_msix_entries[i].vector == 0 || + adev->guest_msix_entries[i].entry == entry->entry) { + adev->guest_msix_entries[i].entry = entry->entry; + adev->guest_msix_entries[i].vector = entry->gsi; + adev->host_msix_entries[i].entry = entry->entry; + break; + } + if (i == adev->entries_nr) { + r = -ENOSPC; + goto msix_entry_out; + } + +msix_entry_out: + mutex_unlock(&kvm->lock); + + return r; +} +#endif + +long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, + unsigned long arg) +{ + void *argp = (void *)arg; + int r; + + switch (ioctl) { + case KVM_ASSIGN_PCI_DEVICE: { + struct kvm_assigned_pci_dev assigned_dev; + + r = -EFAULT; + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) + goto out; + r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); + if (r) + goto out; + break; + } + case KVM_ASSIGN_IRQ: { + r = -EOPNOTSUPP; + break; + } + case KVM_ASSIGN_DEV_IRQ: { + struct kvm_assigned_irq assigned_irq; + + r = -EFAULT; + if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) + goto out; + r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); + if (r) + goto out; + break; + } + case KVM_DEASSIGN_DEV_IRQ: { + struct kvm_assigned_irq assigned_irq; + + r = -EFAULT; + if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) + goto out; + r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq); + if (r) + goto out; + break; + } + case KVM_DEASSIGN_PCI_DEVICE: { + struct kvm_assigned_pci_dev assigned_dev; + + r = -EFAULT; + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) + goto out; + r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev); + if (r) + goto out; + break; + } +#ifdef KVM_CAP_IRQ_ROUTING + case KVM_SET_GSI_ROUTING: { + struct kvm_irq_routing routing; + struct kvm_irq_routing *urouting; + struct kvm_irq_routing_entry *entries; + + r = -EFAULT; + if (copy_from_user(&routing, argp, sizeof(routing))) + goto out; + r = -EINVAL; + if (routing.nr >= KVM_MAX_IRQ_ROUTES) + goto out; + if (routing.flags) + goto out; + r = -ENOMEM; + entries = vmalloc(routing.nr * sizeof(*entries)); + if (!entries) + goto out; + r = -EFAULT; + urouting = argp; + if (copy_from_user(entries, urouting->entries, + routing.nr * sizeof(*entries))) + goto out_free_irq_routing; + r = kvm_set_irq_routing(kvm, entries, routing.nr, + routing.flags); + out_free_irq_routing: + vfree(entries); + break; + } +#endif /* KVM_CAP_IRQ_ROUTING */ +#ifdef __KVM_HAVE_MSIX + case KVM_ASSIGN_SET_MSIX_NR: { + struct kvm_assigned_msix_nr entry_nr; + r = -EFAULT; + if (copy_from_user(&entry_nr, argp, sizeof entry_nr)) + goto out; + r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr); + if (r) + goto out; + break; + } + case KVM_ASSIGN_SET_MSIX_ENTRY: { + struct kvm_assigned_msix_entry entry; + r = -EFAULT; + if (copy_from_user(&entry, argp, sizeof entry)) + goto out; + r = kvm_vm_ioctl_set_msix_entry(kvm, &entry); + if (r) + goto out; + break; + } +#endif + default: + r = -ENOTTY; + break; + } +out: + return r; +} + diff --git a/linux/x86/coalesced_mmio.c b/linux/x86/coalesced_mmio.c new file mode 100644 index 0000000..850773e --- /dev/null +++ b/linux/x86/coalesced_mmio.c @@ -0,0 +1,231 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +/* + * KVM coalesced MMIO + * + * Copyright (c) 2008 Bull S.A.S. + * Copyright 2009 Red Hat, Inc. and/or its affiliates. + * + * Author: Laurent Vivier <Laurent.Vivier@bull.net> + * + */ + +#include "iodev.h" + +#include <linux/kvm_host.h> +#include <linux/slab.h> +#include <linux/kvm.h> + +#include "coalesced_mmio.h" + +static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_coalesced_mmio_dev, dev); +} + +static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev, + gpa_t addr, int len) +{ + struct kvm_coalesced_mmio_zone *zone; + struct kvm_coalesced_mmio_ring *ring; + unsigned avail; + int i; + + /* Are we able to batch it ? */ + + /* last is the first free entry + * check if we don't meet the first used entry + * there is always one unused entry in the buffer + */ + ring = dev->kvm->coalesced_mmio_ring; + avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX; + if (avail < KVM_MAX_VCPUS) { + /* full */ + return 0; + } + + /* is it in a batchable area ? */ + + for (i = 0; i < dev->nb_zones; i++) { + zone = &dev->zone[i]; + + /* (addr,len) is fully included in + * (zone->addr, zone->size) + */ + + if (zone->addr <= addr && + addr + len <= zone->addr + zone->size) + return 1; + } + return 0; +} + +static int coalesced_mmio_write(struct kvm_io_device *this, + gpa_t addr, int len, const void *val) +{ + struct kvm_coalesced_mmio_dev *dev = to_mmio(this); + struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring; + if (!coalesced_mmio_in_range(dev, addr, len)) + return -EOPNOTSUPP; + + spin_lock(&dev->lock); + + /* copy data in first free entry of the ring */ + + ring->coalesced_mmio[ring->last].phys_addr = addr; + ring->coalesced_mmio[ring->last].len = len; + memcpy(ring->coalesced_mmio[ring->last].data, val, len); + smp_wmb(); + ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX; + spin_unlock(&dev->lock); + return 0; +} + +static void coalesced_mmio_destructor(struct kvm_io_device *this) +{ + struct kvm_coalesced_mmio_dev *dev = to_mmio(this); + + kfree(dev); +} + +static const struct kvm_io_device_ops coalesced_mmio_ops = { + .write = coalesced_mmio_write, + .destructor = coalesced_mmio_destructor, +}; + +int kvm_coalesced_mmio_init(struct kvm *kvm) +{ + struct kvm_coalesced_mmio_dev *dev; + struct page *page; + int ret; + + ret = -ENOMEM; + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto out_err; + kvm->coalesced_mmio_ring = page_address(page); + + ret = -ENOMEM; + dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); + if (!dev) + goto out_free_page; + spin_lock_init(&dev->lock); + kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops); + dev->kvm = kvm; + kvm->coalesced_mmio_dev = dev; + + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev); + mutex_unlock(&kvm->slots_lock); + if (ret < 0) + goto out_free_dev; + + return ret; + +out_free_dev: + kvm->coalesced_mmio_dev = NULL; + kfree(dev); +out_free_page: + kvm->coalesced_mmio_ring = NULL; + __free_page(page); +out_err: + return ret; +} + +void kvm_coalesced_mmio_free(struct kvm *kvm) +{ + if (kvm->coalesced_mmio_ring) + free_page((unsigned long)kvm->coalesced_mmio_ring); +} + +int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, + struct kvm_coalesced_mmio_zone *zone) +{ + struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev; + + if (dev == NULL) + return -ENXIO; + + mutex_lock(&kvm->slots_lock); + if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) { + mutex_unlock(&kvm->slots_lock); + return -ENOBUFS; + } + + dev->zone[dev->nb_zones] = *zone; + dev->nb_zones++; + + mutex_unlock(&kvm->slots_lock); + return 0; +} + +int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, + struct kvm_coalesced_mmio_zone *zone) +{ + int i; + struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev; + struct kvm_coalesced_mmio_zone *z; + + if (dev == NULL) + return -ENXIO; + + mutex_lock(&kvm->slots_lock); + + i = dev->nb_zones; + while (i) { + z = &dev->zone[i - 1]; + + /* unregister all zones + * included in (zone->addr, zone->size) + */ + + if (zone->addr <= z->addr && + z->addr + z->size <= zone->addr + zone->size) { + dev->nb_zones--; + *z = dev->zone[dev->nb_zones]; + } + i--; + } + + mutex_unlock(&kvm->slots_lock); + + return 0; +} diff --git a/linux/x86/coalesced_mmio.h b/linux/x86/coalesced_mmio.h new file mode 100644 index 0000000..8a5959e --- /dev/null +++ b/linux/x86/coalesced_mmio.h @@ -0,0 +1,39 @@ +#ifndef __KVM_COALESCED_MMIO_H__ +#define __KVM_COALESCED_MMIO_H__ + +/* + * KVM coalesced MMIO + * + * Copyright (c) 2008 Bull S.A.S. + * + * Author: Laurent Vivier <Laurent.Vivier@bull.net> + * + */ + +#ifdef CONFIG_KVM_MMIO + +#define KVM_COALESCED_MMIO_ZONE_MAX 100 + +struct kvm_coalesced_mmio_dev { + struct kvm_io_device dev; + struct kvm *kvm; + spinlock_t lock; + int nb_zones; + struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX]; +}; + +int kvm_coalesced_mmio_init(struct kvm *kvm); +void kvm_coalesced_mmio_free(struct kvm *kvm); +int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, + struct kvm_coalesced_mmio_zone *zone); +int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, + struct kvm_coalesced_mmio_zone *zone); + +#else + +static inline int kvm_coalesced_mmio_init(struct kvm *kvm) { return 0; } +static inline void kvm_coalesced_mmio_free(struct kvm *kvm) { } + +#endif + +#endif diff --git a/linux/x86/emulate.c b/linux/x86/emulate.c new file mode 100644 index 0000000..cffdf3c --- /dev/null +++ b/linux/x86/emulate.c @@ -0,0 +1,3771 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +/****************************************************************************** + * emulate.c + * + * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. + * + * Copyright (c) 2005 Keir Fraser + * + * Linux coding style, mod r/m decoder, segment base fixes, real-mode + * privileged instructions: + * + * Copyright (C) 2006 Qumranet + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 + */ + +#include <linux/kvm_host.h> +#include "kvm_cache_regs.h" +#include <linux/module.h> +#include <asm/kvm_emulate.h> + +#include "x86.h" +#include "tss.h" + +/* + * Opcode effective-address decode tables. + * Note that we only emulate instructions that have at least one memory + * operand (excluding implicit stack references). We assume that stack + * references and instruction fetches will never occur in special memory + * areas that require emulation. So, for example, 'mov <imm>,<reg>' need + * not be handled. + */ + +/* Operand sizes: 8-bit operands or specified/overridden size. */ +#define ByteOp (1<<0) /* 8-bit operands. */ +/* Destination operand type. */ +#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ +#define DstReg (2<<1) /* Register operand. */ +#define DstMem (3<<1) /* Memory operand. */ +#define DstAcc (4<<1) /* Destination Accumulator */ +#define DstDI (5<<1) /* Destination is in ES:(E)DI */ +#define DstMem64 (6<<1) /* 64bit memory operand */ +#define DstImmUByte (7<<1) /* 8-bit unsigned immediate operand */ +#define DstMask (7<<1) +/* Source operand type. */ +#define SrcNone (0<<4) /* No source operand. */ +#define SrcReg (1<<4) /* Register operand. */ +#define SrcMem (2<<4) /* Memory operand. */ +#define SrcMem16 (3<<4) /* Memory operand (16-bit). */ +#define SrcMem32 (4<<4) /* Memory operand (32-bit). */ +#define SrcImm (5<<4) /* Immediate operand. */ +#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ +#define SrcOne (7<<4) /* Implied '1' */ +#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ +#define SrcImmU (9<<4) /* Immediate operand, unsigned */ +#define SrcSI (0xa<<4) /* Source is in the DS:RSI */ +#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */ +#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */ +#define SrcAcc (0xd<<4) /* Source Accumulator */ +#define SrcImmU16 (0xe<<4) /* Immediate operand, unsigned, 16 bits */ +#define SrcMask (0xf<<4) +/* Generic ModRM decode. */ +#define ModRM (1<<8) +/* Destination is only written; never read. */ +#define Mov (1<<9) +#define BitOp (1<<10) +#define MemAbs (1<<11) /* Memory operand is absolute displacement */ +#define String (1<<12) /* String instruction (rep capable) */ +#define Stack (1<<13) /* Stack instruction (push/pop) */ +#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ +#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ +/* Misc flags */ +#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ +#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ +#define Undefined (1<<25) /* No Such Instruction */ +#define Lock (1<<26) /* lock prefix is allowed for the instruction */ +#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ +#define No64 (1<<28) +/* Source 2 operand type */ +#define Src2None (0<<29) +#define Src2CL (1<<29) +#define Src2ImmByte (2<<29) +#define Src2One (3<<29) +#define Src2Imm (4<<29) +#define Src2Mask (7<<29) + +#define X2(x...) x, x +#define X3(x...) X2(x), x +#define X4(x...) X2(x), X2(x) +#define X5(x...) X4(x), x +#define X6(x...) X4(x), X2(x) +#define X7(x...) X4(x), X3(x) +#define X8(x...) X4(x), X4(x) +#define X16(x...) X8(x), X8(x) + +struct opcode { + u32 flags; + union { + int (*execute)(struct x86_emulate_ctxt *ctxt); + struct opcode *group; + struct group_dual *gdual; + } u; +}; + +struct group_dual { + struct opcode mod012[8]; + struct opcode mod3[8]; +}; + +/* EFLAGS bit definitions. */ +#define EFLG_ID (1<<21) +#define EFLG_VIP (1<<20) +#define EFLG_VIF (1<<19) +#define EFLG_AC (1<<18) +#define EFLG_VM (1<<17) +#define EFLG_RF (1<<16) +#define EFLG_IOPL (3<<12) +#define EFLG_NT (1<<14) +#define EFLG_OF (1<<11) +#define EFLG_DF (1<<10) +#define EFLG_IF (1<<9) +#define EFLG_TF (1<<8) +#define EFLG_SF (1<<7) +#define EFLG_ZF (1<<6) +#define EFLG_AF (1<<4) +#define EFLG_PF (1<<2) +#define EFLG_CF (1<<0) + +#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a +#define EFLG_RESERVED_ONE_MASK 2 + +/* + * Instruction emulation: + * Most instructions are emulated directly via a fragment of inline assembly + * code. This allows us to save/restore EFLAGS and thus very easily pick up + * any modified flags. + */ + +#if defined(CONFIG_X86_64) +#define _LO32 "k" /* force 32-bit operand */ +#define _STK "%%rsp" /* stack pointer */ +#elif defined(__i386__) +#define _LO32 "" /* force 32-bit operand */ +#define _STK "%%esp" /* stack pointer */ +#endif + +/* + * These EFLAGS bits are restored from saved value during emulation, and + * any changes are written back to the saved value after emulation. + */ +#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) + +/* Before executing instruction: restore necessary bits in EFLAGS. */ +#define _PRE_EFLAGS(_sav, _msk, _tmp) \ + /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \ + "movl %"_sav",%"_LO32 _tmp"; " \ + "push %"_tmp"; " \ + "push %"_tmp"; " \ + "movl %"_msk",%"_LO32 _tmp"; " \ + "andl %"_LO32 _tmp",("_STK"); " \ + "pushf; " \ + "notl %"_LO32 _tmp"; " \ + "andl %"_LO32 _tmp",("_STK"); " \ + "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \ + "pop %"_tmp"; " \ + "orl %"_LO32 _tmp",("_STK"); " \ + "popf; " \ + "pop %"_sav"; " + +/* After executing instruction: write-back necessary bits in EFLAGS. */ +#define _POST_EFLAGS(_sav, _msk, _tmp) \ + /* _sav |= EFLAGS & _msk; */ \ + "pushf; " \ + "pop %"_tmp"; " \ + "andl %"_msk",%"_LO32 _tmp"; " \ + "orl %"_LO32 _tmp",%"_sav"; " + +#ifdef CONFIG_X86_64 +#define ON64(x) x +#else +#define ON64(x) +#endif + +#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \ + do { \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "4", "2") \ + _op _suffix " %"_x"3,%1; " \ + _POST_EFLAGS("0", "4", "2") \ + : "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\ + "=&r" (_tmp) \ + : _y ((_src).val), "i" (EFLAGS_MASK)); \ + } while (0) + + +/* Raw emulation: instruction has two explicit operands. */ +#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ + do { \ + unsigned long _tmp; \ + \ + switch ((_dst).bytes) { \ + case 2: \ + ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\ + break; \ + case 4: \ + ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\ + break; \ + case 8: \ + ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \ + break; \ + } \ + } while (0) + +#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ + do { \ + unsigned long _tmp; \ + switch ((_dst).bytes) { \ + case 1: \ + ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \ + break; \ + default: \ + __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ + _wx, _wy, _lx, _ly, _qx, _qy); \ + break; \ + } \ + } while (0) + +/* Source operand is byte-sized and may be restricted to just %cl. */ +#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ + __emulate_2op(_op, _src, _dst, _eflags, \ + "b", "c", "b", "c", "b", "c", "b", "c") + +/* Source operand is byte, word, long or quad sized. */ +#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ + __emulate_2op(_op, _src, _dst, _eflags, \ + "b", "q", "w", "r", _LO32, "r", "", "r") + +/* Source operand is word, long or quad sized. */ +#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ + __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ + "w", "r", _LO32, "r", "", "r") + +/* Instruction has three operands and one operand is stored in ECX register */ +#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ + do { \ + unsigned long _tmp; \ + _type _clv = (_cl).val; \ + _type _srcv = (_src).val; \ + _type _dstv = (_dst).val; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "5", "2") \ + _op _suffix " %4,%1 \n" \ + _POST_EFLAGS("0", "5", "2") \ + : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ + : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ + ); \ + \ + (_cl).val = (unsigned long) _clv; \ + (_src).val = (unsigned long) _srcv; \ + (_dst).val = (unsigned long) _dstv; \ + } while (0) + +#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ + do { \ + switch ((_dst).bytes) { \ + case 2: \ + __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "w", unsigned short); \ + break; \ + case 4: \ + __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "l", unsigned int); \ + break; \ + case 8: \ + ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "q", unsigned long)); \ + break; \ + } \ + } while (0) + +#define __emulate_1op(_op, _dst, _eflags, _suffix) \ + do { \ + unsigned long _tmp; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "3", "2") \ + _op _suffix " %1; " \ + _POST_EFLAGS("0", "3", "2") \ + : "=m" (_eflags), "+m" ((_dst).val), \ + "=&r" (_tmp) \ + : "i" (EFLAGS_MASK)); \ + } while (0) + +/* Instruction has only one explicit operand (no source operand). */ +#define emulate_1op(_op, _dst, _eflags) \ + do { \ + switch ((_dst).bytes) { \ + case 1: __emulate_1op(_op, _dst, _eflags, "b"); break; \ + case 2: __emulate_1op(_op, _dst, _eflags, "w"); break; \ + case 4: __emulate_1op(_op, _dst, _eflags, "l"); break; \ + case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \ + } \ + } while (0) + +#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix) \ + do { \ + unsigned long _tmp; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "4", "1") \ + _op _suffix " %5; " \ + _POST_EFLAGS("0", "4", "1") \ + : "=m" (_eflags), "=&r" (_tmp), \ + "+a" (_rax), "+d" (_rdx) \ + : "i" (EFLAGS_MASK), "m" ((_src).val), \ + "a" (_rax), "d" (_rdx)); \ + } while (0) + +#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \ + do { \ + unsigned long _tmp; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "5", "1") \ + "1: \n\t" \ + _op _suffix " %6; " \ + "2: \n\t" \ + _POST_EFLAGS("0", "5", "1") \ + ".pushsection .fixup,\"ax\" \n\t" \ + "3: movb $1, %4 \n\t" \ + "jmp 2b \n\t" \ + ".popsection \n\t" \ + _ASM_EXTABLE(1b, 3b) \ + : "=m" (_eflags), "=&r" (_tmp), \ + "+a" (_rax), "+d" (_rdx), "+qm"(_ex) \ + : "i" (EFLAGS_MASK), "m" ((_src).val), \ + "a" (_rax), "d" (_rdx)); \ + } while (0) + +/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ +#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ + do { \ + switch((_src).bytes) { \ + case 1: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "b"); break; \ + case 2: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "w"); break; \ + case 4: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "l"); break; \ + case 8: ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "q")); break; \ + } \ + } while (0) + +#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex) \ + do { \ + switch((_src).bytes) { \ + case 1: \ + __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ + _eflags, "b", _ex); \ + break; \ + case 2: \ + __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ + _eflags, "w", _ex); \ + break; \ + case 4: \ + __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ + _eflags, "l", _ex); \ + break; \ + case 8: ON64( \ + __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ + _eflags, "q", _ex)); \ + break; \ + } \ + } while (0) + +/* Fetch next part of the instruction being emulated. */ +#define insn_fetch(_type, _size, _eip) \ +({ unsigned long _x; \ + rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ + if (rc != X86EMUL_CONTINUE) \ + goto done; \ + (_eip) += (_size); \ + (_type)_x; \ +}) + +#define insn_fetch_arr(_arr, _size, _eip) \ +({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \ + if (rc != X86EMUL_CONTINUE) \ + goto done; \ + (_eip) += (_size); \ +}) + +static inline unsigned long ad_mask(struct decode_cache *c) +{ + return (1UL << (c->ad_bytes << 3)) - 1; +} + +/* Access/update address held in a register, based on addressing mode. */ +static inline unsigned long +address_mask(struct decode_cache *c, unsigned long reg) +{ + if (c->ad_bytes == sizeof(unsigned long)) + return reg; + else + return reg & ad_mask(c); +} + +static inline unsigned long +register_address(struct decode_cache *c, unsigned long reg) +{ + return address_mask(c, reg); +} + +static inline void +register_address_increment(struct decode_cache *c, unsigned long *reg, int inc) +{ + if (c->ad_bytes == sizeof(unsigned long)) + *reg += inc; + else + *reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c)); +} + +static inline void jmp_rel(struct decode_cache *c, int rel) +{ + register_address_increment(c, &c->eip, rel); +} + +static void set_seg_override(struct decode_cache *c, int seg) +{ + c->has_seg_override = true; + c->seg_override = seg; +} + +static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) +{ + if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) + return 0; + + return ops->get_cached_segment_base(seg, ctxt->vcpu); +} + +static unsigned seg_override(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + struct decode_cache *c) +{ + if (!c->has_seg_override) + return 0; + + return c->seg_override; +} + +static ulong linear(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr) +{ + struct decode_cache *c = &ctxt->decode; + ulong la; + + la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea; + if (c->ad_bytes != 8) + la &= (u32)-1; + return la; +} + +static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, + u32 error, bool valid) +{ + ctxt->exception.vector = vec; + ctxt->exception.error_code = error; + ctxt->exception.error_code_valid = valid; + return X86EMUL_PROPAGATE_FAULT; +} + +static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err) +{ + return emulate_exception(ctxt, GP_VECTOR, err, true); +} + +static int emulate_ud(struct x86_emulate_ctxt *ctxt) +{ + return emulate_exception(ctxt, UD_VECTOR, 0, false); +} + +static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err) +{ + return emulate_exception(ctxt, TS_VECTOR, err, true); +} + +static int emulate_de(struct x86_emulate_ctxt *ctxt) +{ + return emulate_exception(ctxt, DE_VECTOR, 0, false); +} + +static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned long eip, u8 *dest) +{ + struct fetch_cache *fc = &ctxt->decode.fetch; + int rc; + int size, cur_size; + + if (eip == fc->end) { + cur_size = fc->end - fc->start; + size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); + rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size, + size, ctxt->vcpu, &ctxt->exception); + if (rc != X86EMUL_CONTINUE) + return rc; + fc->end += size; + } + *dest = fc->data[eip - fc->start]; + return X86EMUL_CONTINUE; +} + +static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned long eip, void *dest, unsigned size) +{ + int rc; + + /* x86 instructions are limited to 15 bytes. */ + if (eip + size - ctxt->eip > 15) + return X86EMUL_UNHANDLEABLE; + while (size--) { + rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); + if (rc != X86EMUL_CONTINUE) + return rc; + } + return X86EMUL_CONTINUE; +} + +/* + * Given the 'reg' portion of a ModRM byte, and a register block, return a + * pointer into the block that addresses the relevant register. + * @highbyte_regs specifies whether to decode AH,CH,DH,BH. + */ +static void *decode_register(u8 modrm_reg, unsigned long *regs, + int highbyte_regs) +{ + void *p; + + p = ®s[modrm_reg]; + if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) + p = (unsigned char *)®s[modrm_reg & 3] + 1; + return p; +} + +static int read_descriptor(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + struct segmented_address addr, + u16 *size, unsigned long *address, int op_bytes) +{ + int rc; + + if (op_bytes == 2) + op_bytes = 3; + *address = 0; + rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2, + ctxt->vcpu, &ctxt->exception); + if (rc != X86EMUL_CONTINUE) + return rc; + addr.ea += 2; + rc = ops->read_std(linear(ctxt, addr), address, op_bytes, + ctxt->vcpu, &ctxt->exception); + return rc; +} + +static int test_cc(unsigned int condition, unsigned int flags) +{ + int rc = 0; + + switch ((condition & 15) >> 1) { + case 0: /* o */ + rc |= (flags & EFLG_OF); + break; + case 1: /* b/c/nae */ + rc |= (flags & EFLG_CF); + break; + case 2: /* z/e */ + rc |= (flags & EFLG_ZF); + break; + case 3: /* be/na */ + rc |= (flags & (EFLG_CF|EFLG_ZF)); + break; + case 4: /* s */ + rc |= (flags & EFLG_SF); + break; + case 5: /* p/pe */ + rc |= (flags & EFLG_PF); + break; + case 7: /* le/ng */ + rc |= (flags & EFLG_ZF); + /* fall through */ + case 6: /* l/nge */ + rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); + break; + } + + /* Odd condition identifiers (lsb == 1) have inverted sense. */ + return (!!rc ^ (condition & 1)); +} + +static void fetch_register_operand(struct operand *op) +{ + switch (op->bytes) { + case 1: + op->val = *(u8 *)op->addr.reg; + break; + case 2: + op->val = *(u16 *)op->addr.reg; + break; + case 4: + op->val = *(u32 *)op->addr.reg; + break; + case 8: + op->val = *(u64 *)op->addr.reg; + break; + } +} + +static void decode_register_operand(struct operand *op, + struct decode_cache *c, + int inhibit_bytereg) +{ + unsigned reg = c->modrm_reg; + int highbyte_regs = c->rex_prefix == 0; + + if (!(c->d & ModRM)) + reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); + op->type = OP_REG; + if ((c->d & ByteOp) && !inhibit_bytereg) { + op->addr.reg = decode_register(reg, c->regs, highbyte_regs); + op->bytes = 1; + } else { + op->addr.reg = decode_register(reg, c->regs, 0); + op->bytes = c->op_bytes; + } + fetch_register_operand(op); + op->orig_val = op->val; +} + +static int decode_modrm(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + struct operand *op) +{ + struct decode_cache *c = &ctxt->decode; + u8 sib; + int index_reg = 0, base_reg = 0, scale; + int rc = X86EMUL_CONTINUE; + ulong modrm_ea = 0; + + if (c->rex_prefix) { + c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ + index_reg = (c->rex_prefix & 2) << 2; /* REX.X */ + c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */ + } + + c->modrm = insn_fetch(u8, 1, c->eip); + c->modrm_mod |= (c->modrm & 0xc0) >> 6; + c->modrm_reg |= (c->modrm & 0x38) >> 3; + c->modrm_rm |= (c->modrm & 0x07); + c->modrm_seg = VCPU_SREG_DS; + + if (c->modrm_mod == 3) { + op->type = OP_REG; + op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + op->addr.reg = decode_register(c->modrm_rm, + c->regs, c->d & ByteOp); + fetch_register_operand(op); + return rc; + } + + op->type = OP_MEM; + + if (c->ad_bytes == 2) { + unsigned bx = c->regs[VCPU_REGS_RBX]; + unsigned bp = c->regs[VCPU_REGS_RBP]; + unsigned si = c->regs[VCPU_REGS_RSI]; + unsigned di = c->regs[VCPU_REGS_RDI]; + + /* 16-bit ModR/M decode. */ + switch (c->modrm_mod) { + case 0: + if (c->modrm_rm == 6) + modrm_ea += insn_fetch(u16, 2, c->eip); + break; + case 1: + modrm_ea += insn_fetch(s8, 1, c->eip); + break; + case 2: + modrm_ea += insn_fetch(u16, 2, c->eip); + break; + } + switch (c->modrm_rm) { + case 0: + modrm_ea += bx + si; + break; + case 1: + modrm_ea += bx + di; + break; + case 2: + modrm_ea += bp + si; + break; + case 3: + modrm_ea += bp + di; + break; + case 4: + modrm_ea += si; + break; + case 5: + modrm_ea += di; + break; + case 6: + if (c->modrm_mod != 0) + modrm_ea += bp; + break; + case 7: + modrm_ea += bx; + break; + } + if (c->modrm_rm == 2 || c->modrm_rm == 3 || + (c->modrm_rm == 6 && c->modrm_mod != 0)) + c->modrm_seg = VCPU_SREG_SS; + modrm_ea = (u16)modrm_ea; + } else { + /* 32/64-bit ModR/M decode. */ + if ((c->modrm_rm & 7) == 4) { + sib = insn_fetch(u8, 1, c->eip); + index_reg |= (sib >> 3) & 7; + base_reg |= sib & 7; + scale = sib >> 6; + + if ((base_reg & 7) == 5 && c->modrm_mod == 0) + modrm_ea += insn_fetch(s32, 4, c->eip); + else + modrm_ea += c->regs[base_reg]; + if (index_reg != 4) + modrm_ea += c->regs[index_reg] << scale; + } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { + if (ctxt->mode == X86EMUL_MODE_PROT64) + c->rip_relative = 1; + } else + modrm_ea += c->regs[c->modrm_rm]; + switch (c->modrm_mod) { + case 0: + if (c->modrm_rm == 5) + modrm_ea += insn_fetch(s32, 4, c->eip); + break; + case 1: + modrm_ea += insn_fetch(s8, 1, c->eip); + break; + case 2: + modrm_ea += insn_fetch(s32, 4, c->eip); + break; + } + } + op->addr.mem.ea = modrm_ea; +done: + return rc; +} + +static int decode_abs(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + struct operand *op) +{ + struct decode_cache *c = &ctxt->decode; + int rc = X86EMUL_CONTINUE; + + op->type = OP_MEM; + switch (c->ad_bytes) { + case 2: + op->addr.mem.ea = insn_fetch(u16, 2, c->eip); + break; + case 4: + op->addr.mem.ea = insn_fetch(u32, 4, c->eip); + break; + case 8: + op->addr.mem.ea = insn_fetch(u64, 8, c->eip); + break; + } +done: + return rc; +} + +static void fetch_bit_operand(struct decode_cache *c) +{ + long sv = 0, mask; + + if (c->dst.type == OP_MEM && c->src.type == OP_REG) { + mask = ~(c->dst.bytes * 8 - 1); + + if (c->src.bytes == 2) + sv = (s16)c->src.val & (s16)mask; + else if (c->src.bytes == 4) + sv = (s32)c->src.val & (s32)mask; + + c->dst.addr.mem.ea += (sv >> 3); + } + + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; +} + +static int read_emulated(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned long addr, void *dest, unsigned size) +{ + int rc; + struct read_cache *mc = &ctxt->decode.mem_read; + + while (size) { + int n = min(size, 8u); + size -= n; + if (mc->pos < mc->end) + goto read_cached; + + rc = ops->read_emulated(addr, mc->data + mc->end, n, + &ctxt->exception, ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + return rc; + mc->end += n; + + read_cached: + memcpy(dest, mc->data + mc->pos, n); + mc->pos += n; + dest += n; + addr += n; + } + return X86EMUL_CONTINUE; +} + +static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned int size, unsigned short port, + void *dest) +{ + struct read_cache *rc = &ctxt->decode.io_read; + + if (rc->pos == rc->end) { /* refill pio read ahead */ + struct decode_cache *c = &ctxt->decode; + unsigned int in_page, n; + unsigned int count = c->rep_prefix ? + address_mask(c, c->regs[VCPU_REGS_RCX]) : 1; + in_page = (ctxt->eflags & EFLG_DF) ? + offset_in_page(c->regs[VCPU_REGS_RDI]) : + PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]); + n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, + count); + if (n == 0) + n = 1; + rc->pos = rc->end = 0; + if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu)) + return 0; + rc->end = n * size; + } + + memcpy(dest, rc->data + rc->pos, size); + rc->pos += size; + return 1; +} + +static u32 desc_limit_scaled(struct kvm_desc_struct *desc) +{ + u32 limit = kvm_get_desc_limit(desc); + + return desc->g ? (limit << 12) | 0xfff : limit; +} + +static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 selector, struct kvm_desc_ptr *dt) +{ + if (selector & 1 << 2) { + struct kvm_desc_struct desc; + memset (dt, 0, sizeof *dt); + if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu)) + return; + + dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ + dt->address = kvm_get_desc_base(&desc); + } else + ops->get_gdt(dt, ctxt->vcpu); +} + +/* allowed just for 8 bytes segments */ +static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 selector, struct kvm_desc_struct *desc) +{ + struct kvm_desc_ptr dt; + u16 index = selector >> 3; + int ret; + ulong addr; + + get_descriptor_table_ptr(ctxt, ops, selector, &dt); + + if (dt.size < index * 8 + 7) + return emulate_gp(ctxt, selector & 0xfffc); + addr = dt.address + index * 8; + ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, + &ctxt->exception); + + return ret; +} + +/* allowed just for 8 bytes segments */ +static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 selector, struct kvm_desc_struct *desc) +{ + struct kvm_desc_ptr dt; + u16 index = selector >> 3; + ulong addr; + int ret; + + get_descriptor_table_ptr(ctxt, ops, selector, &dt); + + if (dt.size < index * 8 + 7) + return emulate_gp(ctxt, selector & 0xfffc); + + addr = dt.address + index * 8; + ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, + &ctxt->exception); + + return ret; +} + +static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 selector, int seg) +{ + struct kvm_desc_struct seg_desc; + u8 dpl, rpl, cpl; + unsigned err_vec = GP_VECTOR; + u32 err_code = 0; + bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ + int ret; + + memset(&seg_desc, 0, sizeof seg_desc); + + if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) + || ctxt->mode == X86EMUL_MODE_REAL) { + /* set real mode segment descriptor */ + kvm_set_desc_base(&seg_desc, selector << 4); + kvm_set_desc_limit(&seg_desc, 0xffff); + seg_desc.type = 3; + seg_desc.p = 1; + seg_desc.s = 1; + goto load; + } + + /* NULL selector is not valid for TR, CS and SS */ + if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) + && null_selector) + goto exception; + + /* TR should be in GDT only */ + if (seg == VCPU_SREG_TR && (selector & (1 << 2))) + goto exception; + + if (null_selector) /* for NULL selector skip all following checks */ + goto load; + + ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + + err_code = selector & 0xfffc; + err_vec = GP_VECTOR; + + /* can't load system descriptor into segment selecor */ + if (seg <= VCPU_SREG_GS && !seg_desc.s) + goto exception; + + if (!seg_desc.p) { + err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; + goto exception; + } + + rpl = selector & 3; + dpl = seg_desc.dpl; + cpl = ops->cpl(ctxt->vcpu); + + switch (seg) { + case VCPU_SREG_SS: + /* + * segment is not a writable data segment or segment + * selector's RPL != CPL or segment selector's RPL != CPL + */ + if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl) + goto exception; + break; + case VCPU_SREG_CS: + if (!(seg_desc.type & 8)) + goto exception; + + if (seg_desc.type & 4) { + /* conforming */ + if (dpl > cpl) + goto exception; + } else { + /* nonconforming */ + if (rpl > cpl || dpl != cpl) + goto exception; + } + /* CS(RPL) <- CPL */ + selector = (selector & 0xfffc) | cpl; + break; + case VCPU_SREG_TR: + if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) + goto exception; + break; + case VCPU_SREG_LDTR: + if (seg_desc.s || seg_desc.type != 2) + goto exception; + break; + default: /* DS, ES, FS, or GS */ + /* + * segment is not a data or readable code segment or + * ((segment is a data or nonconforming code segment) + * and (both RPL and CPL > DPL)) + */ + if ((seg_desc.type & 0xa) == 0x8 || + (((seg_desc.type & 0xc) != 0xc) && + (rpl > dpl && cpl > dpl))) + goto exception; + break; + } + + if (seg_desc.s) { + /* mark segment as accessed */ + seg_desc.type |= 1; + ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + } +load: + ops->set_segment_selector(selector, seg, ctxt->vcpu); + ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); + return X86EMUL_CONTINUE; +exception: + emulate_exception(ctxt, err_vec, err_code, true); + return X86EMUL_PROPAGATE_FAULT; +} + +static void write_register_operand(struct operand *op) +{ + /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ + switch (op->bytes) { + case 1: + *(u8 *)op->addr.reg = (u8)op->val; + break; + case 2: + *(u16 *)op->addr.reg = (u16)op->val; + break; + case 4: + *op->addr.reg = (u32)op->val; + break; /* 64b: zero-extend */ + case 8: + *op->addr.reg = op->val; + break; + } +} + +static inline int writeback(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + int rc; + struct decode_cache *c = &ctxt->decode; + + switch (c->dst.type) { + case OP_REG: + write_register_operand(&c->dst); + break; + case OP_MEM: + if (c->lock_prefix) + rc = ops->cmpxchg_emulated( + linear(ctxt, c->dst.addr.mem), + &c->dst.orig_val, + &c->dst.val, + c->dst.bytes, + &ctxt->exception, + ctxt->vcpu); + else + rc = ops->write_emulated( + linear(ctxt, c->dst.addr.mem), + &c->dst.val, + c->dst.bytes, + &ctxt->exception, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + return rc; + break; + case OP_NONE: + /* no writeback */ + break; + default: + break; + } + return X86EMUL_CONTINUE; +} + +static inline void emulate_push(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.type = OP_MEM; + c->dst.bytes = c->op_bytes; + c->dst.val = c->src.val; + register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); + c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]); + c->dst.addr.mem.seg = VCPU_SREG_SS; +} + +static int emulate_pop(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *dest, int len) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + struct segmented_address addr; + + addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); + addr.seg = VCPU_SREG_SS; + rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len); + if (rc != X86EMUL_CONTINUE) + return rc; + + register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); + return rc; +} + +static int emulate_popf(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *dest, int len) +{ + int rc; + unsigned long val, change_mask; + int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + int cpl = ops->cpl(ctxt->vcpu); + + rc = emulate_pop(ctxt, ops, &val, len); + if (rc != X86EMUL_CONTINUE) + return rc; + + change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF + | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID; + + switch(ctxt->mode) { + case X86EMUL_MODE_PROT64: + case X86EMUL_MODE_PROT32: + case X86EMUL_MODE_PROT16: + if (cpl == 0) + change_mask |= EFLG_IOPL; + if (cpl <= iopl) + change_mask |= EFLG_IF; + break; + case X86EMUL_MODE_VM86: + if (iopl < 3) + return emulate_gp(ctxt, 0); + change_mask |= EFLG_IF; + break; + default: /* real mode */ + change_mask |= (EFLG_IOPL | EFLG_IF); + break; + } + + *(unsigned long *)dest = + (ctxt->eflags & ~change_mask) | (val & change_mask); + + return rc; +} + +static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) +{ + struct decode_cache *c = &ctxt->decode; + + c->src.val = ops->get_segment_selector(seg, ctxt->vcpu); + + emulate_push(ctxt, ops); +} + +static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) +{ + struct decode_cache *c = &ctxt->decode; + unsigned long selector; + int rc; + + rc = emulate_pop(ctxt, ops, &selector, c->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg); + return rc; +} + +static int emulate_pusha(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + unsigned long old_esp = c->regs[VCPU_REGS_RSP]; + int rc = X86EMUL_CONTINUE; + int reg = VCPU_REGS_RAX; + + while (reg <= VCPU_REGS_RDI) { + (reg == VCPU_REGS_RSP) ? + (c->src.val = old_esp) : (c->src.val = c->regs[reg]); + + emulate_push(ctxt, ops); + + rc = writeback(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + ++reg; + } + + /* Disable writeback. */ + c->dst.type = OP_NONE; + + return rc; +} + +static int emulate_popa(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc = X86EMUL_CONTINUE; + int reg = VCPU_REGS_RDI; + + while (reg >= VCPU_REGS_RAX) { + if (reg == VCPU_REGS_RSP) { + register_address_increment(c, &c->regs[VCPU_REGS_RSP], + c->op_bytes); + --reg; + } + + rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes); + if (rc != X86EMUL_CONTINUE) + break; + --reg; + } + return rc; +} + +int emulate_int_real(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int irq) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + struct kvm_desc_ptr dt; + gva_t cs_addr; + gva_t eip_addr; + u16 cs, eip; + + /* TODO: Add limit checks */ + c->src.val = ctxt->eflags; + emulate_push(ctxt, ops); + rc = writeback(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); + + c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); + emulate_push(ctxt, ops); + rc = writeback(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->src.val = c->eip; + emulate_push(ctxt, ops); + rc = writeback(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->dst.type = OP_NONE; + + ops->get_idt(&dt, ctxt->vcpu); + + eip_addr = dt.address + (irq << 2); + cs_addr = dt.address + (irq << 2) + 2; + + rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception); + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception); + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->eip = eip; + + return rc; +} + +static int emulate_int(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int irq) +{ + switch(ctxt->mode) { + case X86EMUL_MODE_REAL: + return emulate_int_real(ctxt, ops, irq); + case X86EMUL_MODE_VM86: + case X86EMUL_MODE_PROT16: + case X86EMUL_MODE_PROT32: + case X86EMUL_MODE_PROT64: + default: + /* Protected mode interrupts unimplemented yet */ + return X86EMUL_UNHANDLEABLE; + } +} + +static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc = X86EMUL_CONTINUE; + unsigned long temp_eip = 0; + unsigned long temp_eflags = 0; + unsigned long cs = 0; + unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF | + EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF | + EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */ + unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP; + + /* TODO: Add stack limit check */ + + rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes); + + if (rc != X86EMUL_CONTINUE) + return rc; + + if (temp_eip & ~0xffff) + return emulate_gp(ctxt, 0); + + rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); + + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes); + + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); + + if (rc != X86EMUL_CONTINUE) + return rc; + + c->eip = temp_eip; + + + if (c->op_bytes == 4) + ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); + else if (c->op_bytes == 2) { + ctxt->eflags &= ~0xffff; + ctxt->eflags |= temp_eflags; + } + + ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ + ctxt->eflags |= EFLG_RESERVED_ONE_MASK; + + return rc; +} + +static inline int emulate_iret(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops* ops) +{ + switch(ctxt->mode) { + case X86EMUL_MODE_REAL: + return emulate_iret_real(ctxt, ops); + case X86EMUL_MODE_VM86: + case X86EMUL_MODE_PROT16: + case X86EMUL_MODE_PROT32: + case X86EMUL_MODE_PROT64: + default: + /* iret from protected mode unimplemented yet */ + return X86EMUL_UNHANDLEABLE; + } +} + +static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + + return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); +} + +static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + switch (c->modrm_reg) { + case 0: /* rol */ + emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags); + break; + case 1: /* ror */ + emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags); + break; + case 2: /* rcl */ + emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags); + break; + case 3: /* rcr */ + emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags); + break; + case 4: /* sal/shl */ + case 6: /* sal/shl */ + emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags); + break; + case 5: /* shr */ + emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags); + break; + case 7: /* sar */ + emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); + break; + } +} + +static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + unsigned long *rax = &c->regs[VCPU_REGS_RAX]; + unsigned long *rdx = &c->regs[VCPU_REGS_RDX]; + u8 de = 0; + + switch (c->modrm_reg) { + case 0 ... 1: /* test */ + emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); + break; + case 2: /* not */ + c->dst.val = ~c->dst.val; + break; + case 3: /* neg */ + emulate_1op("neg", c->dst, ctxt->eflags); + break; + case 4: /* mul */ + emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags); + break; + case 5: /* imul */ + emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags); + break; + case 6: /* div */ + emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx, + ctxt->eflags, de); + break; + case 7: /* idiv */ + emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx, + ctxt->eflags, de); + break; + default: + return X86EMUL_UNHANDLEABLE; + } + if (de) + return emulate_de(ctxt); + return X86EMUL_CONTINUE; +} + +static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + + switch (c->modrm_reg) { + case 0: /* inc */ + emulate_1op("inc", c->dst, ctxt->eflags); + break; + case 1: /* dec */ + emulate_1op("dec", c->dst, ctxt->eflags); + break; + case 2: /* call near abs */ { + long int old_eip; + old_eip = c->eip; + c->eip = c->src.val; + c->src.val = old_eip; + emulate_push(ctxt, ops); + break; + } + case 4: /* jmp abs */ + c->eip = c->src.val; + break; + case 6: /* push */ + emulate_push(ctxt, ops); + break; + } + return X86EMUL_CONTINUE; +} + +static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + u64 old = c->dst.orig_val64; + + if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || + ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { + c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); + c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); + ctxt->eflags &= ~EFLG_ZF; + } else { + c->dst.val64 = ((u64)c->regs[VCPU_REGS_RCX] << 32) | + (u32) c->regs[VCPU_REGS_RBX]; + + ctxt->eflags |= EFLG_ZF; + } + return X86EMUL_CONTINUE; +} + +static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + unsigned long cs; + + rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + if (c->op_bytes == 4) + c->eip = (u32)c->eip; + rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); + return rc; +} + +static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) +{ + struct decode_cache *c = &ctxt->decode; + unsigned short sel; + int rc; + + memcpy(&sel, c->src.valptr + c->op_bytes, 2); + + rc = load_segment_descriptor(ctxt, ops, sel, seg); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->dst.val = c->src.val; + return rc; +} + +static inline void +setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, struct kvm_desc_struct *cs, + struct kvm_desc_struct *ss) +{ + memset(cs, 0, sizeof(struct kvm_desc_struct)); + ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu); + memset(ss, 0, sizeof(struct kvm_desc_struct)); + + cs->l = 0; /* will be adjusted later */ + kvm_set_desc_base(cs, 0); /* flat segment */ + cs->g = 1; /* 4kb granularity */ + kvm_set_desc_limit(cs, 0xfffff); /* 4GB limit */ + cs->type = 0x0b; /* Read, Execute, Accessed */ + cs->s = 1; + cs->dpl = 0; /* will be adjusted later */ + cs->p = 1; + cs->d = 1; + + kvm_set_desc_base(ss, 0); /* flat segment */ + kvm_set_desc_limit(ss, 0xfffff); /* 4GB limit */ + ss->g = 1; /* 4kb granularity */ + ss->s = 1; + ss->type = 0x03; /* Read/Write, Accessed */ + ss->d = 1; /* 32bit stack segment */ + ss->dpl = 0; + ss->p = 1; +} + +static int +emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + struct kvm_desc_struct cs, ss; + u64 msr_data; + u16 cs_sel, ss_sel; + + /* syscall is not available in real mode */ + if (ctxt->mode == X86EMUL_MODE_REAL || + ctxt->mode == X86EMUL_MODE_VM86) + return emulate_ud(ctxt); + + setup_syscalls_segments(ctxt, ops, &cs, &ss); + + ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + msr_data >>= 32; + cs_sel = (u16)(msr_data & 0xfffc); + ss_sel = (u16)(msr_data + 8); + + if (is_long_mode(ctxt->vcpu)) { + cs.d = 0; + cs.l = 1; + } + ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); + ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); + + c->regs[VCPU_REGS_RCX] = c->eip; + if (is_long_mode(ctxt->vcpu)) { +#ifdef CONFIG_X86_64 + c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; + + ops->get_msr(ctxt->vcpu, + ctxt->mode == X86EMUL_MODE_PROT64 ? + MSR_LSTAR : MSR_CSTAR, &msr_data); + c->eip = msr_data; + + ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); + ctxt->eflags &= ~(msr_data | EFLG_RF); +#endif + } else { + /* legacy mode */ + ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + c->eip = (u32)msr_data; + + ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); + } + + return X86EMUL_CONTINUE; +} + +static int +emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + struct kvm_desc_struct cs, ss; + u64 msr_data; + u16 cs_sel, ss_sel; + + /* inject #GP if in real mode */ + if (ctxt->mode == X86EMUL_MODE_REAL) + return emulate_gp(ctxt, 0); + + /* XXX sysenter/sysexit have not been tested in 64bit mode. + * Therefore, we inject an #UD. + */ + if (ctxt->mode == X86EMUL_MODE_PROT64) + return emulate_ud(ctxt); + + setup_syscalls_segments(ctxt, ops, &cs, &ss); + + ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + switch (ctxt->mode) { + case X86EMUL_MODE_PROT32: + if ((msr_data & 0xfffc) == 0x0) + return emulate_gp(ctxt, 0); + break; + case X86EMUL_MODE_PROT64: + if (msr_data == 0x0) + return emulate_gp(ctxt, 0); + break; + } + + ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); + cs_sel = (u16)msr_data; + cs_sel &= ~SELECTOR_RPL_MASK; + ss_sel = cs_sel + 8; + ss_sel &= ~SELECTOR_RPL_MASK; + if (ctxt->mode == X86EMUL_MODE_PROT64 + || is_long_mode(ctxt->vcpu)) { + cs.d = 0; + cs.l = 1; + } + + ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); + ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); + + ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); + c->eip = msr_data; + + ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); + c->regs[VCPU_REGS_RSP] = msr_data; + + return X86EMUL_CONTINUE; +} + +static int +emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + struct kvm_desc_struct cs, ss; + u64 msr_data; + int usermode; + u16 cs_sel, ss_sel; + + /* inject #GP if in real mode or Virtual 8086 mode */ + if (ctxt->mode == X86EMUL_MODE_REAL || + ctxt->mode == X86EMUL_MODE_VM86) + return emulate_gp(ctxt, 0); + + setup_syscalls_segments(ctxt, ops, &cs, &ss); + + if ((c->rex_prefix & 0x8) != 0x0) + usermode = X86EMUL_MODE_PROT64; + else + usermode = X86EMUL_MODE_PROT32; + + cs.dpl = 3; + ss.dpl = 3; + ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + switch (usermode) { + case X86EMUL_MODE_PROT32: + cs_sel = (u16)(msr_data + 16); + if ((msr_data & 0xfffc) == 0x0) + return emulate_gp(ctxt, 0); + ss_sel = (u16)(msr_data + 24); + break; + case X86EMUL_MODE_PROT64: + cs_sel = (u16)(msr_data + 32); + if (msr_data == 0x0) + return emulate_gp(ctxt, 0); + ss_sel = cs_sel + 8; + cs.d = 0; + cs.l = 1; + break; + } + cs_sel |= SELECTOR_RPL_MASK; + ss_sel |= SELECTOR_RPL_MASK; + + ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); + ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); + + c->eip = c->regs[VCPU_REGS_RDX]; + c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; + + return X86EMUL_CONTINUE; +} + +static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + int iopl; + if (ctxt->mode == X86EMUL_MODE_REAL) + return false; + if (ctxt->mode == X86EMUL_MODE_VM86) + return true; + iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + return ops->cpl(ctxt->vcpu) > iopl; +} + +static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 port, u16 len) +{ + struct kvm_desc_struct tr_seg; + int r; + u16 io_bitmap_ptr; + u8 perm, bit_idx = port & 0x7; + unsigned mask = (1 << len) - 1; + + ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu); + if (!tr_seg.p) + return false; + if (desc_limit_scaled(&tr_seg) < 103) + return false; + r = ops->read_std(kvm_get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2, + ctxt->vcpu, NULL); + if (r != X86EMUL_CONTINUE) + return false; + if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) + return false; + r = ops->read_std(kvm_get_desc_base(&tr_seg) + io_bitmap_ptr + port/8, + &perm, 1, ctxt->vcpu, NULL); + if (r != X86EMUL_CONTINUE) + return false; + if ((perm >> bit_idx) & mask) + return false; + return true; +} + +static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 port, u16 len) +{ + if (ctxt->perm_ok) + return true; + + if (emulator_bad_iopl(ctxt, ops)) + if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) + return false; + + ctxt->perm_ok = true; + + return true; +} + +static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + struct tss_segment_16 *tss) +{ + struct decode_cache *c = &ctxt->decode; + + tss->ip = c->eip; + tss->flag = ctxt->eflags; + tss->ax = c->regs[VCPU_REGS_RAX]; + tss->cx = c->regs[VCPU_REGS_RCX]; + tss->dx = c->regs[VCPU_REGS_RDX]; + tss->bx = c->regs[VCPU_REGS_RBX]; + tss->sp = c->regs[VCPU_REGS_RSP]; + tss->bp = c->regs[VCPU_REGS_RBP]; + tss->si = c->regs[VCPU_REGS_RSI]; + tss->di = c->regs[VCPU_REGS_RDI]; + + tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); + tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); + tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); + tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); + tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); +} + +static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + struct tss_segment_16 *tss) +{ + struct decode_cache *c = &ctxt->decode; + int ret; + + c->eip = tss->ip; + ctxt->eflags = tss->flag | 2; + c->regs[VCPU_REGS_RAX] = tss->ax; + c->regs[VCPU_REGS_RCX] = tss->cx; + c->regs[VCPU_REGS_RDX] = tss->dx; + c->regs[VCPU_REGS_RBX] = tss->bx; + c->regs[VCPU_REGS_RSP] = tss->sp; + c->regs[VCPU_REGS_RBP] = tss->bp; + c->regs[VCPU_REGS_RSI] = tss->si; + c->regs[VCPU_REGS_RDI] = tss->di; + + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu); + ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); + ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); + + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS); + if (ret != X86EMUL_CONTINUE) + return ret; + + return X86EMUL_CONTINUE; +} + +static int task_switch_16(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 tss_selector, u16 old_tss_sel, + ulong old_tss_base, struct kvm_desc_struct *new_desc) +{ + struct tss_segment_16 tss_seg; + int ret; + u32 new_tss_base = kvm_get_desc_base(new_desc); + + ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + &ctxt->exception); + if (ret != X86EMUL_CONTINUE) + /* FIXME: need to provide precise fault address */ + return ret; + + save_state_to_tss16(ctxt, ops, &tss_seg); + + ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + &ctxt->exception); + if (ret != X86EMUL_CONTINUE) + /* FIXME: need to provide precise fault address */ + return ret; + + ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + &ctxt->exception); + if (ret != X86EMUL_CONTINUE) + /* FIXME: need to provide precise fault address */ + return ret; + + if (old_tss_sel != 0xffff) { + tss_seg.prev_task_link = old_tss_sel; + + ret = ops->write_std(new_tss_base, + &tss_seg.prev_task_link, + sizeof tss_seg.prev_task_link, + ctxt->vcpu, &ctxt->exception); + if (ret != X86EMUL_CONTINUE) + /* FIXME: need to provide precise fault address */ + return ret; + } + + return load_state_from_tss16(ctxt, ops, &tss_seg); +} + +static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + struct tss_segment_32 *tss) +{ + struct decode_cache *c = &ctxt->decode; + + tss->cr3 = ops->get_cr(3, ctxt->vcpu); + tss->eip = c->eip; + tss->eflags = ctxt->eflags; + tss->eax = c->regs[VCPU_REGS_RAX]; + tss->ecx = c->regs[VCPU_REGS_RCX]; + tss->edx = c->regs[VCPU_REGS_RDX]; + tss->ebx = c->regs[VCPU_REGS_RBX]; + tss->esp = c->regs[VCPU_REGS_RSP]; + tss->ebp = c->regs[VCPU_REGS_RBP]; + tss->esi = c->regs[VCPU_REGS_RSI]; + tss->edi = c->regs[VCPU_REGS_RDI]; + + tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); + tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); + tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); + tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); + tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu); + tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu); + tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); +} + +static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + struct tss_segment_32 *tss) +{ + struct decode_cache *c = &ctxt->decode; + int ret; + + if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) + return emulate_gp(ctxt, 0); + c->eip = tss->eip; + ctxt->eflags = tss->eflags | 2; + c->regs[VCPU_REGS_RAX] = tss->eax; + c->regs[VCPU_REGS_RCX] = tss->ecx; + c->regs[VCPU_REGS_RDX] = tss->edx; + c->regs[VCPU_REGS_RBX] = tss->ebx; + c->regs[VCPU_REGS_RSP] = tss->esp; + c->regs[VCPU_REGS_RBP] = tss->ebp; + c->regs[VCPU_REGS_RSI] = tss->esi; + c->regs[VCPU_REGS_RDI] = tss->edi; + + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu); + ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); + ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); + ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu); + ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu); + + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS); + if (ret != X86EMUL_CONTINUE) + return ret; + + return X86EMUL_CONTINUE; +} + +static int task_switch_32(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 tss_selector, u16 old_tss_sel, + ulong old_tss_base, struct kvm_desc_struct *new_desc) +{ + struct tss_segment_32 tss_seg; + int ret; + u32 new_tss_base = kvm_get_desc_base(new_desc); + + ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + &ctxt->exception); + if (ret != X86EMUL_CONTINUE) + /* FIXME: need to provide precise fault address */ + return ret; + + save_state_to_tss32(ctxt, ops, &tss_seg); + + ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + &ctxt->exception); + if (ret != X86EMUL_CONTINUE) + /* FIXME: need to provide precise fault address */ + return ret; + + ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + &ctxt->exception); + if (ret != X86EMUL_CONTINUE) + /* FIXME: need to provide precise fault address */ + return ret; + + if (old_tss_sel != 0xffff) { + tss_seg.prev_task_link = old_tss_sel; + + ret = ops->write_std(new_tss_base, + &tss_seg.prev_task_link, + sizeof tss_seg.prev_task_link, + ctxt->vcpu, &ctxt->exception); + if (ret != X86EMUL_CONTINUE) + /* FIXME: need to provide precise fault address */ + return ret; + } + + return load_state_from_tss32(ctxt, ops, &tss_seg); +} + +static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 tss_selector, int reason, + bool has_error_code, u32 error_code) +{ + struct kvm_desc_struct curr_tss_desc, next_tss_desc; + int ret; + u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); + ulong old_tss_base = + ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu); + u32 desc_limit; + + /* FIXME: old_tss_base == ~0 ? */ + + ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + + /* FIXME: check that next_tss_desc is tss */ + + if (reason != TASK_SWITCH_IRET) { + if ((tss_selector & 3) > next_tss_desc.dpl || + ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) + return emulate_gp(ctxt, 0); + } + + desc_limit = desc_limit_scaled(&next_tss_desc); + if (!next_tss_desc.p || + ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || + desc_limit < 0x2b)) { + emulate_ts(ctxt, tss_selector & 0xfffc); + return X86EMUL_PROPAGATE_FAULT; + } + + if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { + curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */ + write_segment_descriptor(ctxt, ops, old_tss_sel, + &curr_tss_desc); + } + + if (reason == TASK_SWITCH_IRET) + ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT; + + /* set back link to prev task only if NT bit is set in eflags + note that old_tss_sel is not used afetr this point */ + if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) + old_tss_sel = 0xffff; + + if (next_tss_desc.type & 8) + ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel, + old_tss_base, &next_tss_desc); + else + ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel, + old_tss_base, &next_tss_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + + if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) + ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT; + + if (reason != TASK_SWITCH_IRET) { + next_tss_desc.type |= (1 << 1); /* set busy flag */ + write_segment_descriptor(ctxt, ops, tss_selector, + &next_tss_desc); + } + + ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu); + ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu); + ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu); + + if (has_error_code) { + struct decode_cache *c = &ctxt->decode; + + c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; + c->lock_prefix = 0; + c->src.val = (unsigned long) error_code; + emulate_push(ctxt, ops); + } + + return ret; +} + +int emulator_task_switch(struct x86_emulate_ctxt *ctxt, + u16 tss_selector, int reason, + bool has_error_code, u32 error_code) +{ + struct x86_emulate_ops *ops = ctxt->ops; + struct decode_cache *c = &ctxt->decode; + int rc; + + c->eip = ctxt->eip; + c->dst.type = OP_NONE; + + rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, + has_error_code, error_code); + + if (rc == X86EMUL_CONTINUE) { + rc = writeback(ctxt, ops); + if (rc == X86EMUL_CONTINUE) + ctxt->eip = c->eip; + } + + return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; +} + +static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, + int reg, struct operand *op) +{ + struct decode_cache *c = &ctxt->decode; + int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; + + register_address_increment(c, &c->regs[reg], df * op->bytes); + op->addr.mem.ea = register_address(c, c->regs[reg]); + op->addr.mem.seg = seg; +} + +static int em_push(struct x86_emulate_ctxt *ctxt) +{ + emulate_push(ctxt, ctxt->ops); + return X86EMUL_CONTINUE; +} + +static int em_das(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + u8 al, old_al; + bool af, cf, old_cf; + + cf = ctxt->eflags & X86_EFLAGS_CF; + al = c->dst.val; + + old_al = al; + old_cf = cf; + cf = false; + af = ctxt->eflags & X86_EFLAGS_AF; + if ((al & 0x0f) > 9 || af) { + al -= 6; + cf = old_cf | (al >= 250); + af = true; + } else { + af = false; + } + if (old_al > 0x99 || old_cf) { + al -= 0x60; + cf = true; + } + + c->dst.val = al; + /* Set PF, ZF, SF */ + c->src.type = OP_IMM; + c->src.val = 0; + c->src.bytes = 1; + emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); + ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); + if (cf) + ctxt->eflags |= X86_EFLAGS_CF; + if (af) + ctxt->eflags |= X86_EFLAGS_AF; + return X86EMUL_CONTINUE; +} + +static int em_call_far(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + u16 sel, old_cs; + ulong old_eip; + int rc; + + old_cs = ctxt->ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); + old_eip = c->eip; + + memcpy(&sel, c->src.valptr + c->op_bytes, 2); + if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS)) + return X86EMUL_CONTINUE; + + c->eip = 0; + memcpy(&c->eip, c->src.valptr, c->op_bytes); + + c->src.val = old_cs; + emulate_push(ctxt, ctxt->ops); + rc = writeback(ctxt, ctxt->ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->src.val = old_eip; + emulate_push(ctxt, ctxt->ops); + rc = writeback(ctxt, ctxt->ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->dst.type = OP_NONE; + + return X86EMUL_CONTINUE; +} + +static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + + c->dst.type = OP_REG; + c->dst.addr.reg = &c->eip; + c->dst.bytes = c->op_bytes; + rc = emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val); + return X86EMUL_CONTINUE; +} + +static int em_imul(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_imul_3op(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.val = c->src2.val; + return em_imul(ctxt); +} + +static int em_cwd(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.type = OP_REG; + c->dst.bytes = c->src.bytes; + c->dst.addr.reg = &c->regs[VCPU_REGS_RDX]; + c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1); + + return X86EMUL_CONTINUE; +} + +static int em_rdtsc(struct x86_emulate_ctxt *ctxt) +{ + unsigned cpl = ctxt->ops->cpl(ctxt->vcpu); + struct decode_cache *c = &ctxt->decode; + u64 tsc = 0; + + if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) + return emulate_gp(ctxt, 0); + ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc); + c->regs[VCPU_REGS_RAX] = (u32)tsc; + c->regs[VCPU_REGS_RDX] = tsc >> 32; + return X86EMUL_CONTINUE; +} + +static int em_mov(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + c->dst.val = c->src.val; + return X86EMUL_CONTINUE; +} + +#define D(_y) { .flags = (_y) } +#define N D(0) +#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } +#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } +#define I(_f, _e) { .flags = (_f), .u.execute = (_e) } + +#define D2bv(_f) D((_f) | ByteOp), D(_f) +#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) + +#define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM), \ + D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \ + D2bv(((_f) & ~Lock) | DstAcc | SrcImm) + + +static struct opcode group1[] = { + X7(D(Lock)), N +}; + +static struct opcode group1A[] = { + D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, +}; + +static struct opcode group3[] = { + D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), + D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), + X4(D(SrcMem | ModRM)), +}; + +static struct opcode group4[] = { + D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock), + N, N, N, N, N, N, +}; + +static struct opcode group5[] = { + D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), + D(SrcMem | ModRM | Stack), + I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far), + D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps), + D(SrcMem | ModRM | Stack), N, +}; + +static struct group_dual group7 = { { + N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), + D(SrcNone | ModRM | DstMem | Mov), N, + D(SrcMem16 | ModRM | Mov | Priv), + D(SrcMem | ModRM | ByteOp | Priv | NoAccess), +}, { + D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), + D(SrcNone | ModRM | DstMem | Mov), N, + D(SrcMem16 | ModRM | Mov | Priv), N, +} }; + +static struct opcode group8[] = { + N, N, N, N, + D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock), + D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), +}; + +static struct group_dual group9 = { { + N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N, +}, { + N, N, N, N, N, N, N, N, +} }; + +static struct opcode group11[] = { + I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)), +}; + +static struct opcode opcode_table[256] = { + /* 0x00 - 0x07 */ + D6ALU(Lock), + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + /* 0x08 - 0x0F */ + D6ALU(Lock), + D(ImplicitOps | Stack | No64), N, + /* 0x10 - 0x17 */ + D6ALU(Lock), + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + /* 0x18 - 0x1F */ + D6ALU(Lock), + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + /* 0x20 - 0x27 */ + D6ALU(Lock), N, N, + /* 0x28 - 0x2F */ + D6ALU(Lock), N, I(ByteOp | DstAcc | No64, em_das), + /* 0x30 - 0x37 */ + D6ALU(Lock), N, N, + /* 0x38 - 0x3F */ + D6ALU(0), N, N, + /* 0x40 - 0x4F */ + X16(D(DstReg)), + /* 0x50 - 0x57 */ + X8(I(SrcReg | Stack, em_push)), + /* 0x58 - 0x5F */ + X8(D(DstReg | Stack)), + /* 0x60 - 0x67 */ + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , + N, N, N, N, + /* 0x68 - 0x6F */ + I(SrcImm | Mov | Stack, em_push), + I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), + I(SrcImmByte | Mov | Stack, em_push), + I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), + D2bv(DstDI | Mov | String), /* insb, insw/insd */ + D2bv(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ + /* 0x70 - 0x7F */ + X16(D(SrcImmByte)), + /* 0x80 - 0x87 */ + G(ByteOp | DstMem | SrcImm | ModRM | Group, group1), + G(DstMem | SrcImm | ModRM | Group, group1), + G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), + G(DstMem | SrcImmByte | ModRM | Group, group1), + D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock), + /* 0x88 - 0x8F */ + I2bv(DstMem | SrcReg | ModRM | Mov, em_mov), + I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), + D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg), + D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), + /* 0x90 - 0x97 */ + X8(D(SrcAcc | DstReg)), + /* 0x98 - 0x9F */ + D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), + I(SrcImmFAddr | No64, em_call_far), N, + D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, + /* 0xA0 - 0xA7 */ + I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), + I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov), + I2bv(SrcSI | DstDI | Mov | String, em_mov), + D2bv(SrcSI | DstDI | String), + /* 0xA8 - 0xAF */ + D2bv(DstAcc | SrcImm), + I2bv(SrcAcc | DstDI | Mov | String, em_mov), + I2bv(SrcSI | DstAcc | Mov | String, em_mov), + D2bv(SrcAcc | DstDI | String), + /* 0xB0 - 0xB7 */ + X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)), + /* 0xB8 - 0xBF */ + X8(I(DstReg | SrcImm | Mov, em_mov)), + /* 0xC0 - 0xC7 */ + D2bv(DstMem | SrcImmByte | ModRM), + I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), + D(ImplicitOps | Stack), + D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), + G(ByteOp, group11), G(0, group11), + /* 0xC8 - 0xCF */ + N, N, N, D(ImplicitOps | Stack), + D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), + /* 0xD0 - 0xD7 */ + D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), + N, N, N, N, + /* 0xD8 - 0xDF */ + N, N, N, N, N, N, N, N, + /* 0xE0 - 0xE7 */ + X4(D(SrcImmByte)), + D2bv(SrcImmUByte | DstAcc), D2bv(SrcAcc | DstImmUByte), + /* 0xE8 - 0xEF */ + D(SrcImm | Stack), D(SrcImm | ImplicitOps), + D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), + D2bv(SrcNone | DstAcc), D2bv(SrcAcc | ImplicitOps), + /* 0xF0 - 0xF7 */ + N, N, N, N, + D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), + /* 0xF8 - 0xFF */ + D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), + D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), +}; + +static struct opcode twobyte_table[256] = { + /* 0x00 - 0x0F */ + N, GD(0, &group7), N, N, + N, D(ImplicitOps), D(ImplicitOps | Priv), N, + D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, + N, D(ImplicitOps | ModRM), N, N, + /* 0x10 - 0x1F */ + N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, + /* 0x20 - 0x2F */ + D(ModRM | DstMem | Priv | Op3264), D(ModRM | DstMem | Priv | Op3264), + D(ModRM | SrcMem | Priv | Op3264), D(ModRM | SrcMem | Priv | Op3264), + N, N, N, N, + N, N, N, N, N, N, N, N, + /* 0x30 - 0x3F */ + D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc), + D(ImplicitOps | Priv), N, + D(ImplicitOps), D(ImplicitOps | Priv), N, N, + N, N, N, N, N, N, N, N, + /* 0x40 - 0x4F */ + X16(D(DstReg | SrcMem | ModRM | Mov)), + /* 0x50 - 0x5F */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0x60 - 0x6F */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0x70 - 0x7F */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0x80 - 0x8F */ + X16(D(SrcImm)), + /* 0x90 - 0x9F */ + X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), + /* 0xA0 - 0xA7 */ + D(ImplicitOps | Stack), D(ImplicitOps | Stack), + N, D(DstMem | SrcReg | ModRM | BitOp), + D(DstMem | SrcReg | Src2ImmByte | ModRM), + D(DstMem | SrcReg | Src2CL | ModRM), N, N, + /* 0xA8 - 0xAF */ + D(ImplicitOps | Stack), D(ImplicitOps | Stack), + N, D(DstMem | SrcReg | ModRM | BitOp | Lock), + D(DstMem | SrcReg | Src2ImmByte | ModRM), + D(DstMem | SrcReg | Src2CL | ModRM), + D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), + /* 0xB0 - 0xB7 */ + D2bv(DstMem | SrcReg | ModRM | Lock), + D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock), + D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM), + D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), + /* 0xB8 - 0xBF */ + N, N, + G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), + D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), + /* 0xC0 - 0xCF */ + D2bv(DstMem | SrcReg | ModRM | Lock), + N, D(DstMem | SrcReg | ModRM | Mov), + N, N, N, GD(0, &group9), + N, N, N, N, N, N, N, N, + /* 0xD0 - 0xDF */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0xE0 - 0xEF */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0xF0 - 0xFF */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N +}; + +#undef D +#undef N +#undef G +#undef GD +#undef I + +#undef D2bv +#undef I2bv +#undef D6ALU + +static unsigned imm_size(struct decode_cache *c) +{ + unsigned size; + + size = (c->d & ByteOp) ? 1 : c->op_bytes; + if (size == 8) + size = 4; + return size; +} + +static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, + unsigned size, bool sign_extension) +{ + struct decode_cache *c = &ctxt->decode; + struct x86_emulate_ops *ops = ctxt->ops; + int rc = X86EMUL_CONTINUE; + + op->type = OP_IMM; + op->bytes = size; + op->addr.mem.ea = c->eip; + /* NB. Immediates are sign-extended as necessary. */ + switch (op->bytes) { + case 1: + op->val = insn_fetch(s8, 1, c->eip); + break; + case 2: + op->val = insn_fetch(s16, 2, c->eip); + break; + case 4: + op->val = insn_fetch(s32, 4, c->eip); + break; + } + if (!sign_extension) { + switch (op->bytes) { + case 1: + op->val &= 0xff; + break; + case 2: + op->val &= 0xffff; + break; + case 4: + op->val &= 0xffffffff; + break; + } + } +done: + return rc; +} + +int +x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) +{ + struct x86_emulate_ops *ops = ctxt->ops; + struct decode_cache *c = &ctxt->decode; + int rc = X86EMUL_CONTINUE; + int mode = ctxt->mode; + int def_op_bytes, def_ad_bytes, dual, goffset; + struct opcode opcode, *g_mod012, *g_mod3; + struct operand memop = { .type = OP_NONE }; + + c->eip = ctxt->eip; + c->fetch.start = c->eip; + c->fetch.end = c->fetch.start + insn_len; + if (insn_len > 0) + memcpy(c->fetch.data, insn, insn_len); + ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); + + switch (mode) { + case X86EMUL_MODE_REAL: + case X86EMUL_MODE_VM86: + case X86EMUL_MODE_PROT16: + def_op_bytes = def_ad_bytes = 2; + break; + case X86EMUL_MODE_PROT32: + def_op_bytes = def_ad_bytes = 4; + break; +#ifdef CONFIG_X86_64 + case X86EMUL_MODE_PROT64: + def_op_bytes = 4; + def_ad_bytes = 8; + break; +#endif + default: + return -1; + } + + c->op_bytes = def_op_bytes; + c->ad_bytes = def_ad_bytes; + + /* Legacy prefixes. */ + for (;;) { + switch (c->b = insn_fetch(u8, 1, c->eip)) { + case 0x66: /* operand-size override */ + /* switch between 2/4 bytes */ + c->op_bytes = def_op_bytes ^ 6; + break; + case 0x67: /* address-size override */ + if (mode == X86EMUL_MODE_PROT64) + /* switch between 4/8 bytes */ + c->ad_bytes = def_ad_bytes ^ 12; + else + /* switch between 2/4 bytes */ + c->ad_bytes = def_ad_bytes ^ 6; + break; + case 0x26: /* ES override */ + case 0x2e: /* CS override */ + case 0x36: /* SS override */ + case 0x3e: /* DS override */ + set_seg_override(c, (c->b >> 3) & 3); + break; + case 0x64: /* FS override */ + case 0x65: /* GS override */ + set_seg_override(c, c->b & 7); + break; + case 0x40 ... 0x4f: /* REX */ + if (mode != X86EMUL_MODE_PROT64) + goto done_prefixes; + c->rex_prefix = c->b; + continue; + case 0xf0: /* LOCK */ + c->lock_prefix = 1; + break; + case 0xf2: /* REPNE/REPNZ */ + c->rep_prefix = REPNE_PREFIX; + break; + case 0xf3: /* REP/REPE/REPZ */ + c->rep_prefix = REPE_PREFIX; + break; + default: + goto done_prefixes; + } + + /* Any legacy prefix after a REX prefix nullifies its effect. */ + + c->rex_prefix = 0; + } + +done_prefixes: + + /* REX prefix. */ + if (c->rex_prefix & 8) + c->op_bytes = 8; /* REX.W */ + + /* Opcode byte(s). */ + opcode = opcode_table[c->b]; + /* Two-byte opcode? */ + if (c->b == 0x0f) { + c->twobyte = 1; + c->b = insn_fetch(u8, 1, c->eip); + opcode = twobyte_table[c->b]; + } + c->d = opcode.flags; + + if (c->d & Group) { + dual = c->d & GroupDual; + c->modrm = insn_fetch(u8, 1, c->eip); + --c->eip; + + if (c->d & GroupDual) { + g_mod012 = opcode.u.gdual->mod012; + g_mod3 = opcode.u.gdual->mod3; + } else + g_mod012 = g_mod3 = opcode.u.group; + + c->d &= ~(Group | GroupDual); + + goffset = (c->modrm >> 3) & 7; + + if ((c->modrm >> 6) == 3) + opcode = g_mod3[goffset]; + else + opcode = g_mod012[goffset]; + c->d |= opcode.flags; + } + + c->execute = opcode.u.execute; + + /* Unrecognised? */ + if (c->d == 0 || (c->d & Undefined)) + return -1; + + if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) + c->op_bytes = 8; + + if (c->d & Op3264) { + if (mode == X86EMUL_MODE_PROT64) + c->op_bytes = 8; + else + c->op_bytes = 4; + } + + /* ModRM and SIB bytes. */ + if (c->d & ModRM) { + rc = decode_modrm(ctxt, ops, &memop); + if (!c->has_seg_override) + set_seg_override(c, c->modrm_seg); + } else if (c->d & MemAbs) + rc = decode_abs(ctxt, ops, &memop); + if (rc != X86EMUL_CONTINUE) + goto done; + + if (!c->has_seg_override) + set_seg_override(c, VCPU_SREG_DS); + + memop.addr.mem.seg = seg_override(ctxt, ops, c); + + if (memop.type == OP_MEM && c->ad_bytes != 8) + memop.addr.mem.ea = (u32)memop.addr.mem.ea; + + if (memop.type == OP_MEM && c->rip_relative) + memop.addr.mem.ea += c->eip; + + /* + * Decode and fetch the source operand: register, memory + * or immediate. + */ + switch (c->d & SrcMask) { + case SrcNone: + break; + case SrcReg: + decode_register_operand(&c->src, c, 0); + break; + case SrcMem16: + memop.bytes = 2; + goto srcmem_common; + case SrcMem32: + memop.bytes = 4; + goto srcmem_common; + case SrcMem: + memop.bytes = (c->d & ByteOp) ? 1 : + c->op_bytes; + srcmem_common: + c->src = memop; + break; + case SrcImmU16: + rc = decode_imm(ctxt, &c->src, 2, false); + break; + case SrcImm: + rc = decode_imm(ctxt, &c->src, imm_size(c), true); + break; + case SrcImmU: + rc = decode_imm(ctxt, &c->src, imm_size(c), false); + break; + case SrcImmByte: + rc = decode_imm(ctxt, &c->src, 1, true); + break; + case SrcImmUByte: + rc = decode_imm(ctxt, &c->src, 1, false); + break; + case SrcAcc: + c->src.type = OP_REG; + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->src.addr.reg = &c->regs[VCPU_REGS_RAX]; + fetch_register_operand(&c->src); + break; + case SrcOne: + c->src.bytes = 1; + c->src.val = 1; + break; + case SrcSI: + c->src.type = OP_MEM; + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->src.addr.mem.ea = + register_address(c, c->regs[VCPU_REGS_RSI]); + c->src.addr.mem.seg = seg_override(ctxt, ops, c), + c->src.val = 0; + break; + case SrcImmFAddr: + c->src.type = OP_IMM; + c->src.addr.mem.ea = c->eip; + c->src.bytes = c->op_bytes + 2; + insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); + break; + case SrcMemFAddr: + memop.bytes = c->op_bytes + 2; + goto srcmem_common; + break; + } + + if (rc != X86EMUL_CONTINUE) + goto done; + + /* + * Decode and fetch the second source operand: register, memory + * or immediate. + */ + switch (c->d & Src2Mask) { + case Src2None: + break; + case Src2CL: + c->src2.bytes = 1; + c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; + break; + case Src2ImmByte: + rc = decode_imm(ctxt, &c->src2, 1, true); + break; + case Src2One: + c->src2.bytes = 1; + c->src2.val = 1; + break; + case Src2Imm: + rc = decode_imm(ctxt, &c->src2, imm_size(c), true); + break; + } + + if (rc != X86EMUL_CONTINUE) + goto done; + + /* Decode and fetch the destination operand: register or memory. */ + switch (c->d & DstMask) { + case DstReg: + decode_register_operand(&c->dst, c, + c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); + break; + case DstImmUByte: + c->dst.type = OP_IMM; + c->dst.addr.mem.ea = c->eip; + c->dst.bytes = 1; + c->dst.val = insn_fetch(u8, 1, c->eip); + break; + case DstMem: + case DstMem64: + c->dst = memop; + if ((c->d & DstMask) == DstMem64) + c->dst.bytes = 8; + else + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + if (c->d & BitOp) + fetch_bit_operand(c); + c->dst.orig_val = c->dst.val; + break; + case DstAcc: + c->dst.type = OP_REG; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.addr.reg = &c->regs[VCPU_REGS_RAX]; + fetch_register_operand(&c->dst); + c->dst.orig_val = c->dst.val; + break; + case DstDI: + c->dst.type = OP_MEM; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.addr.mem.ea = + register_address(c, c->regs[VCPU_REGS_RDI]); + c->dst.addr.mem.seg = VCPU_SREG_ES; + c->dst.val = 0; + break; + case ImplicitOps: + /* Special instructions do their own operand decoding. */ + default: + c->dst.type = OP_NONE; /* Disable writeback. */ + return 0; + } + +done: + return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; +} + +static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + /* The second termination condition only applies for REPE + * and REPNE. Test if the repeat string operation prefix is + * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the + * corresponding termination condition according to: + * - if REPE/REPZ and ZF = 0 then done + * - if REPNE/REPNZ and ZF = 1 then done + */ + if (((c->b == 0xa6) || (c->b == 0xa7) || + (c->b == 0xae) || (c->b == 0xaf)) + && (((c->rep_prefix == REPE_PREFIX) && + ((ctxt->eflags & EFLG_ZF) == 0)) + || ((c->rep_prefix == REPNE_PREFIX) && + ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) + return true; + + return false; +} + +int +x86_emulate_insn(struct x86_emulate_ctxt *ctxt) +{ + struct x86_emulate_ops *ops = ctxt->ops; + u64 msr_data; + struct decode_cache *c = &ctxt->decode; + int rc = X86EMUL_CONTINUE; + int saved_dst_type = c->dst.type; + int irq; /* Used for int 3, int, and into */ + + ctxt->decode.mem_read.pos = 0; + + if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { + rc = emulate_ud(ctxt); + goto done; + } + + /* LOCK prefix is allowed only with some instructions */ + if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { + rc = emulate_ud(ctxt); + goto done; + } + + if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) { + rc = emulate_ud(ctxt); + goto done; + } + + /* Privileged instruction can be executed only in CPL=0 */ + if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { + rc = emulate_gp(ctxt, 0); + goto done; + } + + if (c->rep_prefix && (c->d & String)) { + /* All REP prefixes have the same first termination condition */ + if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { + ctxt->eip = c->eip; + goto done; + } + } + + if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { + rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem), + c->src.valptr, c->src.bytes); + if (rc != X86EMUL_CONTINUE) + goto done; + c->src.orig_val64 = c->src.val64; + } + + if (c->src2.type == OP_MEM) { + rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem), + &c->src2.val, c->src2.bytes); + if (rc != X86EMUL_CONTINUE) + goto done; + } + + if ((c->d & DstMask) == ImplicitOps) + goto special_insn; + + + if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { + /* optimisation - avoid slow emulated read if Mov */ + rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem), + &c->dst.val, c->dst.bytes); + if (rc != X86EMUL_CONTINUE) + goto done; + } + c->dst.orig_val = c->dst.val; + +special_insn: + + if (c->execute) { + rc = c->execute(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; + goto writeback; + } + + if (c->twobyte) + goto twobyte_insn; + + switch (c->b) { + case 0x00 ... 0x05: + add: /* add */ + emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); + break; + case 0x06: /* push es */ + emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); + break; + case 0x07: /* pop es */ + rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); + break; + case 0x08 ... 0x0d: + or: /* or */ + emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); + break; + case 0x0e: /* push cs */ + emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); + break; + case 0x10 ... 0x15: + adc: /* adc */ + emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); + break; + case 0x16: /* push ss */ + emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); + break; + case 0x17: /* pop ss */ + rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); + break; + case 0x18 ... 0x1d: + sbb: /* sbb */ + emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); + break; + case 0x1e: /* push ds */ + emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); + break; + case 0x1f: /* pop ds */ + rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); + break; + case 0x20 ... 0x25: + and: /* and */ + emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); + break; + case 0x28 ... 0x2d: + sub: /* sub */ + emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); + break; + case 0x30 ... 0x35: + xor: /* xor */ + emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); + break; + case 0x38 ... 0x3d: + cmp: /* cmp */ + emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); + break; + case 0x40 ... 0x47: /* inc r16/r32 */ + emulate_1op("inc", c->dst, ctxt->eflags); + break; + case 0x48 ... 0x4f: /* dec r16/r32 */ + emulate_1op("dec", c->dst, ctxt->eflags); + break; + case 0x58 ... 0x5f: /* pop reg */ + pop_instruction: + rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); + break; + case 0x60: /* pusha */ + rc = emulate_pusha(ctxt, ops); + break; + case 0x61: /* popa */ + rc = emulate_popa(ctxt, ops); + break; + case 0x63: /* movsxd */ + if (ctxt->mode != X86EMUL_MODE_PROT64) + goto cannot_emulate; + c->dst.val = (s32) c->src.val; + break; + case 0x6c: /* insb */ + case 0x6d: /* insw/insd */ + c->src.val = c->regs[VCPU_REGS_RDX]; + goto do_io_in; + case 0x6e: /* outsb */ + case 0x6f: /* outsw/outsd */ + c->dst.val = c->regs[VCPU_REGS_RDX]; + goto do_io_out; + break; + case 0x70 ... 0x7f: /* jcc (short) */ + if (test_cc(c->b, ctxt->eflags)) + jmp_rel(c, c->src.val); + break; + case 0x80 ... 0x83: /* Grp1 */ + switch (c->modrm_reg) { + case 0: + goto add; + case 1: + goto or; + case 2: + goto adc; + case 3: + goto sbb; + case 4: + goto and; + case 5: + goto sub; + case 6: + goto xor; + case 7: + goto cmp; + } + break; + case 0x84 ... 0x85: + test: + emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); + break; + case 0x86 ... 0x87: /* xchg */ + xchg: + /* Write back the register source. */ + c->src.val = c->dst.val; + write_register_operand(&c->src); + /* + * Write back the memory destination with implicit LOCK + * prefix. + */ + c->dst.val = c->src.orig_val; + c->lock_prefix = 1; + break; + case 0x8c: /* mov r/m, sreg */ + if (c->modrm_reg > VCPU_SREG_GS) { + rc = emulate_ud(ctxt); + goto done; + } + c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); + break; + case 0x8d: /* lea r16/r32, m */ + c->dst.val = c->src.addr.mem.ea; + break; + case 0x8e: { /* mov seg, r/m16 */ + uint16_t sel; + + sel = c->src.val; + + if (c->modrm_reg == VCPU_SREG_CS || + c->modrm_reg > VCPU_SREG_GS) { + rc = emulate_ud(ctxt); + goto done; + } + + if (c->modrm_reg == VCPU_SREG_SS) + ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; + + rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg); + + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + } + case 0x8f: /* pop (sole member of Grp1a) */ + rc = emulate_grp1a(ctxt, ops); + break; + case 0x90 ... 0x97: /* nop / xchg reg, rax */ + if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) + break; + goto xchg; + case 0x98: /* cbw/cwde/cdqe */ + switch (c->op_bytes) { + case 2: c->dst.val = (s8)c->dst.val; break; + case 4: c->dst.val = (s16)c->dst.val; break; + case 8: c->dst.val = (s32)c->dst.val; break; + } + break; + case 0x9c: /* pushf */ + c->src.val = (unsigned long) ctxt->eflags; + emulate_push(ctxt, ops); + break; + case 0x9d: /* popf */ + c->dst.type = OP_REG; + c->dst.addr.reg = &ctxt->eflags; + c->dst.bytes = c->op_bytes; + rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); + break; + case 0xa6 ... 0xa7: /* cmps */ + c->dst.type = OP_NONE; /* Disable writeback. */ + goto cmp; + case 0xa8 ... 0xa9: /* test ax, imm */ + goto test; + case 0xae ... 0xaf: /* scas */ + goto cmp; + case 0xc0 ... 0xc1: + emulate_grp2(ctxt); + break; + case 0xc3: /* ret */ + c->dst.type = OP_REG; + c->dst.addr.reg = &c->eip; + c->dst.bytes = c->op_bytes; + goto pop_instruction; + case 0xc4: /* les */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES); + break; + case 0xc5: /* lds */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS); + break; + case 0xcb: /* ret far */ + rc = emulate_ret_far(ctxt, ops); + break; + case 0xcc: /* int3 */ + irq = 3; + goto do_interrupt; + case 0xcd: /* int n */ + irq = c->src.val; + do_interrupt: + rc = emulate_int(ctxt, ops, irq); + break; + case 0xce: /* into */ + if (ctxt->eflags & EFLG_OF) { + irq = 4; + goto do_interrupt; + } + break; + case 0xcf: /* iret */ + rc = emulate_iret(ctxt, ops); + break; + case 0xd0 ... 0xd1: /* Grp2 */ + emulate_grp2(ctxt); + break; + case 0xd2 ... 0xd3: /* Grp2 */ + c->src.val = c->regs[VCPU_REGS_RCX]; + emulate_grp2(ctxt); + break; + case 0xe0 ... 0xe2: /* loop/loopz/loopnz */ + register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); + if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 && + (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags))) + jmp_rel(c, c->src.val); + break; + case 0xe3: /* jcxz/jecxz/jrcxz */ + if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) + jmp_rel(c, c->src.val); + break; + case 0xe4: /* inb */ + case 0xe5: /* in */ + goto do_io_in; + case 0xe6: /* outb */ + case 0xe7: /* out */ + goto do_io_out; + case 0xe8: /* call (near) */ { + long int rel = c->src.val; + c->src.val = (unsigned long) c->eip; + jmp_rel(c, rel); + emulate_push(ctxt, ops); + break; + } + case 0xe9: /* jmp rel */ + goto jmp; + case 0xea: { /* jmp far */ + unsigned short sel; + jump_far: + memcpy(&sel, c->src.valptr + c->op_bytes, 2); + + if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS)) + goto done; + + c->eip = 0; + memcpy(&c->eip, c->src.valptr, c->op_bytes); + break; + } + case 0xeb: + jmp: /* jmp rel short */ + jmp_rel(c, c->src.val); + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xec: /* in al,dx */ + case 0xed: /* in (e/r)ax,dx */ + c->src.val = c->regs[VCPU_REGS_RDX]; + do_io_in: + c->dst.bytes = min(c->dst.bytes, 4u); + if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { + rc = emulate_gp(ctxt, 0); + goto done; + } + if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, + &c->dst.val)) + goto done; /* IO is needed */ + break; + case 0xee: /* out dx,al */ + case 0xef: /* out dx,(e/r)ax */ + c->dst.val = c->regs[VCPU_REGS_RDX]; + do_io_out: + c->src.bytes = min(c->src.bytes, 4u); + if (!emulator_io_permited(ctxt, ops, c->dst.val, + c->src.bytes)) { + rc = emulate_gp(ctxt, 0); + goto done; + } + ops->pio_out_emulated(c->src.bytes, c->dst.val, + &c->src.val, 1, ctxt->vcpu); + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xf4: /* hlt */ + ctxt->vcpu->arch.halt_request = 1; + break; + case 0xf5: /* cmc */ + /* complement carry flag from eflags reg */ + ctxt->eflags ^= EFLG_CF; + break; + case 0xf6 ... 0xf7: /* Grp3 */ + rc = emulate_grp3(ctxt, ops); + break; + case 0xf8: /* clc */ + ctxt->eflags &= ~EFLG_CF; + break; + case 0xf9: /* stc */ + ctxt->eflags |= EFLG_CF; + break; + case 0xfa: /* cli */ + if (emulator_bad_iopl(ctxt, ops)) { + rc = emulate_gp(ctxt, 0); + goto done; + } else + ctxt->eflags &= ~X86_EFLAGS_IF; + break; + case 0xfb: /* sti */ + if (emulator_bad_iopl(ctxt, ops)) { + rc = emulate_gp(ctxt, 0); + goto done; + } else { + ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; + ctxt->eflags |= X86_EFLAGS_IF; + } + break; + case 0xfc: /* cld */ + ctxt->eflags &= ~EFLG_DF; + break; + case 0xfd: /* std */ + ctxt->eflags |= EFLG_DF; + break; + case 0xfe: /* Grp4 */ + grp45: + rc = emulate_grp45(ctxt, ops); + break; + case 0xff: /* Grp5 */ + if (c->modrm_reg == 5) + goto jump_far; + goto grp45; + default: + goto cannot_emulate; + } + + if (rc != X86EMUL_CONTINUE) + goto done; + +writeback: + rc = writeback(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + goto done; + + /* + * restore dst type in case the decoding will be reused + * (happens for string instruction ) + */ + c->dst.type = saved_dst_type; + + if ((c->d & SrcMask) == SrcSI) + string_addr_inc(ctxt, seg_override(ctxt, ops, c), + VCPU_REGS_RSI, &c->src); + + if ((c->d & DstMask) == DstDI) + string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI, + &c->dst); + + if (c->rep_prefix && (c->d & String)) { + struct read_cache *r = &ctxt->decode.io_read; + register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); + + if (!string_insn_completed(ctxt)) { + /* + * Re-enter guest when pio read ahead buffer is empty + * or, if it is not used, after each 1024 iteration. + */ + if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) && + (r->end == 0 || r->end != r->pos)) { + /* + * Reset read cache. Usually happens before + * decode, but since instruction is restarted + * we have to do it here. + */ + ctxt->decode.mem_read.end = 0; + return EMULATION_RESTART; + } + goto done; /* skip rip writeback */ + } + } + + ctxt->eip = c->eip; + +done: + if (rc == X86EMUL_PROPAGATE_FAULT) + ctxt->have_exception = true; + return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; + +twobyte_insn: + switch (c->b) { + case 0x01: /* lgdt, lidt, lmsw */ + switch (c->modrm_reg) { + u16 size; + unsigned long address; + + case 0: /* vmcall */ + if (c->modrm_mod != 3 || c->modrm_rm != 1) + goto cannot_emulate; + + rc = kvm_fix_hypercall(ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + goto done; + + /* Let the processor re-execute the fixed hypercall */ + c->eip = ctxt->eip; + /* Disable writeback. */ + c->dst.type = OP_NONE; + break; + case 2: /* lgdt */ + rc = read_descriptor(ctxt, ops, c->src.addr.mem, + &size, &address, c->op_bytes); + if (rc != X86EMUL_CONTINUE) + goto done; + realmode_lgdt(ctxt->vcpu, size, address); + /* Disable writeback. */ + c->dst.type = OP_NONE; + break; + case 3: /* lidt/vmmcall */ + if (c->modrm_mod == 3) { + switch (c->modrm_rm) { + case 1: + rc = kvm_fix_hypercall(ctxt->vcpu); + break; + default: + goto cannot_emulate; + } + } else { + rc = read_descriptor(ctxt, ops, c->src.addr.mem, + &size, &address, + c->op_bytes); + if (rc != X86EMUL_CONTINUE) + goto done; + realmode_lidt(ctxt->vcpu, size, address); + } + /* Disable writeback. */ + c->dst.type = OP_NONE; + break; + case 4: /* smsw */ + c->dst.bytes = 2; + c->dst.val = ops->get_cr(0, ctxt->vcpu); + break; + case 6: /* lmsw */ + ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0eul) | + (c->src.val & 0x0f), ctxt->vcpu); + c->dst.type = OP_NONE; + break; + case 5: /* not defined */ + emulate_ud(ctxt); + rc = X86EMUL_PROPAGATE_FAULT; + goto done; + case 7: /* invlpg*/ + emulate_invlpg(ctxt->vcpu, + linear(ctxt, c->src.addr.mem)); + /* Disable writeback. */ + c->dst.type = OP_NONE; + break; + default: + goto cannot_emulate; + } + break; + case 0x05: /* syscall */ + rc = emulate_syscall(ctxt, ops); + break; + case 0x06: + emulate_clts(ctxt->vcpu); + break; + case 0x09: /* wbinvd */ + kvm_emulate_wbinvd(ctxt->vcpu); + break; + case 0x08: /* invd */ + case 0x0d: /* GrpP (prefetch) */ + case 0x18: /* Grp16 (prefetch/nop) */ + break; + case 0x20: /* mov cr, reg */ + switch (c->modrm_reg) { + case 1: + case 5 ... 7: + case 9 ... 15: + emulate_ud(ctxt); + rc = X86EMUL_PROPAGATE_FAULT; + goto done; + } + c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu); + break; + case 0x21: /* mov from dr to reg */ + if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && + (c->modrm_reg == 4 || c->modrm_reg == 5)) { + emulate_ud(ctxt); + rc = X86EMUL_PROPAGATE_FAULT; + goto done; + } + ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu); + break; + case 0x22: /* mov reg, cr */ + if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) { + emulate_gp(ctxt, 0); + rc = X86EMUL_PROPAGATE_FAULT; + goto done; + } + c->dst.type = OP_NONE; + break; + case 0x23: /* mov from reg to dr */ + if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && + (c->modrm_reg == 4 || c->modrm_reg == 5)) { + emulate_ud(ctxt); + rc = X86EMUL_PROPAGATE_FAULT; + goto done; + } + + if (ops->set_dr(c->modrm_reg, c->src.val & + ((ctxt->mode == X86EMUL_MODE_PROT64) ? + ~0ULL : ~0U), ctxt->vcpu) < 0) { + /* #UD condition is already handled by the code above */ + emulate_gp(ctxt, 0); + rc = X86EMUL_PROPAGATE_FAULT; + goto done; + } + + c->dst.type = OP_NONE; /* no writeback */ + break; + case 0x30: + /* wrmsr */ + msr_data = (u32)c->regs[VCPU_REGS_RAX] + | ((u64)c->regs[VCPU_REGS_RDX] << 32); + if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { + emulate_gp(ctxt, 0); + rc = X86EMUL_PROPAGATE_FAULT; + goto done; + } + rc = X86EMUL_CONTINUE; + break; + case 0x32: + /* rdmsr */ + if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { + emulate_gp(ctxt, 0); + rc = X86EMUL_PROPAGATE_FAULT; + goto done; + } else { + c->regs[VCPU_REGS_RAX] = (u32)msr_data; + c->regs[VCPU_REGS_RDX] = msr_data >> 32; + } + rc = X86EMUL_CONTINUE; + break; + case 0x34: /* sysenter */ + rc = emulate_sysenter(ctxt, ops); + break; + case 0x35: /* sysexit */ + rc = emulate_sysexit(ctxt, ops); + break; + case 0x40 ... 0x4f: /* cmov */ + c->dst.val = c->dst.orig_val = c->src.val; + if (!test_cc(c->b, ctxt->eflags)) + c->dst.type = OP_NONE; /* no writeback */ + break; + case 0x80 ... 0x8f: /* jnz rel, etc*/ + if (test_cc(c->b, ctxt->eflags)) + jmp_rel(c, c->src.val); + break; + case 0x90 ... 0x9f: /* setcc r/m8 */ + c->dst.val = test_cc(c->b, ctxt->eflags); + break; + case 0xa0: /* push fs */ + emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); + break; + case 0xa1: /* pop fs */ + rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); + break; + case 0xa3: + bt: /* bt */ + c->dst.type = OP_NONE; + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; + emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); + break; + case 0xa4: /* shld imm8, r, r/m */ + case 0xa5: /* shld cl, r, r/m */ + emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); + break; + case 0xa8: /* push gs */ + emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); + break; + case 0xa9: /* pop gs */ + rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); + break; + case 0xab: + bts: /* bts */ + emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); + break; + case 0xac: /* shrd imm8, r, r/m */ + case 0xad: /* shrd cl, r, r/m */ + emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags); + break; + case 0xae: /* clflush */ + break; + case 0xb0 ... 0xb1: /* cmpxchg */ + /* + * Save real source value, then compare EAX against + * destination. + */ + c->src.orig_val = c->src.val; + c->src.val = c->regs[VCPU_REGS_RAX]; + emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); + if (ctxt->eflags & EFLG_ZF) { + /* Success: write back to memory. */ + c->dst.val = c->src.orig_val; + } else { + /* Failure: write the value we saw to EAX. */ + c->dst.type = OP_REG; + c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX]; + } + break; + case 0xb2: /* lss */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS); + break; + case 0xb3: + btr: /* btr */ + emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); + break; + case 0xb4: /* lfs */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS); + break; + case 0xb5: /* lgs */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS); + break; + case 0xb6 ... 0xb7: /* movzx */ + c->dst.bytes = c->op_bytes; + c->dst.val = (c->d & ByteOp) ? (u8) c->src.val + : (u16) c->src.val; + break; + case 0xba: /* Grp8 */ + switch (c->modrm_reg & 3) { + case 0: + goto bt; + case 1: + goto bts; + case 2: + goto btr; + case 3: + goto btc; + } + break; + case 0xbb: + btc: /* btc */ + emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); + break; + case 0xbc: { /* bsf */ + u8 zf; + __asm__ ("bsf %2, %0; setz %1" + : "=r"(c->dst.val), "=q"(zf) + : "r"(c->src.val)); + ctxt->eflags &= ~X86_EFLAGS_ZF; + if (zf) { + ctxt->eflags |= X86_EFLAGS_ZF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } + break; + } + case 0xbd: { /* bsr */ + u8 zf; + __asm__ ("bsr %2, %0; setz %1" + : "=r"(c->dst.val), "=q"(zf) + : "r"(c->src.val)); + ctxt->eflags &= ~X86_EFLAGS_ZF; + if (zf) { + ctxt->eflags |= X86_EFLAGS_ZF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } + break; + } + case 0xbe ... 0xbf: /* movsx */ + c->dst.bytes = c->op_bytes; + c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : + (s16) c->src.val; + break; + case 0xc0 ... 0xc1: /* xadd */ + emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); + /* Write back the register source. */ + c->src.val = c->dst.orig_val; + write_register_operand(&c->src); + break; + case 0xc3: /* movnti */ + c->dst.bytes = c->op_bytes; + c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : + (u64) c->src.val; + break; + case 0xc7: /* Grp9 (cmpxchg8b) */ + rc = emulate_grp9(ctxt, ops); + break; + default: + goto cannot_emulate; + } + + if (rc != X86EMUL_CONTINUE) + goto done; + + goto writeback; + +cannot_emulate: + return -1; +} diff --git a/linux/x86/eventfd.c b/linux/x86/eventfd.c new file mode 100644 index 0000000..babaeed --- /dev/null +++ b/linux/x86/eventfd.c @@ -0,0 +1,705 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +/* + * kvm eventfd support - use eventfd objects to signal various KVM events + * + * Copyright 2009 Novell. All Rights Reserved. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * Author: + * Gregory Haskins <ghaskins@novell.com> + * + * This file is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/workqueue.h> +#include <linux/syscalls.h> +#include <linux/wait.h> +#include <linux/poll.h> +#include <linux/file.h> +#include <linux/list.h> +#include <linux/eventfd.h> +#include <linux/kernel.h> +#include <linux/slab.h> + +#include "iodev.h" + +/* + * -------------------------------------------------------------------- + * irqfd: Allows an fd to be used to inject an interrupt to the guest + * + * Credit goes to Avi Kivity for the original idea. + * -------------------------------------------------------------------- + */ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,33) +struct _irqfd { + /* Used for MSI fast-path */ + struct kvm *kvm; + wait_queue_t wait; + /* Update side is protected by irqfds.lock */ + struct kvm_kernel_irq_routing_entry __rcu *irq_entry; + /* Used for level IRQ fast-path */ + int gsi; + struct work_struct inject; + /* Used for setup/shutdown */ + struct eventfd_ctx *eventfd; + struct list_head list; + poll_table pt; + struct work_struct shutdown; +}; + +static struct workqueue_struct *irqfd_cleanup_wq; + +static void +irqfd_inject(struct work_struct *work) +{ + struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); + struct kvm *kvm = irqfd->kvm; + + kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1); + kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0); +} + +/* + * Race-free decouple logic (ordering is critical) + */ +static void +irqfd_shutdown(struct work_struct *work) +{ + struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown); + u64 cnt; + + /* + * Synchronize with the wait-queue and unhook ourselves to prevent + * further events. + */ + eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt); + + /* + * We know no new events will be scheduled at this point, so block + * until all previously outstanding events have completed + */ + flush_work(&irqfd->inject); + + /* + * It is now safe to release the object's resources + */ + eventfd_ctx_put(irqfd->eventfd); + kfree(irqfd); +} + + +/* assumes kvm->irqfds.lock is held */ +static bool +irqfd_is_active(struct _irqfd *irqfd) +{ + return list_empty(&irqfd->list) ? false : true; +} + +/* + * Mark the irqfd as inactive and schedule it for removal + * + * assumes kvm->irqfds.lock is held + */ +static void +irqfd_deactivate(struct _irqfd *irqfd) +{ + BUG_ON(!irqfd_is_active(irqfd)); + + list_del_init(&irqfd->list); + + queue_work(irqfd_cleanup_wq, &irqfd->shutdown); +} + +/* + * Called with wqh->lock held and interrupts disabled + */ +static int +irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait); + unsigned long flags = (unsigned long)key; + struct kvm_kernel_irq_routing_entry *irq; + struct kvm *kvm = irqfd->kvm; + + if (flags & POLLIN) { + rcu_read_lock(); + irq = rcu_dereference(irqfd->irq_entry); + /* An event has been signaled, inject an interrupt */ + if (irq) + kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1); + else + schedule_work(&irqfd->inject); + rcu_read_unlock(); + } + + if (flags & POLLHUP) { + /* The eventfd is closing, detach from KVM */ + unsigned long flags; + + spin_lock_irqsave(&kvm->irqfds.lock, flags); + + /* + * We must check if someone deactivated the irqfd before + * we could acquire the irqfds.lock since the item is + * deactivated from the KVM side before it is unhooked from + * the wait-queue. If it is already deactivated, we can + * simply return knowing the other side will cleanup for us. + * We cannot race against the irqfd going away since the + * other side is required to acquire wqh->lock, which we hold + */ + if (irqfd_is_active(irqfd)) + irqfd_deactivate(irqfd); + + spin_unlock_irqrestore(&kvm->irqfds.lock, flags); + } + + return 0; +} + +static void +irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, + poll_table *pt) +{ + struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt); + add_wait_queue(wqh, &irqfd->wait); +} + +/* Must be called under irqfds.lock */ +static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd, + struct kvm_irq_routing_table *irq_rt) +{ + struct kvm_kernel_irq_routing_entry *e; + struct hlist_node *n; + + if (irqfd->gsi >= irq_rt->nr_rt_entries) { + rcu_assign_pointer(irqfd->irq_entry, NULL); + return; + } + + hlist_for_each_entry(e, n, &irq_rt->map[irqfd->gsi], link) { + /* Only fast-path MSI. */ + if (e->type == KVM_IRQ_ROUTING_MSI) + rcu_assign_pointer(irqfd->irq_entry, e); + else + rcu_assign_pointer(irqfd->irq_entry, NULL); + } +} + +static int +kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) +{ + struct kvm_irq_routing_table *irq_rt; + struct _irqfd *irqfd, *tmp; + struct file *file = NULL; + struct eventfd_ctx *eventfd = NULL; + int ret; + unsigned int events; + + irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); + if (!irqfd) + return -ENOMEM; + + irqfd->kvm = kvm; + irqfd->gsi = gsi; + INIT_LIST_HEAD(&irqfd->list); + INIT_WORK(&irqfd->inject, irqfd_inject); + INIT_WORK(&irqfd->shutdown, irqfd_shutdown); + + file = eventfd_fget(fd); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto fail; + } + + eventfd = eventfd_ctx_fileget(file); + if (IS_ERR(eventfd)) { + ret = PTR_ERR(eventfd); + goto fail; + } + + irqfd->eventfd = eventfd; + + /* + * Install our own custom wake-up handling so we are notified via + * a callback whenever someone signals the underlying eventfd + */ + init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); + init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc); + + spin_lock_irq(&kvm->irqfds.lock); + + ret = 0; + list_for_each_entry(tmp, &kvm->irqfds.items, list) { + if (irqfd->eventfd != tmp->eventfd) + continue; + /* This fd is used for another irq already. */ + ret = -EBUSY; + spin_unlock_irq(&kvm->irqfds.lock); + goto fail; + } + + irq_rt = rcu_dereference_protected(kvm->irq_routing, + lockdep_is_held(&kvm->irqfds.lock)); + irqfd_update(kvm, irqfd, irq_rt); + + events = file->f_op->poll(file, &irqfd->pt); + + list_add_tail(&irqfd->list, &kvm->irqfds.items); + + /* + * Check if there was an event already pending on the eventfd + * before we registered, and trigger it as if we didn't miss it. + */ + if (events & POLLIN) + schedule_work(&irqfd->inject); + + spin_unlock_irq(&kvm->irqfds.lock); + + /* + * do not drop the file until the irqfd is fully initialized, otherwise + * we might race against the POLLHUP + */ + fput(file); + + return 0; + +fail: + if (eventfd && !IS_ERR(eventfd)) + eventfd_ctx_put(eventfd); + + if (!IS_ERR(file)) + fput(file); + + kfree(irqfd); + return ret; +} + +void +kvm_eventfd_init(struct kvm *kvm) +{ + spin_lock_init(&kvm->irqfds.lock); + INIT_LIST_HEAD(&kvm->irqfds.items); + INIT_LIST_HEAD(&kvm->ioeventfds); +} + +/* + * shutdown any irqfd's that match fd+gsi + */ +static int +kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi) +{ + struct _irqfd *irqfd, *tmp; + struct eventfd_ctx *eventfd; + + eventfd = eventfd_ctx_fdget(fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + spin_lock_irq(&kvm->irqfds.lock); + + list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { + if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) { + /* + * This rcu_assign_pointer is needed for when + * another thread calls kvm_irqfd_update before + * we flush workqueue below. + * It is paired with synchronize_rcu done by caller + * of that function. + */ + rcu_assign_pointer(irqfd->irq_entry, NULL); + irqfd_deactivate(irqfd); + } + } + + spin_unlock_irq(&kvm->irqfds.lock); + eventfd_ctx_put(eventfd); + + /* + * Block until we know all outstanding shutdown jobs have completed + * so that we guarantee there will not be any more interrupts on this + * gsi once this deassign function returns. + */ + flush_workqueue(irqfd_cleanup_wq); + + return 0; +} + +int +kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) +{ + if (flags & KVM_IRQFD_FLAG_DEASSIGN) + return kvm_irqfd_deassign(kvm, fd, gsi); + + return kvm_irqfd_assign(kvm, fd, gsi); +} + +/* + * This function is called as the kvm VM fd is being released. Shutdown all + * irqfds that still remain open + */ +void +kvm_irqfd_release(struct kvm *kvm) +{ + struct _irqfd *irqfd, *tmp; + + spin_lock_irq(&kvm->irqfds.lock); + + list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) + irqfd_deactivate(irqfd); + + spin_unlock_irq(&kvm->irqfds.lock); + + /* + * Block until we know all outstanding shutdown jobs have completed + * since we do not take a kvm* reference. + */ + flush_workqueue(irqfd_cleanup_wq); + +} + +/* + * Change irq_routing and irqfd. + * Caller must invoke synchronize_rcu afterwards. + */ +void kvm_irq_routing_update(struct kvm *kvm, + struct kvm_irq_routing_table *irq_rt) +{ + struct _irqfd *irqfd; + + spin_lock_irq(&kvm->irqfds.lock); + + rcu_assign_pointer(kvm->irq_routing, irq_rt); + + list_for_each_entry(irqfd, &kvm->irqfds.items, list) + irqfd_update(kvm, irqfd, irq_rt); + + spin_unlock_irq(&kvm->irqfds.lock); +} + +/* + * create a host-wide workqueue for issuing deferred shutdown requests + * aggregated from all vm* instances. We need our own isolated single-thread + * queue to prevent deadlock against flushing the normal work-queue. + */ +static int __init irqfd_module_init(void) +{ + irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup"); + if (!irqfd_cleanup_wq) + return -ENOMEM; + + return 0; +} + +static void __exit irqfd_module_exit(void) +{ + destroy_workqueue(irqfd_cleanup_wq); +} + +module_init(irqfd_module_init); +module_exit(irqfd_module_exit); + +/* + * -------------------------------------------------------------------- + * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal. + * + * userspace can register a PIO/MMIO address with an eventfd for receiving + * notification when the memory has been touched. + * -------------------------------------------------------------------- + */ + +struct _ioeventfd { + struct list_head list; + u64 addr; + int length; + struct eventfd_ctx *eventfd; + u64 datamatch; + struct kvm_io_device dev; + bool wildcard; +}; + +static inline struct _ioeventfd * +to_ioeventfd(struct kvm_io_device *dev) +{ + return container_of(dev, struct _ioeventfd, dev); +} + +static void +ioeventfd_release(struct _ioeventfd *p) +{ + eventfd_ctx_put(p->eventfd); + list_del(&p->list); + kfree(p); +} + +static bool +ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val) +{ + u64 _val; + + if (!(addr == p->addr && len == p->length)) + /* address-range must be precise for a hit */ + return false; + + if (p->wildcard) + /* all else equal, wildcard is always a hit */ + return true; + + /* otherwise, we have to actually compare the data */ + + BUG_ON(!IS_ALIGNED((unsigned long)val, len)); + + switch (len) { + case 1: + _val = *(u8 *)val; + break; + case 2: + _val = *(u16 *)val; + break; + case 4: + _val = *(u32 *)val; + break; + case 8: + _val = *(u64 *)val; + break; + default: + return false; + } + + return _val == p->datamatch ? true : false; +} + +/* MMIO/PIO writes trigger an event if the addr/val match */ +static int +ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len, + const void *val) +{ + struct _ioeventfd *p = to_ioeventfd(this); + + if (!ioeventfd_in_range(p, addr, len, val)) + return -EOPNOTSUPP; + + eventfd_signal(p->eventfd, 1); + return 0; +} + +/* + * This function is called as KVM is completely shutting down. We do not + * need to worry about locking just nuke anything we have as quickly as possible + */ +static void +ioeventfd_destructor(struct kvm_io_device *this) +{ + struct _ioeventfd *p = to_ioeventfd(this); + + ioeventfd_release(p); +} + +static const struct kvm_io_device_ops ioeventfd_ops = { + .write = ioeventfd_write, + .destructor = ioeventfd_destructor, +}; + +/* assumes kvm->slots_lock held */ +static bool +ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p) +{ + struct _ioeventfd *_p; + + list_for_each_entry(_p, &kvm->ioeventfds, list) + if (_p->addr == p->addr && _p->length == p->length && + (_p->wildcard || p->wildcard || + _p->datamatch == p->datamatch)) + return true; + + return false; +} + +static int +kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) +{ + int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; + enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS; + struct _ioeventfd *p; + struct eventfd_ctx *eventfd; + int ret; + + /* must be natural-word sized */ + switch (args->len) { + case 1: + case 2: + case 4: + case 8: + break; + default: + return -EINVAL; + } + + /* check for range overflow */ + if (args->addr + args->len < args->addr) + return -EINVAL; + + /* check for extra flags that we don't understand */ + if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK) + return -EINVAL; + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) { + ret = -ENOMEM; + goto fail; + } + + INIT_LIST_HEAD(&p->list); + p->addr = args->addr; + p->length = args->len; + p->eventfd = eventfd; + + /* The datamatch feature is optional, otherwise this is a wildcard */ + if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH) + p->datamatch = args->datamatch; + else + p->wildcard = true; + + mutex_lock(&kvm->slots_lock); + + /* Verify that there isnt a match already */ + if (ioeventfd_check_collision(kvm, p)) { + ret = -EEXIST; + goto unlock_fail; + } + + kvm_iodevice_init(&p->dev, &ioeventfd_ops); + + ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev); + if (ret < 0) + goto unlock_fail; + + list_add_tail(&p->list, &kvm->ioeventfds); + + mutex_unlock(&kvm->slots_lock); + + return 0; + +unlock_fail: + mutex_unlock(&kvm->slots_lock); + +fail: + kfree(p); + eventfd_ctx_put(eventfd); + + return ret; +} + +static int +kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) +{ + int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; + enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS; + struct _ioeventfd *p, *tmp; + struct eventfd_ctx *eventfd; + int ret = -ENOENT; + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + mutex_lock(&kvm->slots_lock); + + list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) { + bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH); + + if (p->eventfd != eventfd || + p->addr != args->addr || + p->length != args->len || + p->wildcard != wildcard) + continue; + + if (!p->wildcard && p->datamatch != args->datamatch) + continue; + + kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev); + ioeventfd_release(p); + ret = 0; + break; + } + + mutex_unlock(&kvm->slots_lock); + + eventfd_ctx_put(eventfd); + + return ret; +} + +int +kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) +{ + if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN) + return kvm_deassign_ioeventfd(kvm, args); + + return kvm_assign_ioeventfd(kvm, args); +} +#else +void kvm_eventfd_init(struct kvm *kvm) { } +void kvm_irqfd_release(struct kvm *kvm) { } +void kvm_irq_routing_update(struct kvm *kvm, + struct kvm_irq_routing_table *irq_rt) +{ + rcu_assign_pointer(kvm->irq_routing, irq_rt); +} +#endif diff --git a/linux/x86/i8254.c b/linux/x86/i8254.c new file mode 100644 index 0000000..64aa827 --- /dev/null +++ b/linux/x86/i8254.c @@ -0,0 +1,803 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +/* + * 8253/8254 interval timer emulation + * + * Copyright (c) 2003-2004 Fabrice Bellard + * Copyright (c) 2006 Intel Corporation + * Copyright (c) 2007 Keir Fraser, XenSource Inc + * Copyright (c) 2008 Intel Corporation + * Copyright 2009 Red Hat, Inc. and/or its affiliates. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * Authors: + * Sheng Yang <sheng.yang@intel.com> + * Based on QEMU and Xen. + */ + + + +#include <linux/kvm_host.h> +#include <linux/slab.h> +#include <linux/workqueue.h> + +#include "irq.h" +#include "i8254.h" + +#ifndef CONFIG_X86_64 +#define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) +#else +#define mod_64(x, y) ((x) % (y)) +#endif + +#define RW_STATE_LSB 1 +#define RW_STATE_MSB 2 +#define RW_STATE_WORD0 3 +#define RW_STATE_WORD1 4 + +/* Compute with 96 bit intermediate result: (a*b)/c */ +static u64 muldiv64(u64 a, u32 b, u32 c) +{ + union { + u64 ll; + struct { + u32 low, high; + } l; + } u, res; + u64 rl, rh; + + u.ll = a; + rl = (u64)u.l.low * (u64)b; + rh = (u64)u.l.high * (u64)b; + rh += (rl >> 32); + res.l.high = div64_u64(rh, c); + res.l.low = div64_u64(((mod_64(rh, c) << 32) + (rl & 0xffffffff)), c); + return res.ll; +} + +static void pit_set_gate(struct kvm *kvm, int channel, u32 val) +{ + struct kvm_kpit_channel_state *c = + &kvm->arch.vpit->pit_state.channels[channel]; + + WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); + + switch (c->mode) { + default: + case 0: + case 4: + /* XXX: just disable/enable counting */ + break; + case 1: + case 2: + case 3: + case 5: + /* Restart counting on rising edge. */ + if (c->gate < val) + c->count_load_time = ktime_get(); + break; + } + + c->gate = val; +} + +static int pit_get_gate(struct kvm *kvm, int channel) +{ + WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); + + return kvm->arch.vpit->pit_state.channels[channel].gate; +} + +static s64 __kpit_elapsed(struct kvm *kvm) +{ + s64 elapsed; + ktime_t remaining; + struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; + + if (!ps->pit_timer.period) + return 0; + + /* + * The Counter does not stop when it reaches zero. In + * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to + * the highest count, either FFFF hex for binary counting + * or 9999 for BCD counting, and continues counting. + * Modes 2 and 3 are periodic; the Counter reloads + * itself with the initial count and continues counting + * from there. + */ + remaining = hrtimer_get_remaining(&ps->pit_timer.timer); + elapsed = ps->pit_timer.period - ktime_to_ns(remaining); + elapsed = mod_64(elapsed, ps->pit_timer.period); + + return elapsed; +} + +static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c, + int channel) +{ + if (channel == 0) + return __kpit_elapsed(kvm); + + return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); +} + +static int pit_get_count(struct kvm *kvm, int channel) +{ + struct kvm_kpit_channel_state *c = + &kvm->arch.vpit->pit_state.channels[channel]; + s64 d, t; + int counter; + + WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); + + t = kpit_elapsed(kvm, c, channel); + d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); + + switch (c->mode) { + case 0: + case 1: + case 4: + case 5: + counter = (c->count - d) & 0xffff; + break; + case 3: + /* XXX: may be incorrect for odd counts */ + counter = c->count - (mod_64((2 * d), c->count)); + break; + default: + counter = c->count - mod_64(d, c->count); + break; + } + return counter; +} + +static int pit_get_out(struct kvm *kvm, int channel) +{ + struct kvm_kpit_channel_state *c = + &kvm->arch.vpit->pit_state.channels[channel]; + s64 d, t; + int out; + + WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); + + t = kpit_elapsed(kvm, c, channel); + d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); + + switch (c->mode) { + default: + case 0: + out = (d >= c->count); + break; + case 1: + out = (d < c->count); + break; + case 2: + out = ((mod_64(d, c->count) == 0) && (d != 0)); + break; + case 3: + out = (mod_64(d, c->count) < ((c->count + 1) >> 1)); + break; + case 4: + case 5: + out = (d == c->count); + break; + } + + return out; +} + +static void pit_latch_count(struct kvm *kvm, int channel) +{ + struct kvm_kpit_channel_state *c = + &kvm->arch.vpit->pit_state.channels[channel]; + + WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); + + if (!c->count_latched) { + c->latched_count = pit_get_count(kvm, channel); + c->count_latched = c->rw_mode; + } +} + +static void pit_latch_status(struct kvm *kvm, int channel) +{ + struct kvm_kpit_channel_state *c = + &kvm->arch.vpit->pit_state.channels[channel]; + + WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); + + if (!c->status_latched) { + /* TODO: Return NULL COUNT (bit 6). */ + c->status = ((pit_get_out(kvm, channel) << 7) | + (c->rw_mode << 4) | + (c->mode << 1) | + c->bcd); + c->status_latched = 1; + } +} + +static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) +{ + struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, + irq_ack_notifier); + int value; + + spin_lock(&ps->inject_lock); + value = atomic_dec_return(&ps->pit_timer.pending); + if (value < 0) + /* spurious acks can be generated if, for example, the + * PIC is being reset. Handle it gracefully here + */ + atomic_inc(&ps->pit_timer.pending); + else if (value > 0) + /* in this case, we had multiple outstanding pit interrupts + * that we needed to inject. Reinject + */ + queue_work(ps->pit->wq, &ps->pit->expired); + ps->irq_ack = 1; + spin_unlock(&ps->inject_lock); +} + +void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_pit *pit = vcpu->kvm->arch.vpit; + struct hrtimer *timer; + + if (!kvm_vcpu_is_bsp(vcpu) || !pit) + return; + + timer = &pit->pit_state.pit_timer.timer; + if (hrtimer_cancel(timer)) + kvm_hrtimer_start_expires(timer, HRTIMER_MODE_ABS); +} + +static void destroy_pit_timer(struct kvm_pit *pit) +{ + hrtimer_cancel(&pit->pit_state.pit_timer.timer); + cancel_work_sync(&pit->expired); +} + +static bool kpit_is_periodic(struct kvm_timer *ktimer) +{ + struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state, + pit_timer); + return ps->is_periodic; +} + +static struct kvm_timer_ops kpit_ops = { + .is_periodic = kpit_is_periodic, +}; + +static void pit_do_work(struct work_struct *work) +{ + struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); + struct kvm *kvm = pit->kvm; + struct kvm_vcpu *vcpu; + int i; + struct kvm_kpit_state *ps = &pit->pit_state; + int inject = 0; + + /* Try to inject pending interrupts when + * last one has been acked. + */ + spin_lock(&ps->inject_lock); + if (ps->irq_ack) { + ps->irq_ack = 0; + inject = 1; + } + spin_unlock(&ps->inject_lock); + if (inject) { + kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); + kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); + + /* + * Provides NMI watchdog support via Virtual Wire mode. + * The route is: PIT -> PIC -> LVT0 in NMI mode. + * + * Note: Our Virtual Wire implementation is simplified, only + * propagating PIT interrupts to all VCPUs when they have set + * LVT0 to NMI delivery. Other PIC interrupts are just sent to + * VCPU0, and only if its LVT0 is in EXTINT mode. + */ + if (kvm->arch.vapics_in_nmi_mode > 0) + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_apic_nmi_wd_deliver(vcpu); + } +} + +static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) +{ + struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); + struct kvm_pit *pt = ktimer->kvm->arch.vpit; + + if (ktimer->reinject || !atomic_read(&ktimer->pending)) { + atomic_inc(&ktimer->pending); + queue_work(pt->wq, &pt->expired); + } + + if (ktimer->t_ops->is_periodic(ktimer)) { + kvm_hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); + return HRTIMER_RESTART; + } else + return HRTIMER_NORESTART; +} + +static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) +{ + struct kvm_timer *pt = &ps->pit_timer; + s64 interval; + + interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); + + pr_debug("pit: " "create pit timer, interval is %llu nsec\n", interval); + + /* TODO The new value only affected after the retriggered */ + hrtimer_cancel(&pt->timer); + cancel_work_sync(&ps->pit->expired); + pt->period = interval; + ps->is_periodic = is_period; + + pt->timer.function = pit_timer_fn; + pt->t_ops = &kpit_ops; + pt->kvm = ps->pit->kvm; + + atomic_set(&pt->pending, 0); + ps->irq_ack = 1; + + hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), + HRTIMER_MODE_ABS); +} + +static void pit_load_count(struct kvm *kvm, int channel, u32 val) +{ + struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; + + WARN_ON(!mutex_is_locked(&ps->lock)); + + pr_debug("pit: " "load_count val is %d, channel is %d\n", val, channel); + + /* + * The largest possible initial count is 0; this is equivalent + * to 216 for binary counting and 104 for BCD counting. + */ + if (val == 0) + val = 0x10000; + + ps->channels[channel].count = val; + + if (channel != 0) { + ps->channels[channel].count_load_time = ktime_get(); + return; + } + + /* Two types of timer + * mode 1 is one shot, mode 2 is period, otherwise del timer */ + switch (ps->channels[0].mode) { + case 0: + case 1: + /* FIXME: enhance mode 4 precision */ + case 4: + if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) { + create_pit_timer(ps, val, 0); + } + break; + case 2: + case 3: + if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){ + create_pit_timer(ps, val, 1); + } + break; + default: + destroy_pit_timer(kvm->arch.vpit); + } +} + +void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start) +{ + u8 saved_mode; + if (hpet_legacy_start) { + /* save existing mode for later reenablement */ + saved_mode = kvm->arch.vpit->pit_state.channels[0].mode; + kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */ + pit_load_count(kvm, channel, val); + kvm->arch.vpit->pit_state.channels[0].mode = saved_mode; + } else { + pit_load_count(kvm, channel, val); + } +} + +static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_pit, dev); +} + +static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_pit, speaker_dev); +} + +static inline int pit_in_range(gpa_t addr) +{ + return ((addr >= KVM_PIT_BASE_ADDRESS) && + (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); +} + +static int pit_ioport_write(struct kvm_io_device *this, + gpa_t addr, int len, const void *data) +{ + struct kvm_pit *pit = dev_to_pit(this); + struct kvm_kpit_state *pit_state = &pit->pit_state; + struct kvm *kvm = pit->kvm; + int channel, access; + struct kvm_kpit_channel_state *s; + u32 val = *(u32 *) data; + if (!pit_in_range(addr)) + return -EOPNOTSUPP; + + val &= 0xff; + addr &= KVM_PIT_CHANNEL_MASK; + + mutex_lock(&pit_state->lock); + + if (val != 0) + pr_debug("pit: " "write addr is 0x%x, len is %d, val is 0x%x\n", + (unsigned int)addr, len, val); + + if (addr == 3) { + channel = val >> 6; + if (channel == 3) { + /* Read-Back Command. */ + for (channel = 0; channel < 3; channel++) { + s = &pit_state->channels[channel]; + if (val & (2 << channel)) { + if (!(val & 0x20)) + pit_latch_count(kvm, channel); + if (!(val & 0x10)) + pit_latch_status(kvm, channel); + } + } + } else { + /* Select Counter <channel>. */ + s = &pit_state->channels[channel]; + access = (val >> 4) & KVM_PIT_CHANNEL_MASK; + if (access == 0) { + pit_latch_count(kvm, channel); + } else { + s->rw_mode = access; + s->read_state = access; + s->write_state = access; + s->mode = (val >> 1) & 7; + if (s->mode > 5) + s->mode -= 4; + s->bcd = val & 1; + } + } + } else { + /* Write Count. */ + s = &pit_state->channels[addr]; + switch (s->write_state) { + default: + case RW_STATE_LSB: + pit_load_count(kvm, addr, val); + break; + case RW_STATE_MSB: + pit_load_count(kvm, addr, val << 8); + break; + case RW_STATE_WORD0: + s->write_latch = val; + s->write_state = RW_STATE_WORD1; + break; + case RW_STATE_WORD1: + pit_load_count(kvm, addr, s->write_latch | (val << 8)); + s->write_state = RW_STATE_WORD0; + break; + } + } + + mutex_unlock(&pit_state->lock); + return 0; +} + +static int pit_ioport_read(struct kvm_io_device *this, + gpa_t addr, int len, void *data) +{ + struct kvm_pit *pit = dev_to_pit(this); + struct kvm_kpit_state *pit_state = &pit->pit_state; + struct kvm *kvm = pit->kvm; + int ret, count; + struct kvm_kpit_channel_state *s; + if (!pit_in_range(addr)) + return -EOPNOTSUPP; + + addr &= KVM_PIT_CHANNEL_MASK; + if (addr == 3) + return 0; + + s = &pit_state->channels[addr]; + + mutex_lock(&pit_state->lock); + + if (s->status_latched) { + s->status_latched = 0; + ret = s->status; + } else if (s->count_latched) { + switch (s->count_latched) { + default: + case RW_STATE_LSB: + ret = s->latched_count & 0xff; + s->count_latched = 0; + break; + case RW_STATE_MSB: + ret = s->latched_count >> 8; + s->count_latched = 0; + break; + case RW_STATE_WORD0: + ret = s->latched_count & 0xff; + s->count_latched = RW_STATE_MSB; + break; + } + } else { + switch (s->read_state) { + default: + case RW_STATE_LSB: + count = pit_get_count(kvm, addr); + ret = count & 0xff; + break; + case RW_STATE_MSB: + count = pit_get_count(kvm, addr); + ret = (count >> 8) & 0xff; + break; + case RW_STATE_WORD0: + count = pit_get_count(kvm, addr); + ret = count & 0xff; + s->read_state = RW_STATE_WORD1; + break; + case RW_STATE_WORD1: + count = pit_get_count(kvm, addr); + ret = (count >> 8) & 0xff; + s->read_state = RW_STATE_WORD0; + break; + } + } + + if (len > sizeof(ret)) + len = sizeof(ret); + memcpy(data, (char *)&ret, len); + + mutex_unlock(&pit_state->lock); + return 0; +} + +static int speaker_ioport_write(struct kvm_io_device *this, + gpa_t addr, int len, const void *data) +{ + struct kvm_pit *pit = speaker_to_pit(this); + struct kvm_kpit_state *pit_state = &pit->pit_state; + struct kvm *kvm = pit->kvm; + u32 val = *(u32 *) data; + if (addr != KVM_SPEAKER_BASE_ADDRESS) + return -EOPNOTSUPP; + + mutex_lock(&pit_state->lock); + pit_state->speaker_data_on = (val >> 1) & 1; + pit_set_gate(kvm, 2, val & 1); + mutex_unlock(&pit_state->lock); + return 0; +} + +static int speaker_ioport_read(struct kvm_io_device *this, + gpa_t addr, int len, void *data) +{ + struct kvm_pit *pit = speaker_to_pit(this); + struct kvm_kpit_state *pit_state = &pit->pit_state; + struct kvm *kvm = pit->kvm; + unsigned int refresh_clock; + int ret; + if (addr != KVM_SPEAKER_BASE_ADDRESS) + return -EOPNOTSUPP; + + /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */ + refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; + + mutex_lock(&pit_state->lock); + ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(kvm, 2) | + (pit_get_out(kvm, 2) << 5) | (refresh_clock << 4)); + if (len > sizeof(ret)) + len = sizeof(ret); + memcpy(data, (char *)&ret, len); + mutex_unlock(&pit_state->lock); + return 0; +} + +void kvm_pit_reset(struct kvm_pit *pit) +{ + int i; + struct kvm_kpit_channel_state *c; + + mutex_lock(&pit->pit_state.lock); + pit->pit_state.flags = 0; + for (i = 0; i < 3; i++) { + c = &pit->pit_state.channels[i]; + c->mode = 0xff; + c->gate = (i != 2); + pit_load_count(pit->kvm, i, 0); + } + mutex_unlock(&pit->pit_state.lock); + + atomic_set(&pit->pit_state.pit_timer.pending, 0); + pit->pit_state.irq_ack = 1; +} + +static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) +{ + struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier); + + if (!mask) { + atomic_set(&pit->pit_state.pit_timer.pending, 0); + pit->pit_state.irq_ack = 1; + } +} + +static const struct kvm_io_device_ops pit_dev_ops = { + .read = pit_ioport_read, + .write = pit_ioport_write, +}; + +static const struct kvm_io_device_ops speaker_dev_ops = { + .read = speaker_ioport_read, + .write = speaker_ioport_write, +}; + +/* Caller must hold slots_lock */ +struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) +{ + struct kvm_pit *pit; + struct kvm_kpit_state *pit_state; + int ret; + + pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); + if (!pit) + return NULL; + + pit->irq_source_id = kvm_request_irq_source_id(kvm); + if (pit->irq_source_id < 0) { + kfree(pit); + return NULL; + } + + mutex_init(&pit->pit_state.lock); + mutex_lock(&pit->pit_state.lock); + spin_lock_init(&pit->pit_state.inject_lock); + + pit->wq = create_singlethread_workqueue("kvm-pit-wq"); + if (!pit->wq) { + mutex_unlock(&pit->pit_state.lock); + kvm_free_irq_source_id(kvm, pit->irq_source_id); + kfree(pit); + return NULL; + } + INIT_WORK(&pit->expired, pit_do_work); + + kvm->arch.vpit = pit; + pit->kvm = kvm; + + pit_state = &pit->pit_state; + pit_state->pit = pit; + hrtimer_init(&pit_state->pit_timer.timer, + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + pit_state->irq_ack_notifier.gsi = 0; + pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; + kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); + pit_state->pit_timer.reinject = true; + mutex_unlock(&pit->pit_state.lock); + + kvm_pit_reset(pit); + + pit->mask_notifier.func = pit_mask_notifer; + kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); + + kvm_iodevice_init(&pit->dev, &pit_dev_ops); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev); + if (ret < 0) + goto fail; + + if (flags & KVM_PIT_SPEAKER_DUMMY) { + kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, + &pit->speaker_dev); + if (ret < 0) + goto fail_unregister; + } + + return pit; + +fail_unregister: + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); + +fail: + kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); + kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); + kvm_free_irq_source_id(kvm, pit->irq_source_id); + destroy_workqueue(pit->wq); + kfree(pit); + return NULL; +} + +void kvm_free_pit(struct kvm *kvm) +{ + struct hrtimer *timer; + + if (kvm->arch.vpit) { + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, + &kvm->arch.vpit->speaker_dev); + kvm_unregister_irq_mask_notifier(kvm, 0, + &kvm->arch.vpit->mask_notifier); + kvm_unregister_irq_ack_notifier(kvm, + &kvm->arch.vpit->pit_state.irq_ack_notifier); + mutex_lock(&kvm->arch.vpit->pit_state.lock); + timer = &kvm->arch.vpit->pit_state.pit_timer.timer; + hrtimer_cancel(timer); + cancel_work_sync(&kvm->arch.vpit->expired); + kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); + mutex_unlock(&kvm->arch.vpit->pit_state.lock); + destroy_workqueue(kvm->arch.vpit->wq); + kfree(kvm->arch.vpit); + } +} diff --git a/linux/x86/i8254.h b/linux/x86/i8254.h new file mode 100644 index 0000000..46d08ca --- /dev/null +++ b/linux/x86/i8254.h @@ -0,0 +1,60 @@ +#ifndef __I8254_H +#define __I8254_H + +#include "iodev.h" + +struct kvm_kpit_channel_state { + u32 count; /* can be 65536 */ + u16 latched_count; + u8 count_latched; + u8 status_latched; + u8 status; + u8 read_state; + u8 write_state; + u8 write_latch; + u8 rw_mode; + u8 mode; + u8 bcd; /* not supported */ + u8 gate; /* timer start */ + ktime_t count_load_time; +}; + +struct kvm_kpit_state { + struct kvm_kpit_channel_state channels[3]; + u32 flags; + struct kvm_timer pit_timer; + bool is_periodic; + u32 speaker_data_on; + struct mutex lock; + struct kvm_pit *pit; + spinlock_t inject_lock; + unsigned long irq_ack; + struct kvm_irq_ack_notifier irq_ack_notifier; +}; + +struct kvm_pit { + unsigned long base_addresss; + struct kvm_io_device dev; + struct kvm_io_device speaker_dev; + struct kvm *kvm; + struct kvm_kpit_state pit_state; + int irq_source_id; + struct kvm_irq_mask_notifier mask_notifier; + struct workqueue_struct *wq; + struct work_struct expired; +}; + +#define KVM_PIT_BASE_ADDRESS 0x40 +#define KVM_SPEAKER_BASE_ADDRESS 0x61 +#define KVM_PIT_MEM_LENGTH 4 +#define KVM_PIT_FREQ 1193181 +#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100 +#define KVM_PIT_CHANNEL_MASK 0x3 + +void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); +void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start); +struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); +void kvm_free_pit(struct kvm *kvm); +void kvm_pit_reset(struct kvm_pit *pit); + +#endif diff --git a/linux/x86/i8259.c b/linux/x86/i8259.c new file mode 100644 index 0000000..d316fc6 --- /dev/null +++ b/linux/x86/i8259.c @@ -0,0 +1,645 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +/* + * 8259 interrupt controller emulation + * + * Copyright (c) 2003-2004 Fabrice Bellard + * Copyright (c) 2007 Intel Corporation + * Copyright 2009 Red Hat, Inc. and/or its affiliates. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * Authors: + * Yaozu (Eddie) Dong <Eddie.dong@intel.com> + * Port from Qemu. + */ +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/bitops.h> +#include "irq.h" + +#include <linux/kvm_host.h> +#include "trace.h" + +static void pic_irq_request(struct kvm *kvm, int level); + +static void pic_lock(struct kvm_pic *s) + __acquires(&s->lock) +{ + spin_lock(&s->lock); +} + +static void pic_unlock(struct kvm_pic *s) + __releases(&s->lock) +{ + bool wakeup = s->wakeup_needed; + struct kvm_vcpu *vcpu, *found = NULL; + int i; + + s->wakeup_needed = false; + + spin_unlock(&s->lock); + + if (wakeup) { + kvm_for_each_vcpu(i, vcpu, s->kvm) { + if (kvm_apic_accept_pic_intr(vcpu)) { + found = vcpu; + break; + } + } + + if (!found) + found = s->kvm->bsp_vcpu; + + if (!found) + return; + + kvm_make_request(KVM_REQ_EVENT, found); + kvm_vcpu_kick(found); + } +} + +static void pic_clear_isr(struct kvm_kpic_state *s, int irq) +{ + s->isr &= ~(1 << irq); + s->isr_ack |= (1 << irq); + if (s != &s->pics_state->pics[0]) + irq += 8; + /* + * We are dropping lock while calling ack notifiers since ack + * notifier callbacks for assigned devices call into PIC recursively. + * Other interrupt may be delivered to PIC while lock is dropped but + * it should be safe since PIC state is already updated at this stage. + */ + pic_unlock(s->pics_state); + kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); + pic_lock(s->pics_state); +} + +void kvm_pic_clear_isr_ack(struct kvm *kvm) +{ + struct kvm_pic *s = pic_irqchip(kvm); + + pic_lock(s); + s->pics[0].isr_ack = 0xff; + s->pics[1].isr_ack = 0xff; + pic_unlock(s); +} + +/* + * set irq level. If an edge is detected, then the IRR is set to 1 + */ +static inline int pic_set_irq1(struct kvm_kpic_state *s, int irq, int level) +{ + int mask, ret = 1; + mask = 1 << irq; + if (s->elcr & mask) /* level triggered */ + if (level) { + ret = !(s->irr & mask); + s->irr |= mask; + s->last_irr |= mask; + } else { + s->irr &= ~mask; + s->last_irr &= ~mask; + } + else /* edge triggered */ + if (level) { + if ((s->last_irr & mask) == 0) { + ret = !(s->irr & mask); + s->irr |= mask; + } + s->last_irr |= mask; + } else + s->last_irr &= ~mask; + + return (s->imr & mask) ? -1 : ret; +} + +/* + * return the highest priority found in mask (highest = smallest + * number). Return 8 if no irq + */ +static inline int get_priority(struct kvm_kpic_state *s, int mask) +{ + int priority; + if (mask == 0) + return 8; + priority = 0; + while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0) + priority++; + return priority; +} + +/* + * return the pic wanted interrupt. return -1 if none + */ +static int pic_get_irq(struct kvm_kpic_state *s) +{ + int mask, cur_priority, priority; + + mask = s->irr & ~s->imr; + priority = get_priority(s, mask); + if (priority == 8) + return -1; + /* + * compute current priority. If special fully nested mode on the + * master, the IRQ coming from the slave is not taken into account + * for the priority computation. + */ + mask = s->isr; + if (s->special_fully_nested_mode && s == &s->pics_state->pics[0]) + mask &= ~(1 << 2); + cur_priority = get_priority(s, mask); + if (priority < cur_priority) + /* + * higher priority found: an irq should be generated + */ + return (priority + s->priority_add) & 7; + else + return -1; +} + +/* + * raise irq to CPU if necessary. must be called every time the active + * irq may change + */ +static void pic_update_irq(struct kvm_pic *s) +{ + int irq2, irq; + + irq2 = pic_get_irq(&s->pics[1]); + if (irq2 >= 0) { + /* + * if irq request by slave pic, signal master PIC + */ + pic_set_irq1(&s->pics[0], 2, 1); + pic_set_irq1(&s->pics[0], 2, 0); + } + irq = pic_get_irq(&s->pics[0]); + pic_irq_request(s->kvm, irq >= 0); +} + +void kvm_pic_update_irq(struct kvm_pic *s) +{ + pic_lock(s); + pic_update_irq(s); + pic_unlock(s); +} + +int kvm_pic_set_irq(void *opaque, int irq, int level) +{ + struct kvm_pic *s = opaque; + int ret = -1; + + pic_lock(s); + if (irq >= 0 && irq < PIC_NUM_PINS) { + ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); + pic_update_irq(s); + trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, + s->pics[irq >> 3].imr, ret == 0); + } + pic_unlock(s); + + return ret; +} + +/* + * acknowledge interrupt 'irq' + */ +static inline void pic_intack(struct kvm_kpic_state *s, int irq) +{ + s->isr |= 1 << irq; + /* + * We don't clear a level sensitive interrupt here + */ + if (!(s->elcr & (1 << irq))) + s->irr &= ~(1 << irq); + + if (s->auto_eoi) { + if (s->rotate_on_auto_eoi) + s->priority_add = (irq + 1) & 7; + pic_clear_isr(s, irq); + } + +} + +int kvm_pic_read_irq(struct kvm *kvm) +{ + int irq, irq2, intno; + struct kvm_pic *s = pic_irqchip(kvm); + + pic_lock(s); + irq = pic_get_irq(&s->pics[0]); + if (irq >= 0) { + pic_intack(&s->pics[0], irq); + if (irq == 2) { + irq2 = pic_get_irq(&s->pics[1]); + if (irq2 >= 0) + pic_intack(&s->pics[1], irq2); + else + /* + * spurious IRQ on slave controller + */ + irq2 = 7; + intno = s->pics[1].irq_base + irq2; + irq = irq2 + 8; + } else + intno = s->pics[0].irq_base + irq; + } else { + /* + * spurious IRQ on host controller + */ + irq = 7; + intno = s->pics[0].irq_base + irq; + } + pic_update_irq(s); + pic_unlock(s); + + return intno; +} + +void kvm_pic_reset(struct kvm_kpic_state *s) +{ + int irq; + struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu; + u8 irr = s->irr, isr = s->imr; + + s->last_irr = 0; + s->irr = 0; + s->imr = 0; + s->isr = 0; + s->isr_ack = 0xff; + s->priority_add = 0; + s->irq_base = 0; + s->read_reg_select = 0; + s->poll = 0; + s->special_mask = 0; + s->init_state = 0; + s->auto_eoi = 0; + s->rotate_on_auto_eoi = 0; + s->special_fully_nested_mode = 0; + s->init4 = 0; + + for (irq = 0; irq < PIC_NUM_PINS/2; irq++) { + if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) + if (irr & (1 << irq) || isr & (1 << irq)) { + pic_clear_isr(s, irq); + } + } +} + +static void pic_ioport_write(void *opaque, u32 addr, u32 val) +{ + struct kvm_kpic_state *s = opaque; + int priority, cmd, irq; + + addr &= 1; + if (addr == 0) { + if (val & 0x10) { + s->init4 = val & 1; + s->last_irr = 0; + s->imr = 0; + s->priority_add = 0; + s->special_mask = 0; + s->read_reg_select = 0; + if (!s->init4) { + s->special_fully_nested_mode = 0; + s->auto_eoi = 0; + } + s->init_state = 1; + if (val & 0x02) + printk(KERN_ERR "single mode not supported"); + if (val & 0x08) + printk(KERN_ERR + "level sensitive irq not supported"); + } else if (val & 0x08) { + if (val & 0x04) + s->poll = 1; + if (val & 0x02) + s->read_reg_select = val & 1; + if (val & 0x40) + s->special_mask = (val >> 5) & 1; + } else { + cmd = val >> 5; + switch (cmd) { + case 0: + case 4: + s->rotate_on_auto_eoi = cmd >> 2; + break; + case 1: /* end of interrupt */ + case 5: + priority = get_priority(s, s->isr); + if (priority != 8) { + irq = (priority + s->priority_add) & 7; + if (cmd == 5) + s->priority_add = (irq + 1) & 7; + pic_clear_isr(s, irq); + pic_update_irq(s->pics_state); + } + break; + case 3: + irq = val & 7; + pic_clear_isr(s, irq); + pic_update_irq(s->pics_state); + break; + case 6: + s->priority_add = (val + 1) & 7; + pic_update_irq(s->pics_state); + break; + case 7: + irq = val & 7; + s->priority_add = (irq + 1) & 7; + pic_clear_isr(s, irq); + pic_update_irq(s->pics_state); + break; + default: + break; /* no operation */ + } + } + } else + switch (s->init_state) { + case 0: { /* normal mode */ + u8 imr_diff = s->imr ^ val, + off = (s == &s->pics_state->pics[0]) ? 0 : 8; + s->imr = val; + for (irq = 0; irq < PIC_NUM_PINS/2; irq++) + if (imr_diff & (1 << irq)) + kvm_fire_mask_notifiers( + s->pics_state->kvm, + SELECT_PIC(irq + off), + irq + off, + !!(s->imr & (1 << irq))); + pic_update_irq(s->pics_state); + break; + } + case 1: + s->irq_base = val & 0xf8; + s->init_state = 2; + break; + case 2: + if (s->init4) + s->init_state = 3; + else + s->init_state = 0; + break; + case 3: + s->special_fully_nested_mode = (val >> 4) & 1; + s->auto_eoi = (val >> 1) & 1; + s->init_state = 0; + break; + } +} + +static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) +{ + int ret; + + ret = pic_get_irq(s); + if (ret >= 0) { + if (addr1 >> 7) { + s->pics_state->pics[0].isr &= ~(1 << 2); + s->pics_state->pics[0].irr &= ~(1 << 2); + } + s->irr &= ~(1 << ret); + pic_clear_isr(s, ret); + if (addr1 >> 7 || ret != 2) + pic_update_irq(s->pics_state); + } else { + ret = 0x07; + pic_update_irq(s->pics_state); + } + + return ret; +} + +static u32 pic_ioport_read(void *opaque, u32 addr1) +{ + struct kvm_kpic_state *s = opaque; + unsigned int addr; + int ret; + + addr = addr1; + addr &= 1; + if (s->poll) { + ret = pic_poll_read(s, addr1); + s->poll = 0; + } else + if (addr == 0) + if (s->read_reg_select) + ret = s->isr; + else + ret = s->irr; + else + ret = s->imr; + return ret; +} + +static void elcr_ioport_write(void *opaque, u32 addr, u32 val) +{ + struct kvm_kpic_state *s = opaque; + s->elcr = val & s->elcr_mask; +} + +static u32 elcr_ioport_read(void *opaque, u32 addr1) +{ + struct kvm_kpic_state *s = opaque; + return s->elcr; +} + +static int picdev_in_range(gpa_t addr) +{ + switch (addr) { + case 0x20: + case 0x21: + case 0xa0: + case 0xa1: + case 0x4d0: + case 0x4d1: + return 1; + default: + return 0; + } +} + +static inline struct kvm_pic *to_pic(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_pic, dev); +} + +static int picdev_write(struct kvm_io_device *this, + gpa_t addr, int len, const void *val) +{ + struct kvm_pic *s = to_pic(this); + unsigned char data = *(unsigned char *)val; + if (!picdev_in_range(addr)) + return -EOPNOTSUPP; + + if (len != 1) { + if (printk_ratelimit()) + printk(KERN_ERR "PIC: non byte write\n"); + return 0; + } + pic_lock(s); + switch (addr) { + case 0x20: + case 0x21: + case 0xa0: + case 0xa1: + pic_ioport_write(&s->pics[addr >> 7], addr, data); + break; + case 0x4d0: + case 0x4d1: + elcr_ioport_write(&s->pics[addr & 1], addr, data); + break; + } + pic_unlock(s); + return 0; +} + +static int picdev_read(struct kvm_io_device *this, + gpa_t addr, int len, void *val) +{ + struct kvm_pic *s = to_pic(this); + unsigned char data = 0; + if (!picdev_in_range(addr)) + return -EOPNOTSUPP; + + if (len != 1) { + if (printk_ratelimit()) + printk(KERN_ERR "PIC: non byte read\n"); + return 0; + } + pic_lock(s); + switch (addr) { + case 0x20: + case 0x21: + case 0xa0: + case 0xa1: + data = pic_ioport_read(&s->pics[addr >> 7], addr); + break; + case 0x4d0: + case 0x4d1: + data = elcr_ioport_read(&s->pics[addr & 1], addr); + break; + } + *(unsigned char *)val = data; + pic_unlock(s); + return 0; +} + +/* + * callback when PIC0 irq status changed + */ +static void pic_irq_request(struct kvm *kvm, int level) +{ + struct kvm_vcpu *vcpu = kvm->bsp_vcpu; + struct kvm_pic *s = pic_irqchip(kvm); + int irq = pic_get_irq(&s->pics[0]); + + s->output = level; + if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { + s->pics[0].isr_ack &= ~(1 << irq); + s->wakeup_needed = true; + } +} + +static const struct kvm_io_device_ops picdev_ops = { + .read = picdev_read, + .write = picdev_write, +}; + +struct kvm_pic *kvm_create_pic(struct kvm *kvm) +{ + struct kvm_pic *s; + int ret; + + s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); + if (!s) + return NULL; + spin_lock_init(&s->lock); + s->kvm = kvm; + s->pics[0].elcr_mask = 0xf8; + s->pics[1].elcr_mask = 0xde; + s->pics[0].pics_state = s; + s->pics[1].pics_state = s; + s->pics[0].isr_ack = 0xff; + s->pics[1].isr_ack = 0xff; + + /* + * Initialize PIO device + */ + kvm_iodevice_init(&s->dev, &picdev_ops); + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev); + mutex_unlock(&kvm->slots_lock); + if (ret < 0) { + kfree(s); + return NULL; + } + + return s; +} + +void kvm_destroy_pic(struct kvm *kvm) +{ + struct kvm_pic *vpic = kvm->arch.vpic; + + if (vpic) { + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev); + kvm->arch.vpic = NULL; + kfree(vpic); + } +} diff --git a/linux/x86/ioapic.c b/linux/x86/ioapic.c new file mode 100644 index 0000000..f19b670 --- /dev/null +++ b/linux/x86/ioapic.c @@ -0,0 +1,481 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +/* + * Copyright (C) 2001 MandrakeSoft S.A. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * MandrakeSoft S.A. + * 43, rue d'Aboukir + * 75002 Paris - France + * http://www.linux-mandrake.com/ + * http://www.mandrakesoft.com/ + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Yunhong Jiang <yunhong.jiang@intel.com> + * Yaozu (Eddie) Dong <eddie.dong@intel.com> + * Based on Xen 3.1 code. + */ + +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/smp.h> +#include <linux/hrtimer.h> +#include <linux/io.h> +#include <linux/slab.h> +#include <asm/processor.h> +#include <asm/page.h> +#include <asm/current.h> +#include <trace/events/kvm.h> + +#include "ioapic.h" +#include "lapic.h" +#include "irq.h" + +#if 0 +#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) +#else +#define ioapic_debug(fmt, arg...) +#endif +static int ioapic_deliver(struct kvm_ioapic *vioapic, int irq); + +static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, + unsigned long addr, + unsigned long length) +{ + unsigned long result = 0; + + switch (ioapic->ioregsel) { + case IOAPIC_REG_VERSION: + result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16) + | (IOAPIC_VERSION_ID & 0xff)); + break; + + case IOAPIC_REG_APIC_ID: + case IOAPIC_REG_ARB_ID: + result = ((ioapic->id & 0xf) << 24); + break; + + default: + { + u32 redir_index = (ioapic->ioregsel - 0x10) >> 1; + u64 redir_content; + + ASSERT(redir_index < IOAPIC_NUM_PINS); + + redir_content = ioapic->redirtbl[redir_index].bits; + result = (ioapic->ioregsel & 0x1) ? + (redir_content >> 32) & 0xffffffff : + redir_content & 0xffffffff; + break; + } + } + + return result; +} + +static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) +{ + union kvm_ioapic_redirect_entry *pent; + int injected = -1; + + pent = &ioapic->redirtbl[idx]; + + if (!pent->fields.mask) { + injected = ioapic_deliver(ioapic, idx); + if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG) + pent->fields.remote_irr = 1; + } + + return injected; +} + +static void update_handled_vectors(struct kvm_ioapic *ioapic) +{ + DECLARE_BITMAP(handled_vectors, 256); + int i; + + memset(handled_vectors, 0, sizeof(handled_vectors)); + for (i = 0; i < IOAPIC_NUM_PINS; ++i) + __set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors); + memcpy(ioapic->handled_vectors, handled_vectors, + sizeof(handled_vectors)); + smp_wmb(); +} + +static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) +{ + unsigned index; + bool mask_before, mask_after; + union kvm_ioapic_redirect_entry *e; + + switch (ioapic->ioregsel) { + case IOAPIC_REG_VERSION: + /* Writes are ignored. */ + break; + + case IOAPIC_REG_APIC_ID: + ioapic->id = (val >> 24) & 0xf; + break; + + case IOAPIC_REG_ARB_ID: + break; + + default: + index = (ioapic->ioregsel - 0x10) >> 1; + + ioapic_debug("change redir index %x val %x\n", index, val); + if (index >= IOAPIC_NUM_PINS) + return; + e = &ioapic->redirtbl[index]; + mask_before = e->fields.mask; + if (ioapic->ioregsel & 1) { + e->bits &= 0xffffffff; + e->bits |= (u64) val << 32; + } else { + e->bits &= ~0xffffffffULL; + e->bits |= (u32) val; + e->fields.remote_irr = 0; + } + update_handled_vectors(ioapic); + mask_after = e->fields.mask; + if (mask_before != mask_after) + kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); + if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG + && ioapic->irr & (1 << index)) + ioapic_service(ioapic, index); + break; + } +} + +static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) +{ + union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq]; + struct kvm_lapic_irq irqe; + + ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " + "vector=%x trig_mode=%x\n", + entry->fields.dest, entry->fields.dest_mode, + entry->fields.delivery_mode, entry->fields.vector, + entry->fields.trig_mode); + + irqe.dest_id = entry->fields.dest_id; + irqe.vector = entry->fields.vector; + irqe.dest_mode = entry->fields.dest_mode; + irqe.trig_mode = entry->fields.trig_mode; + irqe.delivery_mode = entry->fields.delivery_mode << 8; + irqe.level = 1; + irqe.shorthand = 0; + +#ifdef CONFIG_X86 + /* Always delivery PIT interrupt to vcpu 0 */ + if (irq == 0) { + irqe.dest_mode = 0; /* Physical mode. */ + /* need to read apic_id from apic regiest since + * it can be rewritten */ + irqe.dest_id = ioapic->kvm->bsp_vcpu->vcpu_id; + } +#endif + return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); +} + +int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) +{ + u32 old_irr; + u32 mask = 1 << irq; + union kvm_ioapic_redirect_entry entry; + int ret = 1; + + spin_lock(&ioapic->lock); + old_irr = ioapic->irr; + if (irq >= 0 && irq < IOAPIC_NUM_PINS) { + entry = ioapic->redirtbl[irq]; + level ^= entry.fields.polarity; + if (!level) + ioapic->irr &= ~mask; + else { + int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG); + ioapic->irr |= mask; + if ((edge && old_irr != ioapic->irr) || + (!edge && !entry.fields.remote_irr)) + ret = ioapic_service(ioapic, irq); + else + ret = 0; /* report coalesced interrupt */ + } + trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0); + } + spin_unlock(&ioapic->lock); + + return ret; +} + +static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector, + int trigger_mode) +{ + int i; + + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; + + if (ent->fields.vector != vector) + continue; + + /* + * We are dropping lock while calling ack notifiers because ack + * notifier callbacks for assigned devices call into IOAPIC + * recursively. Since remote_irr is cleared only after call + * to notifiers if the same vector will be delivered while lock + * is dropped it will be put into irr and will be delivered + * after ack notifier returns. + */ + spin_unlock(&ioapic->lock); + kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i); + spin_lock(&ioapic->lock); + + if (trigger_mode != IOAPIC_LEVEL_TRIG) + continue; + + ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); + ent->fields.remote_irr = 0; + if (!ent->fields.mask && (ioapic->irr & (1 << i))) + ioapic_service(ioapic, i); + } +} + +void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + + smp_rmb(); + if (!test_bit(vector, ioapic->handled_vectors)) + return; + spin_lock(&ioapic->lock); + __kvm_ioapic_update_eoi(ioapic, vector, trigger_mode); + spin_unlock(&ioapic->lock); +} + +static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_ioapic, dev); +} + +static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr) +{ + return ((addr >= ioapic->base_address && + (addr < ioapic->base_address + IOAPIC_MEM_LENGTH))); +} + +static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, + void *val) +{ + struct kvm_ioapic *ioapic = to_ioapic(this); + u32 result; + if (!ioapic_in_range(ioapic, addr)) + return -EOPNOTSUPP; + + ioapic_debug("addr %lx\n", (unsigned long)addr); + ASSERT(!(addr & 0xf)); /* check alignment */ + + addr &= 0xff; + spin_lock(&ioapic->lock); + switch (addr) { + case IOAPIC_REG_SELECT: + result = ioapic->ioregsel; + break; + + case IOAPIC_REG_WINDOW: + result = ioapic_read_indirect(ioapic, addr, len); + break; + + default: + result = 0; + break; + } + spin_unlock(&ioapic->lock); + + switch (len) { + case 8: + *(u64 *) val = result; + break; + case 1: + case 2: + case 4: + memcpy(val, (char *)&result, len); + break; + default: + printk(KERN_WARNING "ioapic: wrong length %d\n", len); + } + return 0; +} + +static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, + const void *val) +{ + struct kvm_ioapic *ioapic = to_ioapic(this); + u32 data; + if (!ioapic_in_range(ioapic, addr)) + return -EOPNOTSUPP; + + ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n", + (void*)addr, len, val); + ASSERT(!(addr & 0xf)); /* check alignment */ + + if (len == 4 || len == 8) + data = *(u32 *) val; + else { + printk(KERN_WARNING "ioapic: Unsupported size %d\n", len); + return 0; + } + + addr &= 0xff; + spin_lock(&ioapic->lock); + switch (addr) { + case IOAPIC_REG_SELECT: + ioapic->ioregsel = data; + break; + + case IOAPIC_REG_WINDOW: + ioapic_write_indirect(ioapic, data); + break; +#ifdef CONFIG_IA64 + case IOAPIC_REG_EOI: + __kvm_ioapic_update_eoi(ioapic, data, IOAPIC_LEVEL_TRIG); + break; +#endif + + default: + break; + } + spin_unlock(&ioapic->lock); + return 0; +} + +void kvm_ioapic_reset(struct kvm_ioapic *ioapic) +{ + int i; + + for (i = 0; i < IOAPIC_NUM_PINS; i++) + ioapic->redirtbl[i].fields.mask = 1; + ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; + ioapic->ioregsel = 0; + ioapic->irr = 0; + ioapic->id = 0; + update_handled_vectors(ioapic); +} + +static const struct kvm_io_device_ops ioapic_mmio_ops = { + .read = ioapic_mmio_read, + .write = ioapic_mmio_write, +}; + +int kvm_ioapic_init(struct kvm *kvm) +{ + struct kvm_ioapic *ioapic; + int ret; + + ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); + if (!ioapic) + return -ENOMEM; + spin_lock_init(&ioapic->lock); + kvm->arch.vioapic = ioapic; + kvm_ioapic_reset(ioapic); + kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); + ioapic->kvm = kvm; + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); + mutex_unlock(&kvm->slots_lock); + if (ret < 0) { + kvm->arch.vioapic = NULL; + kfree(ioapic); + } + + return ret; +} + +void kvm_ioapic_destroy(struct kvm *kvm) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + + if (ioapic) { + kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); + kvm->arch.vioapic = NULL; + kfree(ioapic); + } +} + +int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) +{ + struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); + if (!ioapic) + return -EINVAL; + + spin_lock(&ioapic->lock); + memcpy(state, ioapic, sizeof(struct kvm_ioapic_state)); + spin_unlock(&ioapic->lock); + return 0; +} + +int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) +{ + struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); + if (!ioapic) + return -EINVAL; + + spin_lock(&ioapic->lock); + memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); + update_handled_vectors(ioapic); + spin_unlock(&ioapic->lock); + return 0; +} diff --git a/linux/x86/ioapic.h b/linux/x86/ioapic.h new file mode 100644 index 0000000..0b190c3 --- /dev/null +++ b/linux/x86/ioapic.h @@ -0,0 +1,83 @@ +#ifndef __KVM_IO_APIC_H +#define __KVM_IO_APIC_H + +#include <linux/kvm_host.h> + +#include "iodev.h" + +struct kvm; +struct kvm_vcpu; + +#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS +#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ +#define IOAPIC_EDGE_TRIG 0 +#define IOAPIC_LEVEL_TRIG 1 + +#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000 +#define IOAPIC_MEM_LENGTH 0x100 + +/* Direct registers. */ +#define IOAPIC_REG_SELECT 0x00 +#define IOAPIC_REG_WINDOW 0x10 +#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */ + +/* Indirect registers. */ +#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */ +#define IOAPIC_REG_VERSION 0x01 +#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */ + +/*ioapic delivery mode*/ +#define IOAPIC_FIXED 0x0 +#define IOAPIC_LOWEST_PRIORITY 0x1 +#define IOAPIC_PMI 0x2 +#define IOAPIC_NMI 0x4 +#define IOAPIC_INIT 0x5 +#define IOAPIC_EXTINT 0x7 + +struct kvm_ioapic { + u64 base_address; + u32 ioregsel; + u32 id; + u32 irr; + u32 pad; + union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS]; + unsigned long irq_states[IOAPIC_NUM_PINS]; + struct kvm_io_device dev; + struct kvm *kvm; + void (*ack_notifier)(void *opaque, int irq); + spinlock_t lock; + DECLARE_BITMAP(handled_vectors, 256); +}; + +#ifdef DEBUG +#define ASSERT(x) \ +do { \ + if (!(x)) { \ + printk(KERN_EMERG "assertion failed %s: %d: %s\n", \ + __FILE__, __LINE__, #x); \ + BUG(); \ + } \ +} while (0) +#else +#define ASSERT(x) do { } while (0) +#endif + +static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) +{ + return kvm->arch.vioapic; +} + +int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, + int short_hand, int dest, int dest_mode); +int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); +void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); +int kvm_ioapic_init(struct kvm *kvm); +void kvm_ioapic_destroy(struct kvm *kvm); +int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); +void kvm_ioapic_reset(struct kvm_ioapic *ioapic); +int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq); +int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); +int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); + +#endif diff --git a/linux/x86/iodev.h b/linux/x86/iodev.h new file mode 100644 index 0000000..12fd3ca --- /dev/null +++ b/linux/x86/iodev.h @@ -0,0 +1,70 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef __KVM_IODEV_H__ +#define __KVM_IODEV_H__ + +#include <linux/kvm_types.h> +#include <asm/errno.h> + +struct kvm_io_device; + +/** + * kvm_io_device_ops are called under kvm slots_lock. + * read and write handlers return 0 if the transaction has been handled, + * or non-zero to have it passed to the next device. + **/ +struct kvm_io_device_ops { + int (*read)(struct kvm_io_device *this, + gpa_t addr, + int len, + void *val); + int (*write)(struct kvm_io_device *this, + gpa_t addr, + int len, + const void *val); + void (*destructor)(struct kvm_io_device *this); +}; + + +struct kvm_io_device { + const struct kvm_io_device_ops *ops; +}; + +static inline void kvm_iodevice_init(struct kvm_io_device *dev, + const struct kvm_io_device_ops *ops) +{ + dev->ops = ops; +} + +static inline int kvm_iodevice_read(struct kvm_io_device *dev, + gpa_t addr, int l, void *v) +{ + return dev->ops->read ? dev->ops->read(dev, addr, l, v) : -EOPNOTSUPP; +} + +static inline int kvm_iodevice_write(struct kvm_io_device *dev, + gpa_t addr, int l, const void *v) +{ + return dev->ops->write ? dev->ops->write(dev, addr, l, v) : -EOPNOTSUPP; +} + +static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) +{ + if (dev->ops->destructor) + dev->ops->destructor(dev); +} + +#endif /* __KVM_IODEV_H__ */ diff --git a/linux/x86/iommu.c b/linux/x86/iommu.c new file mode 100644 index 0000000..381142f --- /dev/null +++ b/linux/x86/iommu.c @@ -0,0 +1,356 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) 2006-2008 Intel Corporation + * Copyright IBM Corporation, 2008 + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * Author: Allen M. Kay <allen.m.kay@intel.com> + * Author: Weidong Han <weidong.han@intel.com> + * Author: Ben-Ami Yassour <benami@il.ibm.com> + */ + +#include <linux/list.h> +#include <linux/kvm_host.h> +#include <linux/pci.h> +#include <linux/dmar.h> +#include <linux/iommu.h> +#include <linux/intel-iommu.h> + +static int kvm_iommu_unmap_memslots(struct kvm *kvm); +static void kvm_iommu_put_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages); + +static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, unsigned long size) +{ + gfn_t end_gfn; + pfn_t pfn; + + pfn = gfn_to_pfn_memslot(kvm, slot, gfn); + end_gfn = gfn + (size >> PAGE_SHIFT); + gfn += 1; + + if (is_error_pfn(pfn)) + return pfn; + + while (gfn < end_gfn) + gfn_to_pfn_memslot(kvm, slot, gfn++); + + return pfn; +} + +int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + gfn_t gfn, end_gfn; + pfn_t pfn; + int r = 0; + struct iommu_domain *domain = kvm->arch.iommu_domain; + int flags; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + gfn = slot->base_gfn; + end_gfn = gfn + slot->npages; + + flags = IOMMU_READ | IOMMU_WRITE; + if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY) + flags |= IOMMU_CACHE; + + + while (gfn < end_gfn) { + unsigned long page_size; + + /* Check if already mapped */ + if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) { + gfn += 1; + continue; + } + + /* Get the page size we could use to map */ + page_size = kvm_host_page_size(kvm, gfn); + + /* Make sure the page_size does not exceed the memslot */ + while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn) + page_size >>= 1; + + /* Make sure gfn is aligned to the page size we want to map */ + while ((gfn << PAGE_SHIFT) & (page_size - 1)) + page_size >>= 1; + + /* + * Pin all pages we are about to map in memory. This is + * important because we unmap and unpin in 4kb steps later. + */ + pfn = kvm_pin_pages(kvm, slot, gfn, page_size); + if (is_error_pfn(pfn)) { + gfn += 1; + continue; + } + + /* Map into IO address space */ + r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn), + get_order(page_size), flags); + if (r) { + printk(KERN_ERR "kvm_iommu_map_address:" + "iommu failed to map pfn=%llx\n", pfn); + goto unmap_pages; + } + + gfn += page_size >> PAGE_SHIFT; + + + } + + return 0; + +unmap_pages: + kvm_iommu_put_pages(kvm, slot->base_gfn, gfn); + return r; +} + +static int kvm_iommu_map_memslots(struct kvm *kvm) +{ + int i, idx, r = 0; + struct kvm_memslots *slots; + + idx = srcu_read_lock(&kvm->srcu); + slots = kvm_memslots(kvm); + + for (i = 0; i < slots->nmemslots; i++) { + r = kvm_iommu_map_pages(kvm, &slots->memslots[i]); + if (r) + break; + } + srcu_read_unlock(&kvm->srcu, idx); + + return r; +} + +int kvm_assign_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + struct pci_dev *pdev = NULL; + struct iommu_domain *domain = kvm->arch.iommu_domain; + int r, last_flags; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + pdev = assigned_dev->dev; + if (pdev == NULL) + return -ENODEV; + + r = iommu_attach_device(domain, &pdev->dev); + if (r) { + printk(KERN_ERR "assign device %x:%x:%x.%x failed", + pci_domain_nr(pdev->bus), + pdev->bus->number, + PCI_SLOT(pdev->devfn), + PCI_FUNC(pdev->devfn)); + return r; + } + + last_flags = kvm->arch.iommu_flags; + if (iommu_domain_has_cap(kvm->arch.iommu_domain, + IOMMU_CAP_CACHE_COHERENCY)) + kvm->arch.iommu_flags |= KVM_IOMMU_CACHE_COHERENCY; + + /* Check if need to update IOMMU page table for guest memory */ + if ((last_flags ^ kvm->arch.iommu_flags) == + KVM_IOMMU_CACHE_COHERENCY) { + kvm_iommu_unmap_memslots(kvm); + r = kvm_iommu_map_memslots(kvm); + if (r) + goto out_unmap; + } + + printk(KERN_DEBUG "assign device %x:%x:%x.%x\n", + assigned_dev->host_segnr, + assigned_dev->host_busnr, + PCI_SLOT(assigned_dev->host_devfn), + PCI_FUNC(assigned_dev->host_devfn)); + + return 0; +out_unmap: + kvm_iommu_unmap_memslots(kvm); + return r; +} + +int kvm_deassign_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + struct iommu_domain *domain = kvm->arch.iommu_domain; + struct pci_dev *pdev = NULL; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + pdev = assigned_dev->dev; + if (pdev == NULL) + return -ENODEV; + + iommu_detach_device(domain, &pdev->dev); + + printk(KERN_DEBUG "deassign device %x:%x:%x.%x\n", + assigned_dev->host_segnr, + assigned_dev->host_busnr, + PCI_SLOT(assigned_dev->host_devfn), + PCI_FUNC(assigned_dev->host_devfn)); + + return 0; +} + +int kvm_iommu_map_guest(struct kvm *kvm) +{ + int r; + + if (!iommu_found()) { + printk(KERN_ERR "%s: iommu not found\n", __func__); + return -ENODEV; + } + + kvm->arch.iommu_domain = iommu_domain_alloc(); + if (!kvm->arch.iommu_domain) + return -ENOMEM; + + r = kvm_iommu_map_memslots(kvm); + if (r) + goto out_unmap; + + return 0; + +out_unmap: + kvm_iommu_unmap_memslots(kvm); + return r; +} + +static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages) +{ + unsigned long i; + + for (i = 0; i < npages; ++i) + kvm_release_pfn_clean(pfn + i); +} + +static void kvm_iommu_put_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages) +{ + struct iommu_domain *domain; + gfn_t end_gfn, gfn; + pfn_t pfn; + u64 phys; + + domain = kvm->arch.iommu_domain; + end_gfn = base_gfn + npages; + gfn = base_gfn; + + /* check if iommu exists and in use */ + if (!domain) + return; + + while (gfn < end_gfn) { + unsigned long unmap_pages; + int order; + + /* Get physical address */ + phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); + pfn = phys >> PAGE_SHIFT; + + /* Unmap address from IO address space */ + order = iommu_unmap(domain, gfn_to_gpa(gfn), 0); + unmap_pages = 1ULL << order; + + /* Unpin all pages we just unmapped to not leak any memory */ + kvm_unpin_pages(kvm, pfn, unmap_pages); + + gfn += unmap_pages; + } +} + +static int kvm_iommu_unmap_memslots(struct kvm *kvm) +{ + int i, idx; + struct kvm_memslots *slots; + + idx = srcu_read_lock(&kvm->srcu); + slots = kvm_memslots(kvm); + + for (i = 0; i < slots->nmemslots; i++) { + kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn, + slots->memslots[i].npages); + } + srcu_read_unlock(&kvm->srcu, idx); + + return 0; +} + +int kvm_iommu_unmap_guest(struct kvm *kvm) +{ + struct iommu_domain *domain = kvm->arch.iommu_domain; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + kvm_iommu_unmap_memslots(kvm); + iommu_domain_free(domain); + return 0; +} diff --git a/linux/x86/irq.c b/linux/x86/irq.c new file mode 100644 index 0000000..d1372c0 --- /dev/null +++ b/linux/x86/irq.c @@ -0,0 +1,136 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +/* + * irq.c: API for in kernel interrupt controller + * Copyright (c) 2007, Intel Corporation. + * Copyright 2009 Red Hat, Inc. and/or its affiliates. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * Authors: + * Yaozu (Eddie) Dong <Eddie.dong@intel.com> + * + */ + +#include <linux/module.h> +#include <linux/kvm_host.h> + +#include "irq.h" +#include "i8254.h" +#include "x86.h" + +/* + * check if there are pending timer events + * to be processed. + */ +int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) +{ + return apic_has_pending_timer(vcpu); +} +EXPORT_SYMBOL(kvm_cpu_has_pending_timer); + +/* + * check if there is pending interrupt without + * intack. + */ +int kvm_cpu_has_interrupt(struct kvm_vcpu *v) +{ + struct kvm_pic *s; + + if (!irqchip_in_kernel(v->kvm)) + return v->arch.interrupt.pending; + + if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ + if (kvm_apic_accept_pic_intr(v)) { + s = pic_irqchip(v->kvm); /* PIC */ + return s->output; + } else + return 0; + } + return 1; +} +EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); + +/* + * Read pending interrupt vector and intack. + */ +int kvm_cpu_get_interrupt(struct kvm_vcpu *v) +{ + struct kvm_pic *s; + int vector; + + if (!irqchip_in_kernel(v->kvm)) + return v->arch.interrupt.nr; + + vector = kvm_get_apic_interrupt(v); /* APIC */ + if (vector == -1) { + if (kvm_apic_accept_pic_intr(v)) { + s = pic_irqchip(v->kvm); + s->output = 0; /* PIC */ + vector = kvm_pic_read_irq(v->kvm); + } + } + return vector; +} +EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); + +void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) +{ + kvm_inject_apic_timer_irqs(vcpu); + /* TODO: PIT, RTC etc. */ +} +EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); + +void __kvm_migrate_timers(struct kvm_vcpu *vcpu) +{ + __kvm_migrate_apic_timer(vcpu); + __kvm_migrate_pit_timer(vcpu); +} diff --git a/linux/x86/irq.h b/linux/x86/irq.h new file mode 100644 index 0000000..90f1923 --- /dev/null +++ b/linux/x86/irq.h @@ -0,0 +1,106 @@ +/* + * irq.h: in kernel interrupt controller related definitions + * Copyright (c) 2007, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * Authors: + * Yaozu (Eddie) Dong <Eddie.dong@intel.com> + * + */ + +#ifndef __IRQ_H +#define __IRQ_H + +#include <linux/mm.h> +#include <linux/hrtimer.h> +#include <linux/kvm_host.h> +#include <linux/spinlock.h> + +#include "iodev.h" +#include "ioapic.h" +#include "lapic.h" + +#define PIC_NUM_PINS 16 +#define SELECT_PIC(irq) \ + ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE) + +struct kvm; +struct kvm_vcpu; + +struct kvm_kpic_state { + u8 last_irr; /* edge detection */ + u8 irr; /* interrupt request register */ + u8 imr; /* interrupt mask register */ + u8 isr; /* interrupt service register */ + u8 priority_add; /* highest irq priority */ + u8 irq_base; + u8 read_reg_select; + u8 poll; + u8 special_mask; + u8 init_state; + u8 auto_eoi; + u8 rotate_on_auto_eoi; + u8 special_fully_nested_mode; + u8 init4; /* true if 4 byte init */ + u8 elcr; /* PIIX edge/trigger selection */ + u8 elcr_mask; + u8 isr_ack; /* interrupt ack detection */ + struct kvm_pic *pics_state; +}; + +struct kvm_pic { + spinlock_t lock; + bool wakeup_needed; + unsigned pending_acks; + struct kvm *kvm; + struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ + int output; /* intr from master PIC */ + struct kvm_io_device dev; + void (*ack_notifier)(void *opaque, int irq); + unsigned long irq_states[16]; +}; + +struct kvm_pic *kvm_create_pic(struct kvm *kvm); +void kvm_destroy_pic(struct kvm *kvm); +int kvm_pic_read_irq(struct kvm *kvm); +void kvm_pic_update_irq(struct kvm_pic *s); +void kvm_pic_clear_isr_ack(struct kvm *kvm); + +static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) +{ + return kvm->arch.vpic; +} + +static inline int irqchip_in_kernel(struct kvm *kvm) +{ + int ret; + + ret = (pic_irqchip(kvm) != NULL); + smp_rmb(); + return ret; +} + +void kvm_pic_reset(struct kvm_kpic_state *s); + +void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); +void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); +void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu); +void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); +void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); +void __kvm_migrate_timers(struct kvm_vcpu *vcpu); + +int pit_has_pending_timer(struct kvm_vcpu *vcpu); +int apic_has_pending_timer(struct kvm_vcpu *vcpu); + +#endif diff --git a/linux/x86/irq_comm.c b/linux/x86/irq_comm.c new file mode 100644 index 0000000..646d331 --- /dev/null +++ b/linux/x86/irq_comm.c @@ -0,0 +1,514 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +/* + * irq_comm.c: Common API for in kernel interrupt controller + * Copyright (c) 2007, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * Authors: + * Yaozu (Eddie) Dong <Eddie.dong@intel.com> + * + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + */ + +#include <linux/kvm_host.h> +#include <linux/slab.h> +#include <trace/events/kvm.h> + +#include <asm/msidef.h> +#ifdef CONFIG_IA64 +#include <asm/iosapic.h> +#endif + +#include "irq.h" + +#include "ioapic.h" + +static inline int kvm_irq_line_state(unsigned long *irq_state, + int irq_source_id, int level) +{ + /* Logical OR for level trig interrupt */ + if (level) + set_bit(irq_source_id, irq_state); + else + clear_bit(irq_source_id, irq_state); + + return !!(*irq_state); +} + +static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level) +{ +#ifdef CONFIG_X86 + struct kvm_pic *pic = pic_irqchip(kvm); + level = kvm_irq_line_state(&pic->irq_states[e->irqchip.pin], + irq_source_id, level); + return kvm_pic_set_irq(pic, e->irqchip.pin, level); +#else + return -1; +#endif +} + +static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + level = kvm_irq_line_state(&ioapic->irq_states[e->irqchip.pin], + irq_source_id, level); + + return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level); +} + +inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq) +{ +#ifdef CONFIG_IA64 + return irq->delivery_mode == + (IOSAPIC_LOWEST_PRIORITY << IOSAPIC_DELIVERY_SHIFT); +#else + return irq->delivery_mode == APIC_DM_LOWEST; +#endif +} + +int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq) +{ + int i, r = -1; + struct kvm_vcpu *vcpu, *lowest = NULL; + + if (irq->dest_mode == 0 && irq->dest_id == 0xff && + kvm_is_dm_lowest_prio(irq)) + printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) + continue; + + if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, + irq->dest_id, irq->dest_mode)) + continue; + + if (!kvm_is_dm_lowest_prio(irq)) { + if (r < 0) + r = 0; + r += kvm_apic_set_irq(vcpu, irq); + } else if (kvm_lapic_enabled(vcpu)) { + if (!lowest) + lowest = vcpu; + else if (kvm_apic_compare_prio(vcpu, lowest) < 0) + lowest = vcpu; + } + } + + if (lowest) + r = kvm_apic_set_irq(lowest, irq); + + return r; +} + +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level) +{ + struct kvm_lapic_irq irq; + + if (!level) + return -1; + + trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); + + irq.dest_id = (e->msi.address_lo & + MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; + irq.vector = (e->msi.data & + MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; + irq.dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; + irq.trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data; + irq.delivery_mode = e->msi.data & 0x700; + irq.level = 1; + irq.shorthand = 0; + + /* TODO Deal with RH bit of MSI message address */ + return kvm_irq_delivery_to_apic(kvm, NULL, &irq); +} + +/* + * Return value: + * < 0 Interrupt was ignored (masked or not delivered for other reasons) + * = 0 Interrupt was coalesced (previous irq is still pending) + * > 0 Number of CPUs interrupt was delivered to + */ +int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level) +{ + struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS]; + int ret = -1, i = 0; + struct kvm_irq_routing_table *irq_rt; + struct hlist_node *n; + + trace_kvm_set_irq(irq, level, irq_source_id); + + /* Not possible to detect if the guest uses the PIC or the + * IOAPIC. So set the bit in both. The guest will ignore + * writes to the unused one. + */ + rcu_read_lock(); + irq_rt = rcu_dereference(kvm->irq_routing); + if (irq < irq_rt->nr_rt_entries) + hlist_for_each_entry(e, n, &irq_rt->map[irq], link) + irq_set[i++] = *e; + rcu_read_unlock(); + + while(i--) { + int r; + r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level); + if (r < 0) + continue; + + ret = r + ((ret < 0) ? 0 : ret); + } + + return ret; +} + +void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) +{ + struct kvm_irq_ack_notifier *kian; + struct hlist_node *n; + int gsi; + + trace_kvm_ack_irq(irqchip, pin); + + rcu_read_lock(); + gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; + if (gsi != -1) + hlist_for_each_entry_rcu(kian, n, &kvm->irq_ack_notifier_list, + link) + if (kian->gsi == gsi) + kian->irq_acked(kian); + rcu_read_unlock(); +} + +void kvm_register_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian) +{ + mutex_lock(&kvm->irq_lock); + hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list); + mutex_unlock(&kvm->irq_lock); +} + +void kvm_unregister_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian) +{ + mutex_lock(&kvm->irq_lock); + hlist_del_init_rcu(&kian->link); + mutex_unlock(&kvm->irq_lock); + synchronize_rcu(); +} + +int kvm_request_irq_source_id(struct kvm *kvm) +{ + unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; + int irq_source_id; + + mutex_lock(&kvm->irq_lock); + irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG); + + if (irq_source_id >= BITS_PER_LONG) { + printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n"); + irq_source_id = -EFAULT; + goto unlock; + } + + ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); + set_bit(irq_source_id, bitmap); +unlock: + mutex_unlock(&kvm->irq_lock); + + return irq_source_id; +} + +void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) +{ + int i; + + ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); + + mutex_lock(&kvm->irq_lock); + if (irq_source_id < 0 || + irq_source_id >= BITS_PER_LONG) { + printk(KERN_ERR "kvm: IRQ source ID out of range!\n"); + goto unlock; + } + clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); + if (!irqchip_in_kernel(kvm)) + goto unlock; + + for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) { + clear_bit(irq_source_id, &kvm->arch.vioapic->irq_states[i]); + if (i >= 16) + continue; +#ifdef CONFIG_X86 + clear_bit(irq_source_id, &pic_irqchip(kvm)->irq_states[i]); +#endif + } +unlock: + mutex_unlock(&kvm->irq_lock); +} + +void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn) +{ + mutex_lock(&kvm->irq_lock); + kimn->irq = irq; + hlist_add_head_rcu(&kimn->link, &kvm->mask_notifier_list); + mutex_unlock(&kvm->irq_lock); +} + +void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn) +{ + mutex_lock(&kvm->irq_lock); + hlist_del_rcu(&kimn->link); + mutex_unlock(&kvm->irq_lock); + synchronize_rcu(); +} + +void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, + bool mask) +{ + struct kvm_irq_mask_notifier *kimn; + struct hlist_node *n; + int gsi; + + rcu_read_lock(); + gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; + if (gsi != -1) + hlist_for_each_entry_rcu(kimn, n, &kvm->mask_notifier_list, link) + if (kimn->irq == gsi) + kimn->func(kimn, mask); + rcu_read_unlock(); +} + +void kvm_free_irq_routing(struct kvm *kvm) +{ + /* Called only during vm destruction. Nobody can use the pointer + at this stage */ + kfree(kvm->irq_routing); +} + +static int setup_routing_entry(struct kvm_irq_routing_table *rt, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) +{ + int r = -EINVAL; + int delta; + unsigned max_pin; + struct kvm_kernel_irq_routing_entry *ei; + struct hlist_node *n; + + /* + * Do not allow GSI to be mapped to the same irqchip more than once. + * Allow only one to one mapping between GSI and MSI. + */ + hlist_for_each_entry(ei, n, &rt->map[ue->gsi], link) + if (ei->type == KVM_IRQ_ROUTING_MSI || + ue->u.irqchip.irqchip == ei->irqchip.irqchip) + return r; + + e->gsi = ue->gsi; + e->type = ue->type; + switch (ue->type) { + case KVM_IRQ_ROUTING_IRQCHIP: + delta = 0; + switch (ue->u.irqchip.irqchip) { + case KVM_IRQCHIP_PIC_MASTER: + e->set = kvm_set_pic_irq; + max_pin = 16; + break; + case KVM_IRQCHIP_PIC_SLAVE: + e->set = kvm_set_pic_irq; + max_pin = 16; + delta = 8; + break; + case KVM_IRQCHIP_IOAPIC: + max_pin = KVM_IOAPIC_NUM_PINS; + e->set = kvm_set_ioapic_irq; + break; + default: + goto out; + } + e->irqchip.irqchip = ue->u.irqchip.irqchip; + e->irqchip.pin = ue->u.irqchip.pin + delta; + if (e->irqchip.pin >= max_pin) + goto out; + rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi; + break; + case KVM_IRQ_ROUTING_MSI: + e->set = kvm_set_msi; + e->msi.address_lo = ue->u.msi.address_lo; + e->msi.address_hi = ue->u.msi.address_hi; + e->msi.data = ue->u.msi.data; + break; + default: + goto out; + } + + hlist_add_head(&e->link, &rt->map[e->gsi]); + r = 0; +out: + return r; +} + + +int kvm_set_irq_routing(struct kvm *kvm, + const struct kvm_irq_routing_entry *ue, + unsigned nr, + unsigned flags) +{ + struct kvm_irq_routing_table *new, *old; + u32 i, j, nr_rt_entries = 0; + int r; + + for (i = 0; i < nr; ++i) { + if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES) + return -EINVAL; + nr_rt_entries = max(nr_rt_entries, ue[i].gsi); + } + + nr_rt_entries += 1; + + new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head)) + + (nr * sizeof(struct kvm_kernel_irq_routing_entry)), + GFP_KERNEL); + + if (!new) + return -ENOMEM; + + new->rt_entries = (void *)&new->map[nr_rt_entries]; + + new->nr_rt_entries = nr_rt_entries; + for (i = 0; i < 3; i++) + for (j = 0; j < KVM_IOAPIC_NUM_PINS; j++) + new->chip[i][j] = -1; + + for (i = 0; i < nr; ++i) { + r = -EINVAL; + if (ue->flags) + goto out; + r = setup_routing_entry(new, &new->rt_entries[i], ue); + if (r) + goto out; + ++ue; + } + + mutex_lock(&kvm->irq_lock); + old = kvm->irq_routing; + kvm_irq_routing_update(kvm, new); + mutex_unlock(&kvm->irq_lock); + + synchronize_rcu(); + + new = old; + r = 0; + +out: + kfree(new); + return r; +} + +#define IOAPIC_ROUTING_ENTRY(irq) \ + { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ + .u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC, .u.irqchip.pin = (irq) } +#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq) + +#ifdef CONFIG_X86 +# define PIC_ROUTING_ENTRY(irq) \ + { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ + .u.irqchip.irqchip = SELECT_PIC(irq), .u.irqchip.pin = (irq) % 8 } +# define ROUTING_ENTRY2(irq) \ + IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq) +#else +# define ROUTING_ENTRY2(irq) \ + IOAPIC_ROUTING_ENTRY(irq) +#endif + +static const struct kvm_irq_routing_entry default_routing[] = { + ROUTING_ENTRY2(0), ROUTING_ENTRY2(1), + ROUTING_ENTRY2(2), ROUTING_ENTRY2(3), + ROUTING_ENTRY2(4), ROUTING_ENTRY2(5), + ROUTING_ENTRY2(6), ROUTING_ENTRY2(7), + ROUTING_ENTRY2(8), ROUTING_ENTRY2(9), + ROUTING_ENTRY2(10), ROUTING_ENTRY2(11), + ROUTING_ENTRY2(12), ROUTING_ENTRY2(13), + ROUTING_ENTRY2(14), ROUTING_ENTRY2(15), + ROUTING_ENTRY1(16), ROUTING_ENTRY1(17), + ROUTING_ENTRY1(18), ROUTING_ENTRY1(19), + ROUTING_ENTRY1(20), ROUTING_ENTRY1(21), + ROUTING_ENTRY1(22), ROUTING_ENTRY1(23), +#ifdef CONFIG_IA64 + ROUTING_ENTRY1(24), ROUTING_ENTRY1(25), + ROUTING_ENTRY1(26), ROUTING_ENTRY1(27), + ROUTING_ENTRY1(28), ROUTING_ENTRY1(29), + ROUTING_ENTRY1(30), ROUTING_ENTRY1(31), + ROUTING_ENTRY1(32), ROUTING_ENTRY1(33), + ROUTING_ENTRY1(34), ROUTING_ENTRY1(35), + ROUTING_ENTRY1(36), ROUTING_ENTRY1(37), + ROUTING_ENTRY1(38), ROUTING_ENTRY1(39), + ROUTING_ENTRY1(40), ROUTING_ENTRY1(41), + ROUTING_ENTRY1(42), ROUTING_ENTRY1(43), + ROUTING_ENTRY1(44), ROUTING_ENTRY1(45), + ROUTING_ENTRY1(46), ROUTING_ENTRY1(47), +#endif +}; + +int kvm_setup_default_irq_routing(struct kvm *kvm) +{ + return kvm_set_irq_routing(kvm, default_routing, + ARRAY_SIZE(default_routing), 0); +} diff --git a/linux/x86/kvm_cache_regs.h b/linux/x86/kvm_cache_regs.h new file mode 100644 index 0000000..3377d53 --- /dev/null +++ b/linux/x86/kvm_cache_regs.h @@ -0,0 +1,109 @@ +#ifndef ASM_KVM_CACHE_REGS_H +#define ASM_KVM_CACHE_REGS_H + +#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS +#define KVM_POSSIBLE_CR4_GUEST_BITS \ + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT | X86_CR4_PGE) + +static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail)) + kvm_x86_ops->cache_reg(vcpu, reg); + + return vcpu->arch.regs[reg]; +} + +static inline void kvm_register_write(struct kvm_vcpu *vcpu, + enum kvm_reg reg, + unsigned long val) +{ + vcpu->arch.regs[reg] = val; + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); +} + +static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu) +{ + return kvm_register_read(vcpu, VCPU_REGS_RIP); +} + +static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) +{ + kvm_register_write(vcpu, VCPU_REGS_RIP, val); +} + +static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) +{ + might_sleep(); /* on svm */ + + if (!test_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail)) + kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); + + return vcpu->arch.walk_mmu->pdptrs[index]; +} + +static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index) +{ + load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu)); + + return mmu->pdptrs[index]; +} + +static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) +{ + ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; + if (tmask & vcpu->arch.cr0_guest_owned_bits) + kvm_x86_ops->decache_cr0_guest_bits(vcpu); + return vcpu->arch.cr0 & mask; +} + +static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, ~0UL); +} + +static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) +{ + ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; + if (tmask & vcpu->arch.cr4_guest_owned_bits) + kvm_x86_ops->decache_cr4_guest_bits(vcpu); + return vcpu->arch.cr4 & mask; +} + +static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu) +{ + if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) + kvm_x86_ops->decache_cr3(vcpu); + return vcpu->arch.cr3; +} + +static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, ~0UL); +} + +static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) +{ + return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u) + | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); +} + +static inline void enter_guest_mode(struct kvm_vcpu *vcpu) +{ + vcpu->arch.hflags |= HF_GUEST_MASK; +} + +static inline void leave_guest_mode(struct kvm_vcpu *vcpu) +{ + vcpu->arch.hflags &= ~HF_GUEST_MASK; +} + +static inline bool is_guest_mode(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.hflags & HF_GUEST_MASK; +} + +#endif diff --git a/linux/x86/kvm_main.c b/linux/x86/kvm_main.c new file mode 100644 index 0000000..355948e --- /dev/null +++ b/linux/x86/kvm_main.c @@ -0,0 +1,2679 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "iodev.h" + +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/percpu.h> +#include <linux/mm.h> +#include <linux/miscdevice.h> +#include <linux/vmalloc.h> +#include <linux/reboot.h> +#include <linux/debugfs.h> +#include <linux/highmem.h> +#include <linux/file.h> +#include <linux/sysdev.h> +#include <linux/cpu.h> +#include <linux/sched.h> +#include <linux/cpumask.h> +#include <linux/smp.h> +#include <linux/anon_inodes.h> +#include <linux/profile.h> +#include <linux/kvm_para.h> +#include <linux/pagemap.h> +#include <linux/mman.h> +#include <linux/swap.h> +#include <linux/bitops.h> +#include <linux/spinlock.h> +#include <linux/compat.h> +#include <linux/srcu.h> +#include <linux/hugetlb.h> +#include <linux/slab.h> + +#include <asm/processor.h> +#include <asm/io.h> +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm-generic/bitops/le.h> + +#include "coalesced_mmio.h" +#include "async_pf.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/kvm.h> + +MODULE_INFO(version, "kvm-kmod-2.6.38-rc7"); +MODULE_AUTHOR("Qumranet"); +MODULE_LICENSE("GPL"); + +/* + * Ordering of locks: + * + * kvm->lock --> kvm->slots_lock --> kvm->irq_lock + */ + +DEFINE_SPINLOCK(kvm_lock); +LIST_HEAD(vm_list); + +static cpumask_var_t cpus_hardware_enabled; +static int kvm_usage_count = 0; +static atomic_t hardware_enable_failed; + +struct kmem_cache *kvm_vcpu_cache; +EXPORT_SYMBOL_GPL(kvm_vcpu_cache); + +static __read_mostly struct preempt_ops kvm_preempt_ops; + +struct dentry *kvm_debugfs_dir; + +static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, + unsigned long arg); +static int hardware_enable_all(void); +static void hardware_disable_all(void); + +static void kvm_io_bus_destroy(struct kvm_io_bus *bus); + +bool kvm_rebooting; +EXPORT_SYMBOL_GPL(kvm_rebooting); + +static bool largepages_enabled = true; + +static struct page *hwpoison_page; +static pfn_t hwpoison_pfn; + +static struct page *fault_page; +static pfn_t fault_pfn; + +inline int kvm_is_mmio_pfn(pfn_t pfn) +{ + if (pfn_valid(pfn)) { + int reserved; + struct page *tail = pfn_to_page(pfn); + struct page *head = compound_trans_head(tail); + reserved = PageReserved(head); + if (head != tail) { + /* + * "head" is not a dangling pointer + * (compound_trans_head takes care of that) + * but the hugepage may have been splitted + * from under us (and we may not hold a + * reference count on the head page so it can + * be reused before we run PageReferenced), so + * we've to check PageTail before returning + * what we just read. + */ + smp_rmb(); + if (PageTail(tail)) + return reserved; + } + return PageReserved(tail); + } + + return true; +} + +/* + * Switches to specified vcpu, until a matching vcpu_put() + */ +void vcpu_load(struct kvm_vcpu *vcpu) +{ + int cpu; + + mutex_lock(&vcpu->mutex); + cpu = get_cpu(); + preempt_notifier_register(&vcpu->preempt_notifier); + kvm_arch_vcpu_load(vcpu, cpu); + put_cpu(); +} + +void vcpu_put(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + kvm_arch_vcpu_put(vcpu); + kvm_fire_urn(); + preempt_notifier_unregister(&vcpu->preempt_notifier); + preempt_enable(); + mutex_unlock(&vcpu->mutex); +} + +static void ack_flush(void *_completed) +{ +} + +static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) +{ + int i, cpu, me; + cpumask_var_t cpus; + bool called = true; + struct kvm_vcpu *vcpu; + + zalloc_cpumask_var(&cpus, GFP_ATOMIC); + + raw_spin_lock(&kvm->requests_lock); + me = smp_processor_id(); + kvm_for_each_vcpu(i, vcpu, kvm) { + if (kvm_make_check_request(req, vcpu)) + continue; + cpu = vcpu->cpu; + if (cpus != NULL && cpu != -1 && cpu != me) + cpumask_set_cpu(cpu, cpus); + } + if (unlikely(cpus == NULL)) + smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1); + else if (!cpumask_empty(cpus)) + smp_call_function_many(cpus, ack_flush, NULL, 1); + else + called = false; + raw_spin_unlock(&kvm->requests_lock); + free_cpumask_var(cpus); + return called; +} + +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + int dirty_count = kvm->tlbs_dirty; + + smp_mb(); + if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) + ++kvm->stat.remote_tlb_flush; + cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); +} + +void kvm_reload_remote_mmus(struct kvm *kvm) +{ + make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); +} + +int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) +{ + struct page *page; + int r; + + mutex_init(&vcpu->mutex); + vcpu->cpu = -1; + vcpu->kvm = kvm; + vcpu->vcpu_id = id; + init_waitqueue_head(&vcpu->wq); + kvm_async_pf_vcpu_init(vcpu); + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) { + r = -ENOMEM; + goto fail; + } + vcpu->run = page_address(page); + + r = kvm_arch_vcpu_init(vcpu); + if (r < 0) + goto fail_free_run; + return 0; + +fail_free_run: + free_page((unsigned long)vcpu->run); +fail: + return r; +} +EXPORT_SYMBOL_GPL(kvm_vcpu_init); + +void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + kvm_arch_vcpu_uninit(vcpu); + free_page((unsigned long)vcpu->run); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); + +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) +static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) +{ + return container_of(mn, struct kvm, mmu_notifier); +} + +static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int need_tlb_flush, idx; + + /* + * When ->invalidate_page runs, the linux pte has been zapped + * already but the page is still allocated until + * ->invalidate_page returns. So if we increase the sequence + * here the kvm page fault will notice if the spte can't be + * established because the page is going to be freed. If + * instead the kvm page fault establishes the spte before + * ->invalidate_page runs, kvm_unmap_hva will release it + * before returning. + * + * The sequence increase only need to be seen at spin_unlock + * time, and not at spin_lock time. + * + * Increasing the sequence after the spin_unlock would be + * unsafe because the kvm page fault could then establish the + * pte after kvm_unmap_hva returned, without noticing the page + * is going to be freed. + */ + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + kvm->mmu_notifier_seq++; + need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty; + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); + + /* we've to flush the tlb before the pages can be freed */ + if (need_tlb_flush) + kvm_flush_remote_tlbs(kvm); + +} + +#ifdef MMU_NOTIFIER_HAS_CHANGE_PTE +static +#endif +void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address, + pte_t pte) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int idx; + + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + kvm->mmu_notifier_seq++; + kvm_set_spte_hva(kvm, address, pte); + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); +} + +static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int need_tlb_flush = 0, idx; + + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + /* + * The count increase must become visible at unlock time as no + * spte can be established without taking the mmu_lock and + * count is also read inside the mmu_lock critical section. + */ + kvm->mmu_notifier_count++; + for (; start < end; start += PAGE_SIZE) + need_tlb_flush |= kvm_unmap_hva(kvm, start); + need_tlb_flush |= kvm->tlbs_dirty; + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); + + /* we've to flush the tlb before the pages can be freed */ + if (need_tlb_flush) + kvm_flush_remote_tlbs(kvm); +} + +static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + + spin_lock(&kvm->mmu_lock); + /* + * This sequence increase will notify the kvm page fault that + * the page that is going to be mapped in the spte could have + * been freed. + */ + kvm->mmu_notifier_seq++; + /* + * The above sequence increase must be visible before the + * below count decrease but both values are read by the kvm + * page fault under mmu_lock spinlock so we don't need to add + * a smb_wmb() here in between the two. + */ + kvm->mmu_notifier_count--; + spin_unlock(&kvm->mmu_lock); + + BUG_ON(kvm->mmu_notifier_count < 0); +} + +static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int young, idx; + + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + young = kvm_age_hva(kvm, address); + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); + + if (young) + kvm_flush_remote_tlbs(kvm); + + return young; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,38) +static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int young, idx; + + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + young = kvm_test_age_hva(kvm, address); + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); + + return young; +} +#endif + +static void kvm_mmu_notifier_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int idx; + + idx = srcu_read_lock(&kvm->srcu); + kvm_arch_flush_shadow(kvm); + srcu_read_unlock(&kvm->srcu, idx); +} + +static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { + .invalidate_page = kvm_mmu_notifier_invalidate_page, + .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, + .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, + .clear_flush_young = kvm_mmu_notifier_clear_flush_young, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,38) + .test_young = kvm_mmu_notifier_test_young, +#endif +#ifdef MMU_NOTIFIER_HAS_CHANGE_PTE + .change_pte = kvm_mmu_notifier_change_pte, +#endif + .release = kvm_mmu_notifier_release, +}; + +static int kvm_init_mmu_notifier(struct kvm *kvm) +{ + kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; + return mmu_notifier_register(&kvm->mmu_notifier, current->mm); +} + +#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ + +static int kvm_init_mmu_notifier(struct kvm *kvm) +{ + return 0; +} + +#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ + +static struct kvm *kvm_create_vm(void) +{ + int r, i; + struct kvm *kvm = kvm_arch_alloc_vm(); + + if (!kvm) + return ERR_PTR(-ENOMEM); + + r = kvm_arch_init_vm(kvm); + if (r) + goto out_err_nodisable; + + r = hardware_enable_all(); + if (r) + goto out_err_nodisable; + +#ifdef CONFIG_HAVE_KVM_IRQCHIP + INIT_HLIST_HEAD(&kvm->mask_notifier_list); + INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); +#endif + + r = -ENOMEM; + kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!kvm->memslots) + goto out_err_nosrcu; + if (init_srcu_struct(&kvm->srcu)) + goto out_err_nosrcu; + for (i = 0; i < KVM_NR_BUSES; i++) { + kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), + GFP_KERNEL); + if (!kvm->buses[i]) + goto out_err; + } + + r = kvm_init_mmu_notifier(kvm); + if (r) + goto out_err; + + kvm->mm = current->mm; + mmget(&kvm->mm->mm_count); + spin_lock_init(&kvm->mmu_lock); + raw_spin_lock_init(&kvm->requests_lock); + kvm_eventfd_init(kvm); + mutex_init(&kvm->lock); + mutex_init(&kvm->irq_lock); + mutex_init(&kvm->slots_lock); + atomic_set(&kvm->users_count, 1); + spin_lock(&kvm_lock); + list_add(&kvm->vm_list, &vm_list); + spin_unlock(&kvm_lock); + + return kvm; + +out_err: + cleanup_srcu_struct(&kvm->srcu); +out_err_nosrcu: + hardware_disable_all(); +out_err_nodisable: + for (i = 0; i < KVM_NR_BUSES; i++) + kfree(kvm->buses[i]); + kfree(kvm->memslots); + kvm_arch_free_vm(kvm); + return ERR_PTR(r); +} + +static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) +{ + if (!memslot->dirty_bitmap) + return; + + if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE) + vfree(memslot->dirty_bitmap_head); + else + kfree(memslot->dirty_bitmap_head); + + memslot->dirty_bitmap = NULL; + memslot->dirty_bitmap_head = NULL; +} + +/* + * Free any memory in @free but not in @dont. + */ +static void kvm_free_physmem_slot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + int i; + + if (!dont || free->rmap != dont->rmap) + vfree(free->rmap); + + if (!dont || free->dirty_bitmap != dont->dirty_bitmap) + kvm_destroy_dirty_bitmap(free); + + + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + if (!dont || free->lpage_info[i] != dont->lpage_info[i]) { + vfree(free->lpage_info[i]); + free->lpage_info[i] = NULL; + } + } + + free->npages = 0; + free->rmap = NULL; +} + +void kvm_free_physmem(struct kvm *kvm) +{ + int i; + struct kvm_memslots *slots = kvm->memslots; + + for (i = 0; i < slots->nmemslots; ++i) + kvm_free_physmem_slot(&slots->memslots[i], NULL); + + kfree(kvm->memslots); +} + +static void kvm_destroy_vm(struct kvm *kvm) +{ + int i; + struct mm_struct *mm = kvm->mm; + + kvm_arch_sync_events(kvm); + spin_lock(&kvm_lock); + list_del(&kvm->vm_list); + spin_unlock(&kvm_lock); + kvm_free_irq_routing(kvm); + for (i = 0; i < KVM_NR_BUSES; i++) + kvm_io_bus_destroy(kvm->buses[i]); + kvm_coalesced_mmio_free(kvm); +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) + mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); +#else + kvm_arch_flush_shadow(kvm); +#endif + kvm_arch_destroy_vm(kvm); + kvm_free_physmem(kvm); + cleanup_srcu_struct(&kvm->srcu); + kvm_arch_free_vm(kvm); + hardware_disable_all(); + mmdrop(mm); +} + +void kvm_get_kvm(struct kvm *kvm) +{ + atomic_inc(&kvm->users_count); +} +EXPORT_SYMBOL_GPL(kvm_get_kvm); + +void kvm_put_kvm(struct kvm *kvm) +{ + if (atomic_dec_and_test(&kvm->users_count)) + kvm_destroy_vm(kvm); +} +EXPORT_SYMBOL_GPL(kvm_put_kvm); + + +static int kvm_vm_release(struct inode *inode, struct file *filp) +{ + struct kvm *kvm = filp->private_data; + + kvm_irqfd_release(kvm); + + kvm_put_kvm(kvm); + return 0; +} + +/* + * Allocation size is twice as large as the actual dirty bitmap size. + * This makes it possible to do double buffering: see x86's + * kvm_vm_ioctl_get_dirty_log(). + */ +static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) +{ + unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); + + if (dirty_bytes > PAGE_SIZE) + memslot->dirty_bitmap = vzalloc(dirty_bytes); + else + memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL); + + if (!memslot->dirty_bitmap) + return -ENOMEM; + + memslot->dirty_bitmap_head = memslot->dirty_bitmap; + return 0; +} + +/* + * Allocate some memory and give it an address in the guest physical address + * space. + * + * Discontiguous memory is allowed, mostly for framebuffers. + * + * Must be called holding mmap_sem for write. + */ +int __kvm_set_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + int user_alloc) +{ + int r, flush_shadow = 0; + gfn_t base_gfn; + unsigned long npages; + unsigned long i; + struct kvm_memory_slot *memslot; + struct kvm_memory_slot old, new; + struct kvm_memslots *slots, *old_memslots; + + r = -EINVAL; + /* General sanity checks */ + if (mem->memory_size & (PAGE_SIZE - 1)) + goto out; + if (mem->guest_phys_addr & (PAGE_SIZE - 1)) + goto out; + if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1))) + goto out; + if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) + goto out; + if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) + goto out; + + memslot = &kvm->memslots->memslots[mem->slot]; + base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; + npages = mem->memory_size >> PAGE_SHIFT; + + r = -EINVAL; + if (npages > KVM_MEM_MAX_NR_PAGES) + goto out; + + if (!npages) + mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; + + new = old = *memslot; + + new.id = mem->slot; + new.base_gfn = base_gfn; + new.npages = npages; + new.flags = mem->flags; + + /* Disallow changing a memory slot's size. */ + r = -EINVAL; + if (npages && old.npages && npages != old.npages) + goto out_free; + + /* Check for overlaps */ + r = -EEXIST; + for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { + struct kvm_memory_slot *s = &kvm->memslots->memslots[i]; + + if (s == memslot || !s->npages) + continue; + if (!((base_gfn + npages <= s->base_gfn) || + (base_gfn >= s->base_gfn + s->npages))) + goto out_free; + } + + /* Free page dirty bitmap if unneeded */ + if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) + new.dirty_bitmap = NULL; + + r = -ENOMEM; + + /* Allocate if a slot is being created */ +#ifndef CONFIG_S390 + if (npages && !new.rmap) { + new.rmap = vzalloc(npages * sizeof(*new.rmap)); + + if (!new.rmap) + goto out_free; + + new.user_alloc = user_alloc; + new.userspace_addr = mem->userspace_addr; + } + if (!npages) + goto skip_lpage; + + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + unsigned long ugfn; + unsigned long j; + int lpages; + int level = i + 2; + + /* Avoid unused variable warning if no large pages */ + (void)level; + + if (new.lpage_info[i]) + continue; + + lpages = 1 + ((base_gfn + npages - 1) + >> KVM_HPAGE_GFN_SHIFT(level)); + lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level); + + new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i])); + + if (!new.lpage_info[i]) + goto out_free; + + if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) + new.lpage_info[i][0].write_count = 1; + if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) + new.lpage_info[i][lpages - 1].write_count = 1; + ugfn = new.userspace_addr >> PAGE_SHIFT; + /* + * If the gfn and userspace address are not aligned wrt each + * other, or if explicitly asked to, disable large page + * support for this slot + */ + if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || + !largepages_enabled) + for (j = 0; j < lpages; ++j) + new.lpage_info[i][j].write_count = 1; + } + +skip_lpage: + + /* Allocate page dirty bitmap if needed */ + if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { + if (kvm_create_dirty_bitmap(&new) < 0) + goto out_free; + /* destroy any largepage mappings for dirty tracking */ + if (old.npages) + flush_shadow = 1; + } +#else /* not defined CONFIG_S390 */ + new.user_alloc = user_alloc; + if (user_alloc) + new.userspace_addr = mem->userspace_addr; +#endif /* not defined CONFIG_S390 */ + + if (!npages) { + r = -ENOMEM; + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!slots) + goto out_free; + memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); + if (mem->slot >= slots->nmemslots) + slots->nmemslots = mem->slot + 1; + slots->generation++; + slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID; + + old_memslots = kvm->memslots; + rcu_assign_pointer(kvm->memslots, slots); + kvm_synchronize_srcu_expedited(&kvm->srcu); + /* From this point no new shadow pages pointing to a deleted + * memslot will be created. + * + * validation of sp->gfn happens in: + * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) + * - kvm_is_visible_gfn (mmu_check_roots) + */ + kvm_arch_flush_shadow(kvm); + kfree(old_memslots); + } + + r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc); + if (r) + goto out_free; + + /* map the pages in iommu page table */ + if (npages) { + r = kvm_iommu_map_pages(kvm, &new); + if (r) + goto out_free; + } + + r = -ENOMEM; + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!slots) + goto out_free; + memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); + if (mem->slot >= slots->nmemslots) + slots->nmemslots = mem->slot + 1; + slots->generation++; + + /* actual memory is freed via old in kvm_free_physmem_slot below */ + if (!npages) { + new.rmap = NULL; + new.dirty_bitmap = NULL; + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) + new.lpage_info[i] = NULL; + } + + slots->memslots[mem->slot] = new; + old_memslots = kvm->memslots; + rcu_assign_pointer(kvm->memslots, slots); + kvm_synchronize_srcu_expedited(&kvm->srcu); + + kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); + + kvm_free_physmem_slot(&old, &new); + kfree(old_memslots); + + if (flush_shadow) + kvm_arch_flush_shadow(kvm); + + return 0; + +out_free: + kvm_free_physmem_slot(&new, &old); +out: + return r; + +} +EXPORT_SYMBOL_GPL(__kvm_set_memory_region); + +int kvm_set_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + int user_alloc) +{ + int r; + + mutex_lock(&kvm->slots_lock); + r = __kvm_set_memory_region(kvm, mem, user_alloc); + mutex_unlock(&kvm->slots_lock); + return r; +} +EXPORT_SYMBOL_GPL(kvm_set_memory_region); + +int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, + struct + kvm_userspace_memory_region *mem, + int user_alloc) +{ + if (mem->slot >= KVM_MEMORY_SLOTS) + return -EINVAL; + return kvm_set_memory_region(kvm, mem, user_alloc); +} + +int kvm_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log, int *is_dirty) +{ + struct kvm_memory_slot *memslot; + int r, i; + unsigned long n; + unsigned long any = 0; + + r = -EINVAL; + if (log->slot >= KVM_MEMORY_SLOTS) + goto out; + + memslot = &kvm->memslots->memslots[log->slot]; + r = -ENOENT; + if (!memslot->dirty_bitmap) + goto out; + + n = kvm_dirty_bitmap_bytes(memslot); + + for (i = 0; !any && i < n/sizeof(long); ++i) + any = memslot->dirty_bitmap[i]; + + r = -EFAULT; + if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) + goto out; + + if (any) + *is_dirty = 1; + + r = 0; +out: + return r; +} + +void kvm_disable_largepages(void) +{ + largepages_enabled = false; +} +EXPORT_SYMBOL_GPL(kvm_disable_largepages); + +int is_error_page(struct page *page) +{ + return page == bad_page || page == hwpoison_page || page == fault_page; +} +EXPORT_SYMBOL_GPL(is_error_page); + +int is_error_pfn(pfn_t pfn) +{ + return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn; +} +EXPORT_SYMBOL_GPL(is_error_pfn); + +int is_hwpoison_pfn(pfn_t pfn) +{ + return pfn == hwpoison_pfn; +} +EXPORT_SYMBOL_GPL(is_hwpoison_pfn); + +int is_fault_pfn(pfn_t pfn) +{ + return pfn == fault_pfn; +} +EXPORT_SYMBOL_GPL(is_fault_pfn); + +static inline unsigned long bad_hva(void) +{ + return PAGE_OFFSET; +} + +int kvm_is_error_hva(unsigned long addr) +{ + return addr == bad_hva(); +} +EXPORT_SYMBOL_GPL(kvm_is_error_hva); + +static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots, + gfn_t gfn) +{ + int i; + + for (i = 0; i < slots->nmemslots; ++i) { + struct kvm_memory_slot *memslot = &slots->memslots[i]; + + if (gfn >= memslot->base_gfn + && gfn < memslot->base_gfn + memslot->npages) + return memslot; + } + return NULL; +} + +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +{ + return __gfn_to_memslot(kvm_memslots(kvm), gfn); +} +EXPORT_SYMBOL_GPL(gfn_to_memslot); + +int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_memslots *slots = kvm_memslots(kvm); + + for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { + struct kvm_memory_slot *memslot = &slots->memslots[i]; + + if (memslot->flags & KVM_MEMSLOT_INVALID) + continue; + + if (gfn >= memslot->base_gfn + && gfn < memslot->base_gfn + memslot->npages) + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); + +unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn) +{ + struct vm_area_struct *vma; + unsigned long addr, size; + + size = PAGE_SIZE; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return PAGE_SIZE; + + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, addr); + if (!vma) + goto out; + + size = kvm_vma_kernel_pagesize(vma); + +out: + up_read(¤t->mm->mmap_sem); + + return size; +} + +int memslot_id(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memory_slot *memslot = NULL; + + for (i = 0; i < slots->nmemslots; ++i) { + memslot = &slots->memslots[i]; + + if (gfn >= memslot->base_gfn + && gfn < memslot->base_gfn + memslot->npages) + break; + } + + return memslot - slots->memslots; +} + +static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, + gfn_t *nr_pages) +{ + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + return bad_hva(); + + if (nr_pages) + *nr_pages = slot->npages - (gfn - slot->base_gfn); + + return gfn_to_hva_memslot(slot, gfn); +} + +unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) +{ + return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); +} +EXPORT_SYMBOL_GPL(gfn_to_hva); + +static pfn_t get_fault_pfn(void) +{ + get_page(fault_page); + return fault_pfn; +} + +static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, + bool *async, bool write_fault, bool *writable) +{ + struct page *page[1]; + int npages = 0; + pfn_t pfn; + + /* we can do it either atomically or asynchronously, not both */ + BUG_ON(atomic && async); + + BUG_ON(!write_fault && !writable); + + if (writable) + *writable = true; + + if (atomic || async) + npages = kvm___get_user_pages_fast(addr, 1, 1, page); + + if (unlikely(npages != 1) && !atomic) { + might_sleep(); + + if (writable) + *writable = write_fault; + + npages = get_user_pages_fast(addr, 1, write_fault, page); + + /* map read fault as writable if possible */ + if (unlikely(!write_fault) && npages == 1) { + struct page *wpage[1]; + + npages = kvm___get_user_pages_fast(addr, 1, 1, wpage); + if (npages == 1) { + *writable = true; + put_page(page[0]); + page[0] = wpage[0]; + } + npages = 1; + } + } + + if (unlikely(npages != 1)) { + struct vm_area_struct *vma; + + if (atomic) + return get_fault_pfn(); + + down_read(¤t->mm->mmap_sem); + if (is_hwpoison_address(addr)) { + up_read(¤t->mm->mmap_sem); + get_page(hwpoison_page); + return page_to_pfn(hwpoison_page); + } + + vma = find_vma_intersection(current->mm, addr, addr+1); + + if (vma == NULL) + pfn = get_fault_pfn(); + else if ((vma->vm_flags & VM_PFNMAP)) { + pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + + vma->vm_pgoff; + BUG_ON(!kvm_is_mmio_pfn(pfn)); + } else { + if (async && (vma->vm_flags & VM_WRITE)) + *async = true; + pfn = get_fault_pfn(); + } + up_read(¤t->mm->mmap_sem); + } else + pfn = page_to_pfn(page[0]); + + return pfn; +} + +pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) +{ + return hva_to_pfn(kvm, addr, true, NULL, true, NULL); +} +EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); + +static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, + bool write_fault, bool *writable) +{ + unsigned long addr; + + if (async) + *async = false; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) { + get_page(bad_page); + return page_to_pfn(bad_page); + } + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38) + async = NULL; +#endif + return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable); +} + +pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) +{ + return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL); +} +EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); + +pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, + bool write_fault, bool *writable) +{ + return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable); +} +EXPORT_SYMBOL_GPL(gfn_to_pfn_async); + +pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) +{ + return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL); +} +EXPORT_SYMBOL_GPL(gfn_to_pfn); + +pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, + bool *writable) +{ + return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable); +} +EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); + +pfn_t gfn_to_pfn_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn) +{ + unsigned long addr = gfn_to_hva_memslot(slot, gfn); + return hva_to_pfn(kvm, addr, false, NULL, true, NULL); +} + +int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, + int nr_pages) +{ + unsigned long addr; + gfn_t entry; + + addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry); + if (kvm_is_error_hva(addr)) + return -1; + + if (entry < nr_pages) + return 0; + + return kvm___get_user_pages_fast(addr, nr_pages, 1, pages); +} +EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); + +struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) +{ + pfn_t pfn; + + pfn = gfn_to_pfn(kvm, gfn); + if (!kvm_is_mmio_pfn(pfn)) + return pfn_to_page(pfn); + + WARN_ON(kvm_is_mmio_pfn(pfn)); + + get_page(bad_page); + return bad_page; +} + +EXPORT_SYMBOL_GPL(gfn_to_page); + +void kvm_release_page_clean(struct page *page) +{ + kvm_release_pfn_clean(page_to_pfn(page)); +} +EXPORT_SYMBOL_GPL(kvm_release_page_clean); + +void kvm_release_pfn_clean(pfn_t pfn) +{ + if (!kvm_is_mmio_pfn(pfn)) + put_page(pfn_to_page(pfn)); +} +EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); + +void kvm_release_page_dirty(struct page *page) +{ + kvm_release_pfn_dirty(page_to_pfn(page)); +} +EXPORT_SYMBOL_GPL(kvm_release_page_dirty); + +void kvm_release_pfn_dirty(pfn_t pfn) +{ + kvm_set_pfn_dirty(pfn); + kvm_release_pfn_clean(pfn); +} +EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); + +void kvm_set_page_dirty(struct page *page) +{ + kvm_set_pfn_dirty(page_to_pfn(page)); +} +EXPORT_SYMBOL_GPL(kvm_set_page_dirty); + +void kvm_set_pfn_dirty(pfn_t pfn) +{ + if (!kvm_is_mmio_pfn(pfn)) { + struct page *page = pfn_to_page(pfn); + if (!PageReserved(page)) + SetPageDirty(page); + } +} +EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); + +void kvm_set_pfn_accessed(pfn_t pfn) +{ + if (!kvm_is_mmio_pfn(pfn)) + mark_page_accessed(pfn_to_page(pfn)); +} +EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); + +void kvm_get_pfn(pfn_t pfn) +{ + if (!kvm_is_mmio_pfn(pfn)) + get_page(pfn_to_page(pfn)); +} +EXPORT_SYMBOL_GPL(kvm_get_pfn); + +static int next_segment(unsigned long len, int offset) +{ + if (len > PAGE_SIZE - offset) + return PAGE_SIZE - offset; + else + return len; +} + +int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, + int len) +{ + int r; + unsigned long addr; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return -EFAULT; + r = copy_from_user(data, (void *)addr + offset, len); + if (r) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(kvm_read_guest_page); + +int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + int seg; + int offset = offset_in_page(gpa); + int ret; + + while ((seg = next_segment(len, offset)) != 0) { + ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); + if (ret < 0) + return ret; + offset = 0; + len -= seg; + data += seg; + ++gfn; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_read_guest); + +int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, + unsigned long len) +{ + int r; + unsigned long addr; + gfn_t gfn = gpa >> PAGE_SHIFT; + int offset = offset_in_page(gpa); + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return -EFAULT; + pagefault_disable(); + r = __copy_from_user_inatomic(data, (void *)addr + offset, len); + pagefault_enable(); + if (r) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL(kvm_read_guest_atomic); + +int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, + int offset, int len) +{ + int r; + unsigned long addr; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return -EFAULT; + r = copy_to_user((void *)addr + offset, data, len); + if (r) + return -EFAULT; + mark_page_dirty(kvm, gfn); + return 0; +} +EXPORT_SYMBOL_GPL(kvm_write_guest_page); + +int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, + unsigned long len) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + int seg; + int offset = offset_in_page(gpa); + int ret; + + while ((seg = next_segment(len, offset)) != 0) { + ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); + if (ret < 0) + return ret; + offset = 0; + len -= seg; + data += seg; + ++gfn; + } + return 0; +} + +int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + gpa_t gpa) +{ + struct kvm_memslots *slots = kvm_memslots(kvm); + int offset = offset_in_page(gpa); + gfn_t gfn = gpa >> PAGE_SHIFT; + + ghc->gpa = gpa; + ghc->generation = slots->generation; + ghc->memslot = __gfn_to_memslot(slots, gfn); + ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL); + if (!kvm_is_error_hva(ghc->hva)) + ghc->hva += offset; + else + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); + +int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len) +{ + struct kvm_memslots *slots = kvm_memslots(kvm); + int r; + + if (slots->generation != ghc->generation) + kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa); + + if (kvm_is_error_hva(ghc->hva)) + return -EFAULT; + + r = copy_to_user((void *)ghc->hva, data, len); + if (r) + return -EFAULT; + mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_write_guest_cached); + +int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) +{ + return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page, + offset, len); +} +EXPORT_SYMBOL_GPL(kvm_clear_guest_page); + +int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + int seg; + int offset = offset_in_page(gpa); + int ret; + + while ((seg = next_segment(len, offset)) != 0) { + ret = kvm_clear_guest_page(kvm, gfn, offset, seg); + if (ret < 0) + return ret; + offset = 0; + len -= seg; + ++gfn; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_clear_guest); + +void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, + gfn_t gfn) +{ + if (memslot && memslot->dirty_bitmap) { + unsigned long rel_gfn = gfn - memslot->base_gfn; + + generic___set_le_bit(rel_gfn, memslot->dirty_bitmap); + } +} + +void mark_page_dirty(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *memslot; + + memslot = gfn_to_memslot(kvm, gfn); + mark_page_dirty_in_slot(kvm, memslot, gfn); +} + +/* + * The vCPU has executed a HLT instruction with in-kernel mode enabled. + */ +void kvm_vcpu_block(struct kvm_vcpu *vcpu) +{ + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); + + if (kvm_arch_vcpu_runnable(vcpu)) { + kvm_make_request(KVM_REQ_UNHALT, vcpu); + break; + } + if (kvm_cpu_has_pending_timer(vcpu)) + break; + if (signal_pending(current)) + break; + + schedule(); + } + + finish_wait(&vcpu->wq, &wait); +} + +void kvm_resched(struct kvm_vcpu *vcpu) +{ + if (!need_resched()) + return; + cond_resched(); +} +EXPORT_SYMBOL_GPL(kvm_resched); + +void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu) +{ + ktime_t expires; + DEFINE_WAIT(wait); + + prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); + + /* Sleep for 100 us, and hope lock-holder got scheduled */ + expires = ktime_add_ns(ktime_get(), 100000UL); + schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); + + finish_wait(&vcpu->wq, &wait); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); + +static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct kvm_vcpu *vcpu = vma->vm_file->private_data; + struct page *page; + + if (vmf->pgoff == 0) + page = virt_to_page(vcpu->run); +#ifdef CONFIG_X86 + else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) + page = virt_to_page(vcpu->arch.pio_data); +#endif +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET + else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) + page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); +#endif + else + return VM_FAULT_SIGBUS; + get_page(page); + vmf->page = page; + return 0; +} + +static struct vm_operations_struct kvm_vcpu_vm_ops = { + .fault = kvm_vcpu_fault, +}; + +static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &kvm_vcpu_vm_ops; + return 0; +} + +static int kvm_vcpu_release(struct inode *inode, struct file *filp) +{ + struct kvm_vcpu *vcpu = filp->private_data; + + kvm_put_kvm(vcpu->kvm); + return 0; +} + +static struct file_operations kvm_vcpu_fops = { + .release = kvm_vcpu_release, + .unlocked_ioctl = kvm_vcpu_ioctl, + .compat_ioctl = kvm_vcpu_ioctl, + .mmap = kvm_vcpu_mmap, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35) + .llseek = noop_llseek, +#endif +}; + +/* + * Allocates an inode for the vcpu. + */ +static int create_vcpu_fd(struct kvm_vcpu *vcpu) +{ + return kvm_anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR); +} + +/* + * Creates some virtual cpus. Good luck creating more than one. + */ +static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) +{ + int r; + struct kvm_vcpu *vcpu, *v; + + vcpu = kvm_arch_vcpu_create(kvm, id); + if (IS_ERR(vcpu)) + return PTR_ERR(vcpu); + + preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); + + r = kvm_arch_vcpu_setup(vcpu); + if (r) + return r; + + mutex_lock(&kvm->lock); + if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { + r = -EINVAL; + goto vcpu_destroy; + } + + kvm_for_each_vcpu(r, v, kvm) + if (v->vcpu_id == id) { + r = -EEXIST; + goto vcpu_destroy; + } + + BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); + + /* Now it's all set up, let userspace reach it */ + kvm_get_kvm(kvm); + r = create_vcpu_fd(vcpu); + if (r < 0) { + kvm_put_kvm(kvm); + goto vcpu_destroy; + } + + kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; + smp_wmb(); + atomic_inc(&kvm->online_vcpus); + +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + if (kvm->bsp_vcpu_id == id) + kvm->bsp_vcpu = vcpu; +#endif + mutex_unlock(&kvm->lock); + return r; + +vcpu_destroy: + mutex_unlock(&kvm->lock); + kvm_arch_vcpu_destroy(vcpu); + return r; +} + +static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) +{ + if (sigset) { + sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); + vcpu->sigset_active = 1; + vcpu->sigset = *sigset; + } else + vcpu->sigset_active = 0; + return 0; +} + +static long kvm_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void *argp = (void *)arg; + int r; + struct kvm_fpu *fpu = NULL; + struct kvm_sregs *kvm_sregs = NULL; + + if (vcpu->kvm->mm != current->mm) + return -EIO; + +#if defined(CONFIG_S390) || defined(CONFIG_PPC) + /* + * Special cases: vcpu ioctls that are asynchronous to vcpu execution, + * so vcpu_load() would break it. + */ + if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT) + return kvm_arch_vcpu_ioctl(filp, ioctl, arg); +#endif + + + vcpu_load(vcpu); + switch (ioctl) { + case KVM_RUN: + r = -EINVAL; + if (arg) + goto out; + r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); + trace_kvm_userspace_exit(vcpu->run->exit_reason, r); + break; + case KVM_GET_REGS: { + struct kvm_regs *kvm_regs; + + r = -ENOMEM; + kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); + if (!kvm_regs) + goto out; + r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); + if (r) + goto out_free1; + r = -EFAULT; + if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) + goto out_free1; + r = 0; +out_free1: + kfree(kvm_regs); + break; + } + case KVM_SET_REGS: { + struct kvm_regs *kvm_regs; + + r = -ENOMEM; + kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); + if (!kvm_regs) + goto out; + r = -EFAULT; + if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs))) + goto out_free2; + r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); + if (r) + goto out_free2; + r = 0; +out_free2: + kfree(kvm_regs); + break; + } + case KVM_GET_SREGS: { + kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); + r = -ENOMEM; + if (!kvm_sregs) + goto out; + r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) + goto out; + r = 0; + break; + } + case KVM_SET_SREGS: { + kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL); + r = -ENOMEM; + if (!kvm_sregs) + goto out; + r = -EFAULT; + if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs))) + goto out; + r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); + if (r) + goto out; + r = 0; + break; + } + case KVM_GET_MP_STATE: { + struct kvm_mp_state mp_state; + + r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &mp_state, sizeof mp_state)) + goto out; + r = 0; + break; + } + case KVM_SET_MP_STATE: { + struct kvm_mp_state mp_state; + + r = -EFAULT; + if (copy_from_user(&mp_state, argp, sizeof mp_state)) + goto out; + r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); + if (r) + goto out; + r = 0; + break; + } + case KVM_TRANSLATE: { + struct kvm_translation tr; + + r = -EFAULT; + if (copy_from_user(&tr, argp, sizeof tr)) + goto out; + r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &tr, sizeof tr)) + goto out; + r = 0; + break; + } + case KVM_SET_GUEST_DEBUG: { + struct kvm_guest_debug dbg; + + r = -EFAULT; + if (copy_from_user(&dbg, argp, sizeof dbg)) + goto out; + r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); + if (r) + goto out; + r = 0; + break; + } + case KVM_SET_SIGNAL_MASK: { + struct kvm_signal_mask *sigmask_arg = argp; + struct kvm_signal_mask kvm_sigmask; + sigset_t sigset, *p; + + p = NULL; + if (argp) { + r = -EFAULT; + if (copy_from_user(&kvm_sigmask, argp, + sizeof kvm_sigmask)) + goto out; + r = -EINVAL; + if (kvm_sigmask.len != sizeof sigset) + goto out; + r = -EFAULT; + if (copy_from_user(&sigset, sigmask_arg->sigset, + sizeof sigset)) + goto out; + p = &sigset; + } + r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); + break; + } + case KVM_GET_FPU: { + fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); + r = -ENOMEM; + if (!fpu) + goto out; + r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) + goto out; + r = 0; + break; + } + case KVM_SET_FPU: { + fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL); + r = -ENOMEM; + if (!fpu) + goto out; + r = -EFAULT; + if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu))) + goto out; + r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); + if (r) + goto out; + r = 0; + break; + } + default: + r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); + } +out: + vcpu_put(vcpu); + kfree(fpu); + kfree(kvm_sregs); + return r; +} + +static long kvm_vm_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm *kvm = filp->private_data; + void *argp = (void *)arg; + int r; + + if (kvm->mm != current->mm) + return -EIO; + switch (ioctl) { + case KVM_CREATE_VCPU: + r = kvm_vm_ioctl_create_vcpu(kvm, arg); + if (r < 0) + goto out; + break; + case KVM_SET_USER_MEMORY_REGION: { + struct kvm_userspace_memory_region kvm_userspace_mem; + + r = -EFAULT; + if (copy_from_user(&kvm_userspace_mem, argp, + sizeof kvm_userspace_mem)) + goto out; + + r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1); + if (r) + goto out; + break; + } + case KVM_GET_DIRTY_LOG: { + struct kvm_dirty_log log; + + r = -EFAULT; + if (copy_from_user(&log, argp, sizeof log)) + goto out; + r = kvm_vm_ioctl_get_dirty_log(kvm, &log); + if (r) + goto out; + break; + } +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET + case KVM_REGISTER_COALESCED_MMIO: { + struct kvm_coalesced_mmio_zone zone; + r = -EFAULT; + if (copy_from_user(&zone, argp, sizeof zone)) + goto out; + r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); + if (r) + goto out; + r = 0; + break; + } + case KVM_UNREGISTER_COALESCED_MMIO: { + struct kvm_coalesced_mmio_zone zone; + r = -EFAULT; + if (copy_from_user(&zone, argp, sizeof zone)) + goto out; + r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); + if (r) + goto out; + r = 0; + break; + } +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,33) + case KVM_IRQFD: { + struct kvm_irqfd data; + + r = -EFAULT; + if (copy_from_user(&data, argp, sizeof data)) + goto out; + r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags); + break; + } +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,33) + case KVM_IOEVENTFD: { + struct kvm_ioeventfd data; + + r = -EFAULT; + if (copy_from_user(&data, argp, sizeof data)) + goto out; + r = kvm_ioeventfd(kvm, &data); + break; + } +#endif +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + case KVM_SET_BOOT_CPU_ID: + r = 0; + mutex_lock(&kvm->lock); + if (atomic_read(&kvm->online_vcpus) != 0) + r = -EBUSY; + else + kvm->bsp_vcpu_id = arg; + mutex_unlock(&kvm->lock); + break; +#endif + default: + r = kvm_arch_vm_ioctl(filp, ioctl, arg); + if (r == -ENOTTY) + r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); + } +out: + return r; +} + +#ifdef CONFIG_COMPAT +struct compat_kvm_dirty_log { + __u32 slot; + __u32 padding1; + union { + compat_uptr_t dirty_bitmap; /* one bit per page */ + __u64 padding2; + }; +}; + +static long kvm_vm_compat_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm *kvm = filp->private_data; + int r; + + if (kvm->mm != current->mm) + return -EIO; + switch (ioctl) { + case KVM_GET_DIRTY_LOG: { + struct compat_kvm_dirty_log compat_log; + struct kvm_dirty_log log; + + r = -EFAULT; + if (copy_from_user(&compat_log, (void *)arg, + sizeof(compat_log))) + goto out; + log.slot = compat_log.slot; + log.padding1 = compat_log.padding1; + log.padding2 = compat_log.padding2; + log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); + + r = kvm_vm_ioctl_get_dirty_log(kvm, &log); + if (r) + goto out; + break; + } + default: + r = kvm_vm_ioctl(filp, ioctl, arg); + } + +out: + return r; +} +#endif + +static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page[1]; + unsigned long addr; + int npages; + gfn_t gfn = vmf->pgoff; + struct kvm *kvm = vma->vm_file->private_data; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return VM_FAULT_SIGBUS; + + npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page, + NULL); + if (unlikely(npages != 1)) + return VM_FAULT_SIGBUS; + + vmf->page = page[0]; + return 0; +} + +static struct vm_operations_struct kvm_vm_vm_ops = { + .fault = kvm_vm_fault, +}; + +static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &kvm_vm_vm_ops; + return 0; +} + +static struct file_operations kvm_vm_fops = { + .release = kvm_vm_release, + .unlocked_ioctl = kvm_vm_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = kvm_vm_compat_ioctl, +#endif + .mmap = kvm_vm_mmap, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35) + .llseek = noop_llseek, +#endif +}; + +static int kvm_dev_ioctl_create_vm(void) +{ + int r; + struct kvm *kvm; + + kvm = kvm_create_vm(); + if (IS_ERR(kvm)) + return PTR_ERR(kvm); +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET + r = kvm_coalesced_mmio_init(kvm); + if (r < 0) { + kvm_put_kvm(kvm); + return r; + } +#endif + r = kvm_anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); + if (r < 0) + kvm_put_kvm(kvm); + + return r; +} + +static long kvm_dev_ioctl_check_extension_generic(long arg) +{ + switch (arg) { + case KVM_CAP_USER_MEMORY: + case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: + case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + case KVM_CAP_SET_BOOT_CPU_ID: +#endif + case KVM_CAP_INTERNAL_ERROR_DATA: + return 1; +#ifdef CONFIG_HAVE_KVM_IRQCHIP + case KVM_CAP_IRQ_ROUTING: + return KVM_MAX_IRQ_ROUTES; +#endif + default: + break; + } + return kvm_dev_ioctl_check_extension(arg); +} + +static long kvm_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + long r = -EINVAL; + + switch (ioctl) { + case KVM_GET_API_VERSION: + r = -EINVAL; + if (arg) + goto out; + r = KVM_API_VERSION; + break; + case KVM_CREATE_VM: + r = -EINVAL; + if (arg) + goto out; + r = kvm_dev_ioctl_create_vm(); + break; + case KVM_CHECK_EXTENSION: + r = kvm_dev_ioctl_check_extension_generic(arg); + break; + case KVM_GET_VCPU_MMAP_SIZE: + r = -EINVAL; + if (arg) + goto out; + r = PAGE_SIZE; /* struct kvm_run */ +#ifdef CONFIG_X86 + r += PAGE_SIZE; /* pio data page */ +#endif +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET + r += PAGE_SIZE; /* coalesced mmio ring page */ +#endif + break; + case KVM_TRACE_ENABLE: + case KVM_TRACE_PAUSE: + case KVM_TRACE_DISABLE: + r = -EOPNOTSUPP; + break; + default: + return kvm_arch_dev_ioctl(filp, ioctl, arg); + } +out: + return r; +} + +static struct file_operations kvm_chardev_ops = { + .unlocked_ioctl = kvm_dev_ioctl, + .compat_ioctl = kvm_dev_ioctl, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35) + .llseek = noop_llseek, +#endif +}; + +static struct miscdevice kvm_dev = { + KVM_MINOR, + "kvm", + &kvm_chardev_ops, +}; + +static void hardware_enable_nolock(void *junk) +{ + int cpu = raw_smp_processor_id(); + int r; + + if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) + return; + + cpumask_set_cpu(cpu, cpus_hardware_enabled); + + r = kvm_arch_hardware_enable(NULL); + + if (r) { + cpumask_clear_cpu(cpu, cpus_hardware_enabled); + atomic_inc(&hardware_enable_failed); + printk(KERN_INFO "kvm: enabling virtualization on " + "CPU%d failed\n", cpu); + } +} + +static void hardware_enable(void *junk) +{ + spin_lock(&kvm_lock); + hardware_enable_nolock(junk); + spin_unlock(&kvm_lock); +} + +static void hardware_disable_nolock(void *junk) +{ + int cpu = raw_smp_processor_id(); + + if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) + return; + cpumask_clear_cpu(cpu, cpus_hardware_enabled); + kvm_arch_hardware_disable(NULL); +} + +static void hardware_disable(void *junk) +{ + spin_lock(&kvm_lock); + hardware_disable_nolock(junk); + spin_unlock(&kvm_lock); +} + +static void hardware_disable_all_nolock(void) +{ + BUG_ON(!kvm_usage_count); + + kvm_usage_count--; + if (!kvm_usage_count) + kvm_on_each_cpu(hardware_disable_nolock, NULL, 1); +} + +static void hardware_disable_all(void) +{ + spin_lock(&kvm_lock); + hardware_disable_all_nolock(); + spin_unlock(&kvm_lock); +} + +static int hardware_enable_all(void) +{ + int r = 0; + + spin_lock(&kvm_lock); + + kvm_usage_count++; + if (kvm_usage_count == 1) { + atomic_set(&hardware_enable_failed, 0); + kvm_on_each_cpu(hardware_enable_nolock, NULL, 1); + + if (atomic_read(&hardware_enable_failed)) { + hardware_disable_all_nolock(); + r = -EBUSY; + } + } + + spin_unlock(&kvm_lock); + + return r; +} + +static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, + void *v) +{ + int cpu = (long)v; + + if (!kvm_usage_count) + return NOTIFY_OK; + + val &= ~CPU_TASKS_FROZEN; + switch (val) { + case CPU_DYING: + printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", + cpu); + hardware_disable(NULL); + break; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,28) + case CPU_STARTING: +#else + case CPU_ONLINE: +#endif + printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", + cpu); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,28) + hardware_enable(NULL); +#else + smp_call_function_single(cpu, hardware_enable, NULL, 1); +#endif + break; + } + return NOTIFY_OK; +} + + +asmlinkage void kvm_spurious_fault(void) +{ + /* Fault while not rebooting. We want the trace. */ + BUG(); +} +EXPORT_SYMBOL_GPL(kvm_spurious_fault); + +static int kvm_reboot(struct notifier_block *notifier, unsigned long val, + void *v) +{ + /* + * Some (well, at least mine) BIOSes hang on reboot if + * in vmx root mode. + * + * And Intel TXT required VMX off for all cpu when system shutdown. + */ + printk(KERN_INFO "kvm: exiting hardware virtualization\n"); + kvm_rebooting = true; + kvm_on_each_cpu(hardware_disable_nolock, NULL, 1); + return NOTIFY_OK; +} + +static struct notifier_block kvm_reboot_notifier = { + .notifier_call = kvm_reboot, + .priority = 0, +}; + +static void kvm_io_bus_destroy(struct kvm_io_bus *bus) +{ + int i; + + for (i = 0; i < bus->dev_count; i++) { + struct kvm_io_device *pos = bus->devs[i]; + + kvm_iodevice_destructor(pos); + } + kfree(bus); +} + +/* kvm_io_bus_write - called under kvm->slots_lock */ +int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, + int len, const void *val) +{ + int i; + struct kvm_io_bus *bus; + + bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); + for (i = 0; i < bus->dev_count; i++) + if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) + return 0; + return -EOPNOTSUPP; +} + +/* kvm_io_bus_read - called under kvm->slots_lock */ +int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, + int len, void *val) +{ + int i; + struct kvm_io_bus *bus; + + bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); + for (i = 0; i < bus->dev_count; i++) + if (!kvm_iodevice_read(bus->devs[i], addr, len, val)) + return 0; + return -EOPNOTSUPP; +} + +/* Caller must hold slots_lock. */ +int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev) +{ + struct kvm_io_bus *new_bus, *bus; + + bus = kvm->buses[bus_idx]; + if (bus->dev_count > NR_IOBUS_DEVS-1) + return -ENOSPC; + + new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL); + if (!new_bus) + return -ENOMEM; + memcpy(new_bus, bus, sizeof(struct kvm_io_bus)); + new_bus->devs[new_bus->dev_count++] = dev; + rcu_assign_pointer(kvm->buses[bus_idx], new_bus); + kvm_synchronize_srcu_expedited(&kvm->srcu); + kfree(bus); + + return 0; +} + +/* Caller must hold slots_lock. */ +int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev) +{ + int i, r; + struct kvm_io_bus *new_bus, *bus; + + new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL); + if (!new_bus) + return -ENOMEM; + + bus = kvm->buses[bus_idx]; + memcpy(new_bus, bus, sizeof(struct kvm_io_bus)); + + r = -ENOENT; + for (i = 0; i < new_bus->dev_count; i++) + if (new_bus->devs[i] == dev) { + r = 0; + new_bus->devs[i] = new_bus->devs[--new_bus->dev_count]; + break; + } + + if (r) { + kfree(new_bus); + return r; + } + + rcu_assign_pointer(kvm->buses[bus_idx], new_bus); + kvm_synchronize_srcu_expedited(&kvm->srcu); + kfree(bus); + return r; +} + +static struct notifier_block kvm_cpu_notifier = { + .notifier_call = kvm_cpu_hotplug, +}; + +static int __vm_stat_get(void *_offset, u64 *val) +{ + unsigned offset = (long)_offset; + struct kvm *kvm; + + *val = 0; + spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) + *val += *(u32 *)((void *)kvm + offset); + spin_unlock(&kvm_lock); + return 0; +} + +MAKE_SIMPLE_ATTRIBUTE_GETTER(vm_stat_get) +DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n"); + +static int __vcpu_stat_get(void *_offset, u64 *val) +{ + unsigned offset = (long)_offset; + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int i; + + *val = 0; + spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) + kvm_for_each_vcpu(i, vcpu, kvm) + *val += *(u32 *)((void *)vcpu + offset); + + spin_unlock(&kvm_lock); + return 0; +} + +MAKE_SIMPLE_ATTRIBUTE_GETTER(vcpu_stat_get) +DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n"); + +static const struct file_operations *stat_fops[] = { + [KVM_STAT_VCPU] = &vcpu_stat_fops, + [KVM_STAT_VM] = &vm_stat_fops, +}; + +static void kvm_init_debug(void) +{ + struct kvm_stats_debugfs_item *p; + + kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); + for (p = debugfs_entries; p->name; ++p) + p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir, + (void *)(long)p->offset, + stat_fops[p->kind]); +} + +static void kvm_exit_debug(void) +{ + struct kvm_stats_debugfs_item *p; + + for (p = debugfs_entries; p->name; ++p) + debugfs_remove(p->dentry); + debugfs_remove(kvm_debugfs_dir); +} + +static int kvm_suspend(struct sys_device *dev, pm_message_t state) +{ + if (kvm_usage_count) + hardware_disable_nolock(NULL); + return 0; +} + +static int kvm_resume(struct sys_device *dev) +{ + if (kvm_usage_count) { + WARN_ON(spin_is_locked(&kvm_lock)); + hardware_enable_nolock(NULL); + } + return 0; +} + +static struct sysdev_class kvm_sysdev_class = { + set_kset_name("kvm"), + .suspend = kvm_suspend, + .resume = kvm_resume, +}; + +static struct sys_device kvm_sysdev = { + .id = 0, + .cls = &kvm_sysdev_class, +}; + +struct page *bad_page; +pfn_t bad_pfn; + +static inline +struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) +{ + return container_of(pn, struct kvm_vcpu, preempt_notifier); +} + +static void kvm_sched_in(struct preempt_notifier *pn, int cpu) +{ + struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); + + kvm_arch_vcpu_load(vcpu, cpu); +} + +static void kvm_sched_out(struct preempt_notifier *pn, + struct task_struct *next) +{ + struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); + + kvm_arch_vcpu_put(vcpu); + kvm_fire_urn(); +} + +int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, + struct module *module) +{ + int r; + int cpu; + + r = kvm_init_srcu(); + if (r) + return r; + + preempt_notifier_sys_init(); + + r = kvm_arch_init(opaque); + if (r) + goto out_fail; + + bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + + if (bad_page == NULL) { + r = -ENOMEM; + goto out; + } + + bad_pfn = page_to_pfn(bad_page); + + hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + + if (hwpoison_page == NULL) { + r = -ENOMEM; + goto out_free_0; + } + + hwpoison_pfn = page_to_pfn(hwpoison_page); + + fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + + if (fault_page == NULL) { + r = -ENOMEM; + goto out_free_0; + } + + fault_pfn = page_to_pfn(fault_page); + + if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { + r = -ENOMEM; + goto out_free_0; + } + + r = kvm_arch_hardware_setup(); + if (r < 0) + goto out_free_0a; + + for_each_online_cpu(cpu) { + smp_call_function_single(cpu, + kvm_arch_check_processor_compat, + &r, 1); + if (r < 0) + goto out_free_1; + } + + r = register_cpu_notifier(&kvm_cpu_notifier); + if (r) + goto out_free_2; + register_reboot_notifier(&kvm_reboot_notifier); + + r = sysdev_class_register(&kvm_sysdev_class); + if (r) + goto out_free_3; + + r = sysdev_register(&kvm_sysdev); + if (r) + goto out_free_4; + + /* A kmem cache lets us meet the alignment requirements of fx_save. */ + if (!vcpu_align) + vcpu_align = __alignof__(struct kvm_vcpu); + kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align, + 0, NULL); + if (!kvm_vcpu_cache) { + r = -ENOMEM; + goto out_free_5; + } + + r = kvm_async_pf_init(); + if (r) + goto out_free; + + kvm_chardev_ops.owner = module; +IF_ANON_INODES_DOES_REFCOUNTS( kvm_vm_fops.owner = module;) +IF_ANON_INODES_DOES_REFCOUNTS( kvm_vcpu_fops.owner = module;) + + r = misc_register(&kvm_dev); + if (r) { + printk(KERN_ERR "kvm: misc device register failed\n"); + goto out_unreg; + } + + kvm_preempt_ops.sched_in = kvm_sched_in; + kvm_preempt_ops.sched_out = kvm_sched_out; + + kvm_init_debug(); + + printk("loaded kvm module (kvm-kmod-2.6.38-rc7)\n"); + + kvm_clock_warn_suspend_bug(); + + return 0; + +out_unreg: + kvm_async_pf_deinit(); +out_free: + kmem_cache_destroy(kvm_vcpu_cache); +out_free_5: + sysdev_unregister(&kvm_sysdev); +out_free_4: + sysdev_class_unregister(&kvm_sysdev_class); +out_free_3: + unregister_reboot_notifier(&kvm_reboot_notifier); + unregister_cpu_notifier(&kvm_cpu_notifier); +out_free_2: +out_free_1: + kvm_arch_hardware_unsetup(); +out_free_0a: + free_cpumask_var(cpus_hardware_enabled); +out_free_0: + if (fault_page) + __free_page(fault_page); + if (hwpoison_page) + __free_page(hwpoison_page); + __free_page(bad_page); +out: + kvm_arch_exit(); +out_fail: + preempt_notifier_sys_exit(); + kvm_exit_srcu(); + return r; +} +EXPORT_SYMBOL_GPL(kvm_init); + +void kvm_exit(void) +{ + kvm_exit_debug(); + misc_deregister(&kvm_dev); + kmem_cache_destroy(kvm_vcpu_cache); + kvm_async_pf_deinit(); + sysdev_unregister(&kvm_sysdev); + sysdev_class_unregister(&kvm_sysdev_class); + unregister_reboot_notifier(&kvm_reboot_notifier); + unregister_cpu_notifier(&kvm_cpu_notifier); + kvm_on_each_cpu(hardware_disable_nolock, NULL, 1); + kvm_arch_hardware_unsetup(); + kvm_arch_exit(); + free_cpumask_var(cpus_hardware_enabled); + __free_page(hwpoison_page); + __free_page(bad_page); + preempt_notifier_sys_exit(); + kvm_exit_srcu(); +} +EXPORT_SYMBOL_GPL(kvm_exit); diff --git a/linux/x86/kvm_timer.h b/linux/x86/kvm_timer.h new file mode 100644 index 0000000..64bc6ea --- /dev/null +++ b/linux/x86/kvm_timer.h @@ -0,0 +1,16 @@ + +struct kvm_timer { + struct hrtimer timer; + s64 period; /* unit: ns */ + atomic_t pending; /* accumulated triggered timers */ + bool reinject; + struct kvm_timer_ops *t_ops; + struct kvm *kvm; + struct kvm_vcpu *vcpu; +}; + +struct kvm_timer_ops { + bool (*is_periodic)(struct kvm_timer *); +}; + +enum hrtimer_restart kvm_timer_fn(struct hrtimer *data); diff --git a/linux/x86/lapic.c b/linux/x86/lapic.c new file mode 100644 index 0000000..d3ddaa4 --- /dev/null +++ b/linux/x86/lapic.c @@ -0,0 +1,1328 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif + +/* + * Local APIC virtualization + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright (C) 2007 Novell + * Copyright (C) 2007 Intel + * Copyright 2009 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Dor Laor <dor.laor@qumranet.com> + * Gregory Haskins <ghaskins@novell.com> + * Yaozu (Eddie) Dong <eddie.dong@intel.com> + * + * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/smp.h> +#include <linux/hrtimer.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/math64.h> +#include <linux/slab.h> +#include <asm/processor.h> +#include <asm/msr.h> +#include <asm/page.h> +#include <asm/current.h> +#include <asm/apicdef.h> +#include <asm/atomic.h> +#include "kvm_cache_regs.h" +#include "irq.h" +#include "trace.h" +#include "x86.h" + +#ifndef CONFIG_X86_64 +#define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) +#else +#define mod_64(x, y) ((x) % (y)) +#endif + +#define PRId64 "d" +#define PRIx64 "llx" +#define PRIu64 "u" +#define PRIo64 "o" + +#define APIC_BUS_CYCLE_NS 1 + +/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ +#define apic_debug(fmt, arg...) + +#define APIC_LVT_NUM 6 +/* 14 is the version for Xeon and Pentium 8.4.8*/ +#define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16)) +#define LAPIC_MMIO_LENGTH (1 << 12) +/* followed define is not in apicdef.h */ +#define APIC_SHORT_MASK 0xc0000 +#define APIC_DEST_NOSHORT 0x0 +#define APIC_DEST_MASK 0x800 +#define MAX_APIC_VECTOR 256 + +#define VEC_POS(v) ((v) & (32 - 1)) +#define REG_POS(v) (((v) >> 5) << 4) + +static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off) +{ + return *((u32 *) (apic->regs + reg_off)); +} + +static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) +{ + *((u32 *) (apic->regs + reg_off)) = val; +} + +static inline int apic_test_and_set_vector(int vec, void *bitmap) +{ + return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + +static inline int apic_test_and_clear_vector(int vec, void *bitmap) +{ + return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + +static inline void apic_set_vector(int vec, void *bitmap) +{ + set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + +static inline void apic_clear_vector(int vec, void *bitmap) +{ + clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + +static inline int apic_hw_enabled(struct kvm_lapic *apic) +{ + return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; +} + +static inline int apic_sw_enabled(struct kvm_lapic *apic) +{ + return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED; +} + +static inline int apic_enabled(struct kvm_lapic *apic) +{ + return apic_sw_enabled(apic) && apic_hw_enabled(apic); +} + +#define LVT_MASK \ + (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK) + +#define LINT_MASK \ + (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ + APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) + +static inline int kvm_apic_id(struct kvm_lapic *apic) +{ + return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff; +} + +static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) +{ + return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); +} + +static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type) +{ + return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; +} + +static inline int apic_lvtt_period(struct kvm_lapic *apic) +{ + return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC; +} + +static inline int apic_lvt_nmi_mode(u32 lvt_val) +{ + return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; +} + +void kvm_apic_set_version(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_cpuid_entry2 *feat; + u32 v = APIC_VERSION; + + if (!irqchip_in_kernel(vcpu->kvm)) + return; + + feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); + if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31)))) + v |= APIC_LVR_DIRECTED_EOI; + apic_set_reg(apic, APIC_LVR, v); +} + +static inline int apic_x2apic_mode(struct kvm_lapic *apic) +{ + return apic->vcpu->arch.apic_base & X2APIC_ENABLE; +} + +static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { + LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ + LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ + LVT_MASK | APIC_MODE_MASK, /* LVTPC */ + LINT_MASK, LINT_MASK, /* LVT0-1 */ + LVT_MASK /* LVTERR */ +}; + +static int find_highest_vector(void *bitmap) +{ + u32 *word = bitmap; + int word_offset = MAX_APIC_VECTOR >> 5; + + while ((word_offset != 0) && (word[(--word_offset) << 2] == 0)) + continue; + + if (likely(!word_offset && !word[0])) + return -1; + else + return fls(word[word_offset << 2]) - 1 + (word_offset << 5); +} + +static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) +{ + apic->irr_pending = true; + return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); +} + +static inline int apic_search_irr(struct kvm_lapic *apic) +{ + return find_highest_vector(apic->regs + APIC_IRR); +} + +static inline int apic_find_highest_irr(struct kvm_lapic *apic) +{ + int result; + + if (!apic->irr_pending) + return -1; + + result = apic_search_irr(apic); + ASSERT(result == -1 || result >= 16); + + return result; +} + +static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) +{ + apic->irr_pending = false; + apic_clear_vector(vec, apic->regs + APIC_IRR); + if (apic_search_irr(apic) != -1) + apic->irr_pending = true; +} + +int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + int highest_irr; + + /* This may race with setting of irr in __apic_accept_irq() and + * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq + * will cause vmexit immediately and the value will be recalculated + * on the next vmentry. + */ + if (!apic) + return 0; + highest_irr = apic_find_highest_irr(apic); + + return highest_irr; +} + +static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, + int vector, int level, int trig_mode); + +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + return __apic_accept_irq(apic, irq->delivery_mode, irq->vector, + irq->level, irq->trig_mode); +} + +static inline int apic_find_highest_isr(struct kvm_lapic *apic) +{ + int result; + + result = find_highest_vector(apic->regs + APIC_ISR); + ASSERT(result == -1 || result >= 16); + + return result; +} + +static void apic_update_ppr(struct kvm_lapic *apic) +{ + u32 tpr, isrv, ppr, old_ppr; + int isr; + + old_ppr = apic_get_reg(apic, APIC_PROCPRI); + tpr = apic_get_reg(apic, APIC_TASKPRI); + isr = apic_find_highest_isr(apic); + isrv = (isr != -1) ? isr : 0; + + if ((tpr & 0xf0) >= (isrv & 0xf0)) + ppr = tpr & 0xff; + else + ppr = isrv & 0xf0; + + apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", + apic, ppr, isr, isrv); + + if (old_ppr != ppr) { + apic_set_reg(apic, APIC_PROCPRI, ppr); + if (ppr < old_ppr) + kvm_make_request(KVM_REQ_EVENT, apic->vcpu); + } +} + +static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) +{ + apic_set_reg(apic, APIC_TASKPRI, tpr); + apic_update_ppr(apic); +} + +int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) +{ + return dest == 0xff || kvm_apic_id(apic) == dest; +} + +int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) +{ + int result = 0; + u32 logical_id; + + if (apic_x2apic_mode(apic)) { + logical_id = apic_get_reg(apic, APIC_LDR); + return logical_id & mda; + } + + logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR)); + + switch (apic_get_reg(apic, APIC_DFR)) { + case APIC_DFR_FLAT: + if (logical_id & mda) + result = 1; + break; + case APIC_DFR_CLUSTER: + if (((logical_id >> 4) == (mda >> 0x4)) + && (logical_id & mda & 0xf)) + result = 1; + break; + default: + printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n", + apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR)); + break; + } + + return result; +} + +int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, + int short_hand, int dest, int dest_mode) +{ + int result = 0; + struct kvm_lapic *target = vcpu->arch.apic; + + apic_debug("target %p, source %p, dest 0x%x, " + "dest_mode 0x%x, short_hand 0x%x\n", + target, source, dest, dest_mode, short_hand); + + ASSERT(target); + switch (short_hand) { + case APIC_DEST_NOSHORT: + if (dest_mode == 0) + /* Physical mode. */ + result = kvm_apic_match_physical_addr(target, dest); + else + /* Logical mode. */ + result = kvm_apic_match_logical_addr(target, dest); + break; + case APIC_DEST_SELF: + result = (target == source); + break; + case APIC_DEST_ALLINC: + result = 1; + break; + case APIC_DEST_ALLBUT: + result = (target != source); + break; + default: + printk(KERN_WARNING "Bad dest shorthand value %x\n", + short_hand); + break; + } + + return result; +} + +/* + * Add a pending IRQ into lapic. + * Return 1 if successfully added and 0 if discarded. + */ +static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, + int vector, int level, int trig_mode) +{ + int result = 0; + struct kvm_vcpu *vcpu = apic->vcpu; + + switch (delivery_mode) { + case APIC_DM_LOWEST: + vcpu->arch.apic_arb_prio++; + case APIC_DM_FIXED: + /* FIXME add logic for vcpu on reset */ + if (unlikely(!apic_enabled(apic))) + break; + + if (trig_mode) { + apic_debug("level trig mode for vector %d", vector); + apic_set_vector(vector, apic->regs + APIC_TMR); + } else + apic_clear_vector(vector, apic->regs + APIC_TMR); + + result = !apic_test_and_set_irr(vector, apic); + trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, + trig_mode, vector, !result); + if (!result) { + if (trig_mode) + apic_debug("level trig mode repeatedly for " + "vector %d", vector); + break; + } + + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + break; + + case APIC_DM_REMRD: + printk(KERN_DEBUG "Ignoring delivery mode 3\n"); + break; + + case APIC_DM_SMI: + printk(KERN_DEBUG "Ignoring guest SMI\n"); + break; + + case APIC_DM_NMI: + result = 1; + kvm_inject_nmi(vcpu); + kvm_vcpu_kick(vcpu); + break; + + case APIC_DM_INIT: + if (level) { + result = 1; + if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) + printk(KERN_DEBUG + "INIT on a runnable vcpu %d\n", + vcpu->vcpu_id); + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + } else { + apic_debug("Ignoring de-assert INIT to vcpu %d\n", + vcpu->vcpu_id); + } + break; + + case APIC_DM_STARTUP: + apic_debug("SIPI to vcpu %d vector 0x%02x\n", + vcpu->vcpu_id, vector); + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { + result = 1; + vcpu->arch.sipi_vector = vector; + vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + } + break; + + case APIC_DM_EXTINT: + /* + * Should only be called by kvm_apic_local_deliver() with LVT0, + * before NMI watchdog was enabled. Already handled by + * kvm_apic_accept_pic_intr(). + */ + break; + + default: + printk(KERN_ERR "TODO: unsupported delivery mode %x\n", + delivery_mode); + break; + } + return result; +} + +int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) +{ + return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; +} + +static void apic_set_eoi(struct kvm_lapic *apic) +{ + int vector = apic_find_highest_isr(apic); + int trigger_mode; + /* + * Not every write EOI will has corresponding ISR, + * one example is when Kernel check timer on setup_IO_APIC + */ + if (vector == -1) + return; + + apic_clear_vector(vector, apic->regs + APIC_ISR); + apic_update_ppr(apic); + + if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) + trigger_mode = IOAPIC_LEVEL_TRIG; + else + trigger_mode = IOAPIC_EDGE_TRIG; + if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) + kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); + kvm_make_request(KVM_REQ_EVENT, apic->vcpu); +} + +static void apic_send_ipi(struct kvm_lapic *apic) +{ + u32 icr_low = apic_get_reg(apic, APIC_ICR); + u32 icr_high = apic_get_reg(apic, APIC_ICR2); + struct kvm_lapic_irq irq; + + irq.vector = icr_low & APIC_VECTOR_MASK; + irq.delivery_mode = icr_low & APIC_MODE_MASK; + irq.dest_mode = icr_low & APIC_DEST_MASK; + irq.level = icr_low & APIC_INT_ASSERT; + irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; + irq.shorthand = icr_low & APIC_SHORT_MASK; + if (apic_x2apic_mode(apic)) + irq.dest_id = icr_high; + else + irq.dest_id = GET_APIC_DEST_FIELD(icr_high); + + trace_kvm_apic_ipi(icr_low, irq.dest_id); + + apic_debug("icr_high 0x%x, icr_low 0x%x, " + "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " + "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", + icr_high, icr_low, irq.shorthand, irq.dest_id, + irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, + irq.vector); + + kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); +} + +static u32 apic_get_tmcct(struct kvm_lapic *apic) +{ + ktime_t remaining; + s64 ns; + u32 tmcct; + + ASSERT(apic != NULL); + + /* if initial count is 0, current count should also be 0 */ + if (apic_get_reg(apic, APIC_TMICT) == 0) + return 0; + + remaining = hrtimer_get_remaining(&apic->lapic_timer.timer); + if (ktime_to_ns(remaining) < 0) + remaining = ktime_set(0, 0); + + ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period); + tmcct = div64_u64(ns, + (APIC_BUS_CYCLE_NS * apic->divide_count)); + + return tmcct; +} + +static void __report_tpr_access(struct kvm_lapic *apic, bool write) +{ + struct kvm_vcpu *vcpu = apic->vcpu; + struct kvm_run *run = vcpu->run; + + kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu); + run->tpr_access.rip = kvm_rip_read(vcpu); + run->tpr_access.is_write = write; +} + +static inline void report_tpr_access(struct kvm_lapic *apic, bool write) +{ + if (apic->vcpu->arch.tpr_access_reporting) + __report_tpr_access(apic, write); +} + +static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) +{ + u32 val = 0; + + if (offset >= LAPIC_MMIO_LENGTH) + return 0; + + switch (offset) { + case APIC_ID: + if (apic_x2apic_mode(apic)) + val = kvm_apic_id(apic); + else + val = kvm_apic_id(apic) << 24; + break; + case APIC_ARBPRI: + printk(KERN_WARNING "Access APIC ARBPRI register " + "which is for P6\n"); + break; + + case APIC_TMCCT: /* Timer CCR */ + val = apic_get_tmcct(apic); + break; + + case APIC_TASKPRI: + report_tpr_access(apic, false); + /* fall thru */ + default: + apic_update_ppr(apic); + val = apic_get_reg(apic, offset); + break; + } + + return val; +} + +static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_lapic, dev); +} + +static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len, + void *data) +{ + unsigned char alignment = offset & 0xf; + u32 result; + /* this bitmask has a bit cleared for each reserver register */ + static const u64 rmask = 0x43ff01ffffffe70cULL; + + if ((alignment + len) > 4) { + apic_debug("KVM_APIC_READ: alignment error %x %d\n", + offset, len); + return 1; + } + + if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) { + apic_debug("KVM_APIC_READ: read reserved register %x\n", + offset); + return 1; + } + + result = __apic_read(apic, offset & ~0xf); + + trace_kvm_apic_read(offset, result); + + switch (len) { + case 1: + case 2: + case 4: + memcpy(data, (char *)&result + alignment, len); + break; + default: + printk(KERN_ERR "Local APIC read with len = %x, " + "should be 1,2, or 4 instead\n", len); + break; + } + return 0; +} + +static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) +{ + return apic_hw_enabled(apic) && + addr >= apic->base_address && + addr < apic->base_address + LAPIC_MMIO_LENGTH; +} + +static int apic_mmio_read(struct kvm_io_device *this, + gpa_t address, int len, void *data) +{ + struct kvm_lapic *apic = to_lapic(this); + u32 offset = address - apic->base_address; + + if (!apic_mmio_in_range(apic, address)) + return -EOPNOTSUPP; + + apic_reg_read(apic, offset, len, data); + + return 0; +} + +static void update_divide_count(struct kvm_lapic *apic) +{ + u32 tmp1, tmp2, tdcr; + + tdcr = apic_get_reg(apic, APIC_TDCR); + tmp1 = tdcr & 0xf; + tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; + apic->divide_count = 0x1 << (tmp2 & 0x7); + + apic_debug("timer divide count is 0x%x\n", + apic->divide_count); +} + +static void start_apic_timer(struct kvm_lapic *apic) +{ + ktime_t now = apic->lapic_timer.timer.base->get_time(); + + apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) * + APIC_BUS_CYCLE_NS * apic->divide_count; + atomic_set(&apic->lapic_timer.pending, 0); + + if (!apic->lapic_timer.period) + return; + /* + * Do not allow the guest to program periodic timers with small + * interval, since the hrtimers are not throttled by the host + * scheduler. + */ + if (apic_lvtt_period(apic)) { + if (apic->lapic_timer.period < NSEC_PER_MSEC/2) + apic->lapic_timer.period = NSEC_PER_MSEC/2; + } + + hrtimer_start(&apic->lapic_timer.timer, + ktime_add_ns(now, apic->lapic_timer.period), + HRTIMER_MODE_ABS); + + apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" + PRIx64 ", " + "timer initial count 0x%x, period %lldns, " + "expire @ 0x%016" PRIx64 ".\n", __func__, + APIC_BUS_CYCLE_NS, ktime_to_ns(now), + apic_get_reg(apic, APIC_TMICT), + apic->lapic_timer.period, + ktime_to_ns(ktime_add_ns(now, + apic->lapic_timer.period))); +} + +static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) +{ + int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0)); + + if (apic_lvt_nmi_mode(lvt0_val)) { + if (!nmi_wd_enabled) { + apic_debug("Receive NMI setting on APIC_LVT0 " + "for cpu %d\n", apic->vcpu->vcpu_id); + apic->vcpu->kvm->arch.vapics_in_nmi_mode++; + } + } else if (nmi_wd_enabled) + apic->vcpu->kvm->arch.vapics_in_nmi_mode--; +} + +static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) +{ + int ret = 0; + + trace_kvm_apic_write(reg, val); + + switch (reg) { + case APIC_ID: /* Local APIC ID */ + if (!apic_x2apic_mode(apic)) + apic_set_reg(apic, APIC_ID, val); + else + ret = 1; + break; + + case APIC_TASKPRI: + report_tpr_access(apic, true); + apic_set_tpr(apic, val & 0xff); + break; + + case APIC_EOI: + apic_set_eoi(apic); + break; + + case APIC_LDR: + if (!apic_x2apic_mode(apic)) + apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); + else + ret = 1; + break; + + case APIC_DFR: + if (!apic_x2apic_mode(apic)) + apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); + else + ret = 1; + break; + + case APIC_SPIV: { + u32 mask = 0x3ff; + if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) + mask |= APIC_SPIV_DIRECTED_EOI; + apic_set_reg(apic, APIC_SPIV, val & mask); + if (!(val & APIC_SPIV_APIC_ENABLED)) { + int i; + u32 lvt_val; + + for (i = 0; i < APIC_LVT_NUM; i++) { + lvt_val = apic_get_reg(apic, + APIC_LVTT + 0x10 * i); + apic_set_reg(apic, APIC_LVTT + 0x10 * i, + lvt_val | APIC_LVT_MASKED); + } + atomic_set(&apic->lapic_timer.pending, 0); + + } + break; + } + case APIC_ICR: + /* No delay here, so we always clear the pending bit */ + apic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); + apic_send_ipi(apic); + break; + + case APIC_ICR2: + if (!apic_x2apic_mode(apic)) + val &= 0xff000000; + apic_set_reg(apic, APIC_ICR2, val); + break; + + case APIC_LVT0: + apic_manage_nmi_watchdog(apic, val); + case APIC_LVTT: + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVT1: + case APIC_LVTERR: + /* TODO: Check vector */ + if (!apic_sw_enabled(apic)) + val |= APIC_LVT_MASKED; + + val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4]; + apic_set_reg(apic, reg, val); + + break; + + case APIC_TMICT: + hrtimer_cancel(&apic->lapic_timer.timer); + apic_set_reg(apic, APIC_TMICT, val); + start_apic_timer(apic); + break; + + case APIC_TDCR: + if (val & 4) + printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val); + apic_set_reg(apic, APIC_TDCR, val); + update_divide_count(apic); + break; + + case APIC_ESR: + if (apic_x2apic_mode(apic) && val != 0) { + printk(KERN_ERR "KVM_WRITE:ESR not zero %x\n", val); + ret = 1; + } + break; + + case APIC_SELF_IPI: + if (apic_x2apic_mode(apic)) { + apic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff)); + } else + ret = 1; + break; + default: + ret = 1; + break; + } + if (ret) + apic_debug("Local APIC Write to read-only register %x\n", reg); + return ret; +} + +static int apic_mmio_write(struct kvm_io_device *this, + gpa_t address, int len, const void *data) +{ + struct kvm_lapic *apic = to_lapic(this); + unsigned int offset = address - apic->base_address; + u32 val; + + if (!apic_mmio_in_range(apic, address)) + return -EOPNOTSUPP; + + /* + * APIC register must be aligned on 128-bits boundary. + * 32/64/128 bits registers must be accessed thru 32 bits. + * Refer SDM 8.4.1 + */ + if (len != 4 || (offset & 0xf)) { + /* Don't shout loud, $infamous_os would cause only noise. */ + apic_debug("apic write: bad size=%d %lx\n", len, (long)address); + return 0; + } + + val = *(u32*)data; + + /* too common printing */ + if (offset != APIC_EOI) + apic_debug("%s: offset 0x%x with length 0x%x, and value is " + "0x%x\n", __func__, offset, len, val); + + apic_reg_write(apic, offset & 0xff0, val); + + return 0; +} + +void kvm_free_lapic(struct kvm_vcpu *vcpu) +{ + if (!vcpu->arch.apic) + return; + + hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); + + if (vcpu->arch.apic->regs_page) + __free_page(vcpu->arch.apic->regs_page); + + kfree(vcpu->arch.apic); +} + +/* + *---------------------------------------------------------------------- + * LAPIC interface + *---------------------------------------------------------------------- + */ + +void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (!apic) + return; + apic_set_tpr(apic, ((cr8 & 0x0f) << 4) + | (apic_get_reg(apic, APIC_TASKPRI) & 4)); +} + +u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u64 tpr; + + if (!apic) + return 0; + tpr = (u64) apic_get_reg(apic, APIC_TASKPRI); + + return (tpr & 0xf0) >> 4; +} + +void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (!apic) { + value |= MSR_IA32_APICBASE_BSP; + vcpu->arch.apic_base = value; + return; + } + + if (!kvm_vcpu_is_bsp(apic->vcpu)) + value &= ~MSR_IA32_APICBASE_BSP; + + vcpu->arch.apic_base = value; + if (apic_x2apic_mode(apic)) { + u32 id = kvm_apic_id(apic); + u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf)); + apic_set_reg(apic, APIC_LDR, ldr); + } + apic->base_address = apic->vcpu->arch.apic_base & + MSR_IA32_APICBASE_BASE; + + /* with FSB delivery interrupt, we can restart APIC functionality */ + apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is " + "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address); + +} + +void kvm_lapic_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic; + int i; + + apic_debug("%s\n", __func__); + + ASSERT(vcpu); + apic = vcpu->arch.apic; + ASSERT(apic != NULL); + + /* Stop the timer in case it's a reset to an active apic */ + hrtimer_cancel(&apic->lapic_timer.timer); + + apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); + kvm_apic_set_version(apic->vcpu); + + for (i = 0; i < APIC_LVT_NUM; i++) + apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); + apic_set_reg(apic, APIC_LVT0, + SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); + + apic_set_reg(apic, APIC_DFR, 0xffffffffU); + apic_set_reg(apic, APIC_SPIV, 0xff); + apic_set_reg(apic, APIC_TASKPRI, 0); + apic_set_reg(apic, APIC_LDR, 0); + apic_set_reg(apic, APIC_ESR, 0); + apic_set_reg(apic, APIC_ICR, 0); + apic_set_reg(apic, APIC_ICR2, 0); + apic_set_reg(apic, APIC_TDCR, 0); + apic_set_reg(apic, APIC_TMICT, 0); + for (i = 0; i < 8; i++) { + apic_set_reg(apic, APIC_IRR + 0x10 * i, 0); + apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); + apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); + } + apic->irr_pending = false; + update_divide_count(apic); + atomic_set(&apic->lapic_timer.pending, 0); + if (kvm_vcpu_is_bsp(vcpu)) + vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + apic_update_ppr(apic); + + vcpu->arch.apic_arb_prio = 0; + + apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" + "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, + vcpu, kvm_apic_id(apic), + vcpu->arch.apic_base, apic->base_address); +} + +bool kvm_apic_present(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic); +} + +int kvm_lapic_enabled(struct kvm_vcpu *vcpu) +{ + return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic); +} + +/* + *---------------------------------------------------------------------- + * timer interface + *---------------------------------------------------------------------- + */ + +static bool lapic_is_periodic(struct kvm_timer *ktimer) +{ + struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, + lapic_timer); + return apic_lvtt_period(apic); +} + +int apic_has_pending_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *lapic = vcpu->arch.apic; + + if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT)) + return atomic_read(&lapic->lapic_timer.pending); + + return 0; +} + +static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) +{ + u32 reg = apic_get_reg(apic, lvt_type); + int vector, mode, trig_mode; + + if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { + vector = reg & APIC_VECTOR_MASK; + mode = reg & APIC_MODE_MASK; + trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; + return __apic_accept_irq(apic, mode, vector, 1, trig_mode); + } + return 0; +} + +void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (apic) + kvm_apic_local_deliver(apic, APIC_LVT0); +} + +static struct kvm_timer_ops lapic_timer_ops = { + .is_periodic = lapic_is_periodic, +}; + +static const struct kvm_io_device_ops apic_mmio_ops = { + .read = apic_mmio_read, + .write = apic_mmio_write, +}; + +int kvm_create_lapic(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic; + + ASSERT(vcpu != NULL); + apic_debug("apic_init %d\n", vcpu->vcpu_id); + + apic = kzalloc(sizeof(*apic), GFP_KERNEL); + if (!apic) + goto nomem; + + vcpu->arch.apic = apic; + + apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (apic->regs_page == NULL) { + printk(KERN_ERR "malloc apic regs error for vcpu %x\n", + vcpu->vcpu_id); + goto nomem_free_apic; + } + apic->regs = page_address(apic->regs_page); + apic->vcpu = vcpu; + + hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); + apic->lapic_timer.timer.function = kvm_timer_fn; + apic->lapic_timer.t_ops = &lapic_timer_ops; + apic->lapic_timer.kvm = vcpu->kvm; + apic->lapic_timer.vcpu = vcpu; + + apic->base_address = APIC_DEFAULT_PHYS_BASE; + vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; + + kvm_lapic_reset(vcpu); + kvm_iodevice_init(&apic->dev, &apic_mmio_ops); + + return 0; +nomem_free_apic: + kfree(apic); +nomem: + return -ENOMEM; +} + +int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + int highest_irr; + + if (!apic || !apic_enabled(apic)) + return -1; + + apic_update_ppr(apic); + highest_irr = apic_find_highest_irr(apic); + if ((highest_irr == -1) || + ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI))) + return -1; + return highest_irr; +} + +int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) +{ + u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); + int r = 0; + + if (!apic_hw_enabled(vcpu->arch.apic)) + r = 1; + if ((lvt0 & APIC_LVT_MASKED) == 0 && + GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) + r = 1; + return r; +} + +void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (apic && atomic_read(&apic->lapic_timer.pending) > 0) { + if (kvm_apic_local_deliver(apic, APIC_LVTT)) + atomic_dec(&apic->lapic_timer.pending); + } +} + +int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) +{ + int vector = kvm_apic_has_interrupt(vcpu); + struct kvm_lapic *apic = vcpu->arch.apic; + + if (vector == -1) + return -1; + + apic_set_vector(vector, apic->regs + APIC_ISR); + apic_update_ppr(apic); + apic_clear_irr(vector, apic); + return vector; +} + +void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + apic->base_address = vcpu->arch.apic_base & + MSR_IA32_APICBASE_BASE; + kvm_apic_set_version(vcpu); + + apic_update_ppr(apic); + hrtimer_cancel(&apic->lapic_timer.timer); + update_divide_count(apic); + start_apic_timer(apic); + apic->irr_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); +} + +void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + struct hrtimer *timer; + + if (!apic) + return; + + timer = &apic->lapic_timer.timer; + if (hrtimer_cancel(timer)) + kvm_hrtimer_start_expires(timer, HRTIMER_MODE_ABS); +} + +void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) +{ + u32 data; + void *vapic; + + if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) + return; + + vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0); + data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)); + kunmap_atomic(vapic, KM_USER0); + + apic_set_tpr(vcpu->arch.apic, data & 0xff); +} + +void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) +{ + u32 data, tpr; + int max_irr, max_isr; + struct kvm_lapic *apic; + void *vapic; + + if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) + return; + + apic = vcpu->arch.apic; + tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; + max_irr = apic_find_highest_irr(apic); + if (max_irr < 0) + max_irr = 0; + max_isr = apic_find_highest_isr(apic); + if (max_isr < 0) + max_isr = 0; + data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); + + vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0); + *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data; + kunmap_atomic(vapic, KM_USER0); +} + +void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) +{ + if (!irqchip_in_kernel(vcpu->kvm)) + return; + + vcpu->arch.apic->vapic_addr = vapic_addr; +} + +int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u32 reg = (msr - APIC_BASE_MSR) << 4; + + if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) + return 1; + + /* if this is ICR write vector before command */ + if (msr == 0x830) + apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); + return apic_reg_write(apic, reg, (u32)data); +} + +int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0; + + if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) + return 1; + + if (apic_reg_read(apic, reg, 4, &low)) + return 1; + if (msr == 0x830) + apic_reg_read(apic, APIC_ICR2, 4, &high); + + *data = (((u64)high) << 32) | low; + + return 0; +} + +int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (!irqchip_in_kernel(vcpu->kvm)) + return 1; + + /* if this is ICR write vector before command */ + if (reg == APIC_ICR) + apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); + return apic_reg_write(apic, reg, (u32)data); +} + +int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u32 low, high = 0; + + if (!irqchip_in_kernel(vcpu->kvm)) + return 1; + + if (apic_reg_read(apic, reg, 4, &low)) + return 1; + if (reg == APIC_ICR) + apic_reg_read(apic, APIC_ICR2, 4, &high); + + *data = (((u64)high) << 32) | low; + + return 0; +} diff --git a/linux/x86/lapic.h b/linux/x86/lapic.h new file mode 100644 index 0000000..f5fe32c --- /dev/null +++ b/linux/x86/lapic.h @@ -0,0 +1,59 @@ +#ifndef __KVM_X86_LAPIC_H +#define __KVM_X86_LAPIC_H + +#include "iodev.h" +#include "kvm_timer.h" + +#include <linux/kvm_host.h> + +struct kvm_lapic { + unsigned long base_address; + struct kvm_io_device dev; + struct kvm_timer lapic_timer; + u32 divide_count; + struct kvm_vcpu *vcpu; + bool irr_pending; + struct page *regs_page; + void *regs; + gpa_t vapic_addr; + struct page *vapic_page; +}; +int kvm_create_lapic(struct kvm_vcpu *vcpu); +void kvm_free_lapic(struct kvm_vcpu *vcpu); + +int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); +int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); +int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); +void kvm_lapic_reset(struct kvm_vcpu *vcpu); +u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); +void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); +void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); +u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); +void kvm_apic_set_version(struct kvm_vcpu *vcpu); + +int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); +int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); + +u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); +void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); +void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); +int kvm_lapic_enabled(struct kvm_vcpu *vcpu); +bool kvm_apic_present(struct kvm_vcpu *vcpu); +int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); + +void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); +void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); +void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); + +int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); + +int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); + +static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; +} +#endif diff --git a/linux/x86/mmu.c b/linux/x86/mmu.c new file mode 100644 index 0000000..9713914 --- /dev/null +++ b/linux/x86/mmu.c @@ -0,0 +1,3887 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * MMU support + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Yaniv Kamay <yaniv@qumranet.com> + * Avi Kivity <avi@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "irq.h" +#include "mmu.h" +#include "x86.h" +#include "kvm_cache_regs.h" +#include "x86.h" + +#include <linux/kvm_host.h> +#include <asm/types.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/swap.h> +#include <linux/hugetlb.h> + +#include <linux/srcu.h> +#include <linux/slab.h> +#include <linux/uaccess.h> + +#include <asm/page.h> +#include <asm/cmpxchg.h> +#include <asm/io.h> +#include <asm/vmx.h> + +/* + * When setting this variable to true it enables Two-Dimensional-Paging + * where the hardware walks 2 page tables: + * 1. the guest-virtual to guest-physical + * 2. while doing 1. it walks guest-physical to host-physical + * If the hardware supports that we don't need to do shadow paging. + */ +bool tdp_enabled = false; + +enum { + AUDIT_PRE_PAGE_FAULT, + AUDIT_POST_PAGE_FAULT, + AUDIT_PRE_PTE_WRITE, + AUDIT_POST_PTE_WRITE, + AUDIT_PRE_SYNC, + AUDIT_POST_SYNC +}; + +char *audit_point_name[] = { + "pre page fault", + "post page fault", + "pre pte write", + "post pte write", + "pre sync", + "post sync" +}; + +#undef MMU_DEBUG + +#ifdef MMU_DEBUG + +#define pgprintk(x...) do { if (dbg) printk(x); } while (0) +#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) + +#else + +#define pgprintk(x...) do { } while (0) +#define rmap_printk(x...) do { } while (0) + +#endif + +#ifdef MMU_DEBUG +static int dbg = 0; +module_param(dbg, bool, 0644); +#endif + +static int oos_shadow = 1; +module_param(oos_shadow, bool, 0644); + +#ifndef MMU_DEBUG +#define ASSERT(x) do { } while (0) +#else +#define ASSERT(x) \ + if (!(x)) { \ + printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ + __FILE__, __LINE__, #x); \ + } +#endif + +#define PTE_PREFETCH_NUM 8 + +#define PT_FIRST_AVAIL_BITS_SHIFT 9 +#define PT64_SECOND_AVAIL_BITS_SHIFT 52 + +#define PT64_LEVEL_BITS 9 + +#define PT64_LEVEL_SHIFT(level) \ + (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) + +#define PT64_LEVEL_MASK(level) \ + (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) + +#define PT64_INDEX(address, level)\ + (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) + + +#define PT32_LEVEL_BITS 10 + +#define PT32_LEVEL_SHIFT(level) \ + (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) + +#define PT32_LEVEL_MASK(level) \ + (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) +#define PT32_LVL_OFFSET_MASK(level) \ + (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT32_LEVEL_BITS))) - 1)) + +#define PT32_INDEX(address, level)\ + (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) + + +#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#define PT64_DIR_BASE_ADDR_MASK \ + (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) +#define PT64_LVL_ADDR_MASK(level) \ + (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT64_LEVEL_BITS))) - 1)) +#define PT64_LVL_OFFSET_MASK(level) \ + (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT64_LEVEL_BITS))) - 1)) + +#define PT32_BASE_ADDR_MASK PAGE_MASK +#define PT32_DIR_BASE_ADDR_MASK \ + (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) +#define PT32_LVL_ADDR_MASK(level) \ + (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT32_LEVEL_BITS))) - 1)) + +#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ + | PT64_NX_MASK) + +#define RMAP_EXT 4 + +#define ACC_EXEC_MASK 1 +#define ACC_WRITE_MASK PT_WRITABLE_MASK +#define ACC_USER_MASK PT_USER_MASK +#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) + +#include <trace/events/kvm.h> + +#define CREATE_TRACE_POINTS +#include "mmutrace.h" + +#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) + +#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) + +struct kvm_rmap_desc { + u64 *sptes[RMAP_EXT]; + struct kvm_rmap_desc *more; +}; + +struct kvm_shadow_walk_iterator { + u64 addr; + hpa_t shadow_addr; + int level; + u64 *sptep; + unsigned index; +}; + +#define for_each_shadow_entry(_vcpu, _addr, _walker) \ + for (shadow_walk_init(&(_walker), _vcpu, _addr); \ + shadow_walk_okay(&(_walker)); \ + shadow_walk_next(&(_walker))) + +typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); + +static struct kmem_cache *pte_chain_cache; +static struct kmem_cache *rmap_desc_cache; +static struct kmem_cache *mmu_page_header_cache; +static struct percpu_counter kvm_total_used_mmu_pages; + +static u64 __read_mostly shadow_trap_nonpresent_pte; +static u64 __read_mostly shadow_notrap_nonpresent_pte; +static u64 __read_mostly shadow_nx_mask; +static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ +static u64 __read_mostly shadow_user_mask; +static u64 __read_mostly shadow_accessed_mask; +static u64 __read_mostly shadow_dirty_mask; + +static inline u64 rsvd_bits(int s, int e) +{ + return ((1ULL << (e - s + 1)) - 1) << s; +} + +void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) +{ + shadow_trap_nonpresent_pte = trap_pte; + shadow_notrap_nonpresent_pte = notrap_pte; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); + +void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, + u64 dirty_mask, u64 nx_mask, u64 x_mask) +{ + shadow_user_mask = user_mask; + shadow_accessed_mask = accessed_mask; + shadow_dirty_mask = dirty_mask; + shadow_nx_mask = nx_mask; + shadow_x_mask = x_mask; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); + +static bool is_write_protection(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, X86_CR0_WP); +} + +static int is_cpuid_PSE36(void) +{ + return 1; +} + +static int is_nx(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.efer & EFER_NX; +} + +static int is_shadow_present_pte(u64 pte) +{ + return pte != shadow_trap_nonpresent_pte + && pte != shadow_notrap_nonpresent_pte; +} + +static int is_large_pte(u64 pte) +{ + return pte & PT_PAGE_SIZE_MASK; +} + +static int is_writable_pte(unsigned long pte) +{ + return pte & PT_WRITABLE_MASK; +} + +static int is_dirty_gpte(unsigned long pte) +{ + return pte & PT_DIRTY_MASK; +} + +static int is_rmap_spte(u64 pte) +{ + return is_shadow_present_pte(pte); +} + +static int is_last_spte(u64 pte, int level) +{ + if (level == PT_PAGE_TABLE_LEVEL) + return 1; + if (is_large_pte(pte)) + return 1; + return 0; +} + +static pfn_t spte_to_pfn(u64 pte) +{ + return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; +} + +static gfn_t pse36_gfn_delta(u32 gpte) +{ + int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; + + return (gpte & PT32_DIR_PSE36_MASK) << shift; +} + +static void __set_spte(u64 *sptep, u64 spte) +{ + kvm_set_64bit(sptep, spte); +} + +static u64 __xchg_spte(u64 *sptep, u64 new_spte) +{ +#ifdef CONFIG_X86_64 + return xchg(sptep, new_spte); +#else + u64 old_spte; + + do { + old_spte = *sptep; + } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte); + + return old_spte; +#endif +} + +static bool spte_has_volatile_bits(u64 spte) +{ + if (!shadow_accessed_mask) + return false; + + if (!is_shadow_present_pte(spte)) + return false; + + if ((spte & shadow_accessed_mask) && + (!is_writable_pte(spte) || (spte & shadow_dirty_mask))) + return false; + + return true; +} + +static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) +{ + return (old_spte & bit_mask) && !(new_spte & bit_mask); +} + +static void update_spte(u64 *sptep, u64 new_spte) +{ + u64 mask, old_spte = *sptep; + + WARN_ON(!is_rmap_spte(new_spte)); + + new_spte |= old_spte & shadow_dirty_mask; + + mask = shadow_accessed_mask; + if (is_writable_pte(old_spte)) + mask |= shadow_dirty_mask; + + if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) + __set_spte(sptep, new_spte); + else + old_spte = __xchg_spte(sptep, new_spte); + + if (!shadow_accessed_mask) + return; + + if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) + kvm_set_pfn_accessed(spte_to_pfn(old_spte)); + if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) + kvm_set_pfn_dirty(spte_to_pfn(old_spte)); +} + +static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, + struct kmem_cache *base_cache, int min) +{ + void *obj; + + if (cache->nobjs >= min) + return 0; + while (cache->nobjs < ARRAY_SIZE(cache->objects)) { + obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); + if (!obj) + return -ENOMEM; + cache->objects[cache->nobjs++] = obj; + } + return 0; +} + +static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, + struct kmem_cache *cache) +{ + while (mc->nobjs) + kmem_cache_free(cache, mc->objects[--mc->nobjs]); +} + +static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, + int min) +{ + struct page *page; + + if (cache->nobjs >= min) + return 0; + while (cache->nobjs < ARRAY_SIZE(cache->objects)) { + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + cache->objects[cache->nobjs++] = page_address(page); + } + return 0; +} + +static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) +{ + while (mc->nobjs) + free_page((unsigned long)mc->objects[--mc->nobjs]); +} + +static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) +{ + int r; + + r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache, + pte_chain_cache, 4); + if (r) + goto out; + r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, + rmap_desc_cache, 4 + PTE_PREFETCH_NUM); + if (r) + goto out; + r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); + if (r) + goto out; + r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, + mmu_page_header_cache, 4); +out: + return r; +} + +static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) +{ + mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache); + mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, + mmu_page_header_cache); +} + +static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, + size_t size) +{ + void *p; + + BUG_ON(!mc->nobjs); + p = mc->objects[--mc->nobjs]; + return p; +} + +static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) +{ + return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache, + sizeof(struct kvm_pte_chain)); +} + +static void mmu_free_pte_chain(struct kvm_pte_chain *pc) +{ + kmem_cache_free(pte_chain_cache, pc); +} + +static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) +{ + return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache, + sizeof(struct kvm_rmap_desc)); +} + +static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) +{ + kmem_cache_free(rmap_desc_cache, rd); +} + +static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) +{ + if (!sp->role.direct) + return sp->gfns[index]; + + return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS)); +} + +static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) +{ + if (sp->role.direct) + BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index)); + else + sp->gfns[index] = gfn; +} + +/* + * Return the pointer to the large page information for a given gfn, + * handling slots that are not large page aligned. + */ +static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, + struct kvm_memory_slot *slot, + int level) +{ + unsigned long idx; + + idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - + (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); + return &slot->lpage_info[level - 2][idx]; +} + +static void account_shadowed(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + struct kvm_lpage_info *linfo; + int i; + + slot = gfn_to_memslot(kvm, gfn); + for (i = PT_DIRECTORY_LEVEL; + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + linfo = lpage_info_slot(gfn, slot, i); + linfo->write_count += 1; + } +} + +static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + struct kvm_lpage_info *linfo; + int i; + + slot = gfn_to_memslot(kvm, gfn); + for (i = PT_DIRECTORY_LEVEL; + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + linfo = lpage_info_slot(gfn, slot, i); + linfo->write_count -= 1; + WARN_ON(linfo->write_count < 0); + } +} + +static int has_wrprotected_page(struct kvm *kvm, + gfn_t gfn, + int level) +{ + struct kvm_memory_slot *slot; + struct kvm_lpage_info *linfo; + + slot = gfn_to_memslot(kvm, gfn); + if (slot) { + linfo = lpage_info_slot(gfn, slot, level); + return linfo->write_count; + } + + return 1; +} + +static int host_mapping_level(struct kvm *kvm, gfn_t gfn) +{ + unsigned long page_size; + int i, ret = 0; + + page_size = kvm_host_page_size(kvm, gfn); + + for (i = PT_PAGE_TABLE_LEVEL; + i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { + if (page_size >= KVM_HPAGE_SIZE(i)) + ret = i; + else + break; + } + + return ret; +} + +static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) +{ + struct kvm_memory_slot *slot; + slot = gfn_to_memslot(vcpu->kvm, large_gfn); + if (slot && slot->dirty_bitmap) + return true; + return false; +} + +static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) +{ + int host_level, level, max_level; + + host_level = host_mapping_level(vcpu->kvm, large_gfn); + + if (host_level == PT_PAGE_TABLE_LEVEL) + return host_level; + + max_level = kvm_x86_ops->get_lpage_level() < host_level ? + kvm_x86_ops->get_lpage_level() : host_level; + + for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) + if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) + break; + + return level - 1; +} + +/* + * Take gfn and return the reverse mapping to it. + */ + +static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) +{ + struct kvm_memory_slot *slot; + struct kvm_lpage_info *linfo; + + slot = gfn_to_memslot(kvm, gfn); + if (likely(level == PT_PAGE_TABLE_LEVEL)) + return &slot->rmap[gfn - slot->base_gfn]; + + linfo = lpage_info_slot(gfn, slot, level); + + return &linfo->rmap_pde; +} + +/* + * Reverse mapping data structures: + * + * If rmapp bit zero is zero, then rmapp point to the shadw page table entry + * that points to page_address(page). + * + * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc + * containing more mappings. + * + * Returns the number of rmap entries before the spte was added or zero if + * the spte was not added. + * + */ +static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) +{ + struct kvm_mmu_page *sp; + struct kvm_rmap_desc *desc; + unsigned long *rmapp; + int i, count = 0; + + if (!is_rmap_spte(*spte)) + return count; + sp = page_header(__pa(spte)); + kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); + if (!*rmapp) { + rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); + *rmapp = (unsigned long)spte; + } else if (!(*rmapp & 1)) { + rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); + desc = mmu_alloc_rmap_desc(vcpu); + desc->sptes[0] = (u64 *)*rmapp; + desc->sptes[1] = spte; + *rmapp = (unsigned long)desc | 1; + ++count; + } else { + rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); + desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); + while (desc->sptes[RMAP_EXT-1] && desc->more) { + desc = desc->more; + count += RMAP_EXT; + } + if (desc->sptes[RMAP_EXT-1]) { + desc->more = mmu_alloc_rmap_desc(vcpu); + desc = desc->more; + } + for (i = 0; desc->sptes[i]; ++i) + ++count; + desc->sptes[i] = spte; + } + return count; +} + +static void rmap_desc_remove_entry(unsigned long *rmapp, + struct kvm_rmap_desc *desc, + int i, + struct kvm_rmap_desc *prev_desc) +{ + int j; + + for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j) + ; + desc->sptes[i] = desc->sptes[j]; + desc->sptes[j] = NULL; + if (j != 0) + return; + if (!prev_desc && !desc->more) + *rmapp = (unsigned long)desc->sptes[0]; + else + if (prev_desc) + prev_desc->more = desc->more; + else + *rmapp = (unsigned long)desc->more | 1; + mmu_free_rmap_desc(desc); +} + +static void rmap_remove(struct kvm *kvm, u64 *spte) +{ + struct kvm_rmap_desc *desc; + struct kvm_rmap_desc *prev_desc; + struct kvm_mmu_page *sp; + gfn_t gfn; + unsigned long *rmapp; + int i; + + sp = page_header(__pa(spte)); + gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); + rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); + if (!*rmapp) { + printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte); + BUG(); + } else if (!(*rmapp & 1)) { + rmap_printk("rmap_remove: %p 1->0\n", spte); + if ((u64 *)*rmapp != spte) { + printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte); + BUG(); + } + *rmapp = 0; + } else { + rmap_printk("rmap_remove: %p many->many\n", spte); + desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); + prev_desc = NULL; + while (desc) { + for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) + if (desc->sptes[i] == spte) { + rmap_desc_remove_entry(rmapp, + desc, i, + prev_desc); + return; + } + prev_desc = desc; + desc = desc->more; + } + pr_err("rmap_remove: %p many->many\n", spte); + BUG(); + } +} + +static int set_spte_track_bits(u64 *sptep, u64 new_spte) +{ + pfn_t pfn; + u64 old_spte = *sptep; + + if (!spte_has_volatile_bits(old_spte)) + __set_spte(sptep, new_spte); + else + old_spte = __xchg_spte(sptep, new_spte); + + if (!is_rmap_spte(old_spte)) + return 0; + + pfn = spte_to_pfn(old_spte); + if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) + kvm_set_pfn_accessed(pfn); + if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) + kvm_set_pfn_dirty(pfn); + return 1; +} + +static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) +{ + if (set_spte_track_bits(sptep, new_spte)) + rmap_remove(kvm, sptep); +} + +static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) +{ + struct kvm_rmap_desc *desc; + u64 *prev_spte; + int i; + + if (!*rmapp) + return NULL; + else if (!(*rmapp & 1)) { + if (!spte) + return (u64 *)*rmapp; + return NULL; + } + desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); + prev_spte = NULL; + while (desc) { + for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { + if (prev_spte == spte) + return desc->sptes[i]; + prev_spte = desc->sptes[i]; + } + desc = desc->more; + } + return NULL; +} + +static int rmap_write_protect(struct kvm *kvm, u64 gfn) +{ + unsigned long *rmapp; + u64 *spte; + int i, write_protected = 0; + + rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + BUG_ON(!spte); + BUG_ON(!(*spte & PT_PRESENT_MASK)); + rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); + if (is_writable_pte(*spte)) { + update_spte(spte, *spte & ~PT_WRITABLE_MASK); + write_protected = 1; + } + spte = rmap_next(kvm, rmapp, spte); + } + + /* check for huge page mappings */ + for (i = PT_DIRECTORY_LEVEL; + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + rmapp = gfn_to_rmap(kvm, gfn, i); + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + BUG_ON(!spte); + BUG_ON(!(*spte & PT_PRESENT_MASK)); + BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); + pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); + if (is_writable_pte(*spte)) { + drop_spte(kvm, spte, + shadow_trap_nonpresent_pte); + --kvm->stat.lpages; + spte = NULL; + write_protected = 1; + } + spte = rmap_next(kvm, rmapp, spte); + } + } + + return write_protected; +} + +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long data) +{ + u64 *spte; + int need_tlb_flush = 0; + + while ((spte = rmap_next(kvm, rmapp, NULL))) { + BUG_ON(!(*spte & PT_PRESENT_MASK)); + rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); + drop_spte(kvm, spte, shadow_trap_nonpresent_pte); + need_tlb_flush = 1; + } + return need_tlb_flush; +} + +static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long data) +{ + int need_flush = 0; + u64 *spte, new_spte; + pte_t *ptep = (pte_t *)data; + pfn_t new_pfn; + + WARN_ON(pte_huge(*ptep)); + new_pfn = pte_pfn(*ptep); + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + BUG_ON(!is_shadow_present_pte(*spte)); + rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); + need_flush = 1; + if (pte_write(*ptep)) { + drop_spte(kvm, spte, shadow_trap_nonpresent_pte); + spte = rmap_next(kvm, rmapp, NULL); + } else { + new_spte = *spte &~ (PT64_BASE_ADDR_MASK); + new_spte |= (u64)new_pfn << PAGE_SHIFT; + + new_spte &= ~PT_WRITABLE_MASK; + new_spte &= ~SPTE_HOST_WRITEABLE; + new_spte &= ~shadow_accessed_mask; + set_spte_track_bits(spte, new_spte); + spte = rmap_next(kvm, rmapp, spte); + } + } + if (need_flush) + kvm_flush_remote_tlbs(kvm); + + return 0; +} + +static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, + unsigned long data, + int (*handler)(struct kvm *kvm, unsigned long *rmapp, + unsigned long data)) +{ + int i, j; + int ret; + int retval = 0; + struct kvm_memslots *slots; + + slots = kvm_memslots(kvm); + + for (i = 0; i < slots->nmemslots; i++) { + struct kvm_memory_slot *memslot = &slots->memslots[i]; + unsigned long start = memslot->userspace_addr; + unsigned long end; + + end = start + (memslot->npages << PAGE_SHIFT); + if (hva >= start && hva < end) { + gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; + gfn_t gfn = memslot->base_gfn + gfn_offset; + + ret = handler(kvm, &memslot->rmap[gfn_offset], data); + + for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { + struct kvm_lpage_info *linfo; + + linfo = lpage_info_slot(gfn, memslot, + PT_DIRECTORY_LEVEL + j); + ret |= handler(kvm, &linfo->rmap_pde, data); + } + trace_kvm_age_page(hva, memslot, ret); + retval |= ret; + } + } + + return retval; +} + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); +} + +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); +} + +static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long data) +{ + u64 *spte; + int young = 0; + + /* + * Emulate the accessed bit for EPT, by checking if this page has + * an EPT mapping, and clearing it if it does. On the next access, + * a new EPT mapping will be established. + * This has some overhead, but not as much as the cost of swapping + * out actively used pages or breaking up actively used hugepages. + */ + if (!shadow_accessed_mask) + return kvm_unmap_rmapp(kvm, rmapp, data); + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + int _young; + u64 _spte = *spte; + BUG_ON(!(_spte & PT_PRESENT_MASK)); + _young = _spte & PT_ACCESSED_MASK; + if (_young) { + young = 1; + clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); + } + spte = rmap_next(kvm, rmapp, spte); + } + return young; +} + +static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long data) +{ + u64 *spte; + int young = 0; + + /* + * If there's no access bit in the secondary pte set by the + * hardware it's up to gup-fast/gup to set the access bit in + * the primary pte or in the page structure. + */ + if (!shadow_accessed_mask) + goto out; + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + u64 _spte = *spte; + BUG_ON(!(_spte & PT_PRESENT_MASK)); + young = _spte & PT_ACCESSED_MASK; + if (young) { + young = 1; + break; + } + spte = rmap_next(kvm, rmapp, spte); + } +out: + return young; +} + +#define RMAP_RECYCLE_THRESHOLD 1000 + +static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) +{ + unsigned long *rmapp; + struct kvm_mmu_page *sp; + + sp = page_header(__pa(spte)); + + rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); + + kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); + kvm_flush_remote_tlbs(vcpu->kvm); +} + +int kvm_age_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); +} + +#ifdef MMU_DEBUG +static int is_empty_shadow_page(u64 *spt) +{ + u64 *pos; + u64 *end; + + for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) + if (is_shadow_present_pte(*pos)) { + printk(KERN_ERR "%s: %p %llx\n", __func__, + pos, *pos); + return 0; + } + return 1; +} +#endif + +/* + * This value is the sum of all of the kvm instances's + * kvm->arch.n_used_mmu_pages values. We need a global, + * aggregate version in order to make the slab shrinker + * faster + */ +static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) +{ + kvm->arch.n_used_mmu_pages += nr; + percpu_counter_add(&kvm_total_used_mmu_pages, nr); +} + +static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + ASSERT(is_empty_shadow_page(sp->spt)); + hlist_del(&sp->hash_link); + list_del(&sp->link); + __free_page(virt_to_page(sp->spt)); + if (!sp->role.direct) + __free_page(virt_to_page(sp->gfns)); + kmem_cache_free(mmu_page_header_cache, sp); + kvm_mod_used_mmu_pages(kvm, -1); +} + +static unsigned kvm_page_table_hashfn(gfn_t gfn) +{ + return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); +} + +static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, + u64 *parent_pte, int direct) +{ + struct kvm_mmu_page *sp; + + sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); + sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); + if (!direct) + sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, + PAGE_SIZE); + set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); + bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); + sp->multimapped = 0; + sp->parent_pte = parent_pte; + kvm_mod_used_mmu_pages(vcpu->kvm, +1); + return sp; +} + +static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *parent_pte) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + int i; + + if (!parent_pte) + return; + if (!sp->multimapped) { + u64 *old = sp->parent_pte; + + if (!old) { + sp->parent_pte = parent_pte; + return; + } + sp->multimapped = 1; + pte_chain = mmu_alloc_pte_chain(vcpu); + INIT_HLIST_HEAD(&sp->parent_ptes); + hlist_add_head(&pte_chain->link, &sp->parent_ptes); + pte_chain->parent_ptes[0] = old; + } + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) { + if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) + continue; + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) + if (!pte_chain->parent_ptes[i]) { + pte_chain->parent_ptes[i] = parent_pte; + return; + } + } + pte_chain = mmu_alloc_pte_chain(vcpu); + BUG_ON(!pte_chain); + hlist_add_head(&pte_chain->link, &sp->parent_ptes); + pte_chain->parent_ptes[0] = parent_pte; +} + +static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, + u64 *parent_pte) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + int i; + + if (!sp->multimapped) { + BUG_ON(sp->parent_pte != parent_pte); + sp->parent_pte = NULL; + return; + } + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { + if (!pte_chain->parent_ptes[i]) + break; + if (pte_chain->parent_ptes[i] != parent_pte) + continue; + while (i + 1 < NR_PTE_CHAIN_ENTRIES + && pte_chain->parent_ptes[i + 1]) { + pte_chain->parent_ptes[i] + = pte_chain->parent_ptes[i + 1]; + ++i; + } + pte_chain->parent_ptes[i] = NULL; + if (i == 0) { + hlist_del(&pte_chain->link); + mmu_free_pte_chain(pte_chain); + if (hlist_empty(&sp->parent_ptes)) { + sp->multimapped = 0; + sp->parent_pte = NULL; + } + } + return; + } + BUG(); +} + +static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + struct kvm_mmu_page *parent_sp; + int i; + + if (!sp->multimapped && sp->parent_pte) { + parent_sp = page_header(__pa(sp->parent_pte)); + fn(parent_sp, sp->parent_pte); + return; + } + + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { + u64 *spte = pte_chain->parent_ptes[i]; + + if (!spte) + break; + parent_sp = page_header(__pa(spte)); + fn(parent_sp, spte); + } +} + +static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte); +static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) +{ + mmu_parent_walk(sp, mark_unsync); +} + +static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte) +{ + unsigned int index; + + index = spte - sp->spt; + if (__test_and_set_bit(index, sp->unsync_child_bitmap)) + return; + if (sp->unsync_children++) + return; + kvm_mmu_mark_parents_unsync(sp); +} + +static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + int i; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + sp->spt[i] = shadow_trap_nonpresent_pte; +} + +static int nonpaging_sync_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + return 1; +} + +static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +{ +} + +#define KVM_PAGE_ARRAY_NR 16 + +struct kvm_mmu_pages { + struct mmu_page_and_offset { + struct kvm_mmu_page *sp; + unsigned int idx; + } page[KVM_PAGE_ARRAY_NR]; + unsigned int nr; +}; + +#define for_each_unsync_children(bitmap, idx) \ + for (idx = find_first_bit(bitmap, 512); \ + idx < 512; \ + idx = find_next_bit(bitmap, 512, idx+1)) + +static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, + int idx) +{ + int i; + + if (sp->unsync) + for (i=0; i < pvec->nr; i++) + if (pvec->page[i].sp == sp) + return 0; + + pvec->page[pvec->nr].sp = sp; + pvec->page[pvec->nr].idx = idx; + pvec->nr++; + return (pvec->nr == KVM_PAGE_ARRAY_NR); +} + +static int __mmu_unsync_walk(struct kvm_mmu_page *sp, + struct kvm_mmu_pages *pvec) +{ + int i, ret, nr_unsync_leaf = 0; + + for_each_unsync_children(sp->unsync_child_bitmap, i) { + struct kvm_mmu_page *child; + u64 ent = sp->spt[i]; + + if (!is_shadow_present_pte(ent) || is_large_pte(ent)) + goto clear_child_bitmap; + + child = page_header(ent & PT64_BASE_ADDR_MASK); + + if (child->unsync_children) { + if (mmu_pages_add(pvec, child, i)) + return -ENOSPC; + + ret = __mmu_unsync_walk(child, pvec); + if (!ret) + goto clear_child_bitmap; + else if (ret > 0) + nr_unsync_leaf += ret; + else + return ret; + } else if (child->unsync) { + nr_unsync_leaf++; + if (mmu_pages_add(pvec, child, i)) + return -ENOSPC; + } else + goto clear_child_bitmap; + + continue; + +clear_child_bitmap: + __clear_bit(i, sp->unsync_child_bitmap); + sp->unsync_children--; + WARN_ON((int)sp->unsync_children < 0); + } + + + return nr_unsync_leaf; +} + +static int mmu_unsync_walk(struct kvm_mmu_page *sp, + struct kvm_mmu_pages *pvec) +{ + if (!sp->unsync_children) + return 0; + + mmu_pages_add(pvec, sp, 0); + return __mmu_unsync_walk(sp, pvec); +} + +static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + WARN_ON(!sp->unsync); + trace_kvm_mmu_sync_page(sp); + sp->unsync = 0; + --kvm->stat.mmu_unsync; +} + +static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, + struct list_head *invalid_list); +static void kvm_mmu_commit_zap_page(struct kvm *kvm, + struct list_head *invalid_list); + +#define for_each_gfn_sp(kvm, sp, gfn, pos) \ + hlist_for_each_entry(sp, pos, \ + &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ + if ((sp)->gfn != (gfn)) {} else + +#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \ + hlist_for_each_entry(sp, pos, \ + &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ + if ((sp)->gfn != (gfn) || (sp)->role.direct || \ + (sp)->role.invalid) {} else + +/* @sp->gfn should be write-protected at the call site */ +static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + struct list_head *invalid_list, bool clear_unsync) +{ + if (sp->role.cr4_pae != !!is_pae(vcpu)) { + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); + return 1; + } + + if (clear_unsync) + kvm_unlink_unsync_page(vcpu->kvm, sp); + + if (vcpu->arch.mmu.sync_page(vcpu, sp)) { + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); + return 1; + } + + kvm_mmu_flush_tlb(vcpu); + return 0; +} + +static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + LIST_HEAD(invalid_list); + int ret; + + ret = __kvm_sync_page(vcpu, sp, &invalid_list, false); + if (ret) + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + + return ret; +} + +static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + struct list_head *invalid_list) +{ + return __kvm_sync_page(vcpu, sp, invalid_list, true); +} + +/* @gfn should be write-protected at the call site */ +static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + struct kvm_mmu_page *s; + struct hlist_node *node; + LIST_HEAD(invalid_list); + bool flush = false; + + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { + if (!s->unsync) + continue; + + WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); + kvm_unlink_unsync_page(vcpu->kvm, s); + if ((s->role.cr4_pae != !!is_pae(vcpu)) || + (vcpu->arch.mmu.sync_page(vcpu, s))) { + kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); + continue; + } + flush = true; + } + + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + if (flush) + kvm_mmu_flush_tlb(vcpu); +} + +struct mmu_page_path { + struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; + unsigned int idx[PT64_ROOT_LEVEL-1]; +}; + +#define for_each_sp(pvec, sp, parents, i) \ + for (i = mmu_pages_next(&pvec, &parents, -1), \ + sp = pvec.page[i].sp; \ + i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ + i = mmu_pages_next(&pvec, &parents, i)) + +static int mmu_pages_next(struct kvm_mmu_pages *pvec, + struct mmu_page_path *parents, + int i) +{ + int n; + + for (n = i+1; n < pvec->nr; n++) { + struct kvm_mmu_page *sp = pvec->page[n].sp; + + if (sp->role.level == PT_PAGE_TABLE_LEVEL) { + parents->idx[0] = pvec->page[n].idx; + return n; + } + + parents->parent[sp->role.level-2] = sp; + parents->idx[sp->role.level-1] = pvec->page[n].idx; + } + + return n; +} + +static void mmu_pages_clear_parents(struct mmu_page_path *parents) +{ + struct kvm_mmu_page *sp; + unsigned int level = 0; + + do { + unsigned int idx = parents->idx[level]; + + sp = parents->parent[level]; + if (!sp) + return; + + --sp->unsync_children; + WARN_ON((int)sp->unsync_children < 0); + __clear_bit(idx, sp->unsync_child_bitmap); + level++; + } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children); +} + +static void kvm_mmu_pages_init(struct kvm_mmu_page *parent, + struct mmu_page_path *parents, + struct kvm_mmu_pages *pvec) +{ + parents->parent[parent->role.level-1] = NULL; + pvec->nr = 0; +} + +static void mmu_sync_children(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *parent) +{ + int i; + struct kvm_mmu_page *sp; + struct mmu_page_path parents; + struct kvm_mmu_pages pages; + LIST_HEAD(invalid_list); + + kvm_mmu_pages_init(parent, &parents, &pages); + while (mmu_unsync_walk(parent, &pages)) { + int protected = 0; + + for_each_sp(pages, sp, parents, i) + protected |= rmap_write_protect(vcpu->kvm, sp->gfn); + + if (protected) + kvm_flush_remote_tlbs(vcpu->kvm); + + for_each_sp(pages, sp, parents, i) { + kvm_sync_page(vcpu, sp, &invalid_list); + mmu_pages_clear_parents(&parents); + } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + cond_resched_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_pages_init(parent, &parents, &pages); + } +} + +static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, + gfn_t gfn, + gva_t gaddr, + unsigned level, + int direct, + unsigned access, + u64 *parent_pte) +{ + union kvm_mmu_page_role role; + unsigned quadrant; + struct kvm_mmu_page *sp; + struct hlist_node *node; + bool need_sync = false; + + role = vcpu->arch.mmu.base_role; + role.level = level; + role.direct = direct; + if (role.direct) + role.cr4_pae = 0; + role.access = access; + if (!vcpu->arch.mmu.direct_map + && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { + quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); + quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; + role.quadrant = quadrant; + } + for_each_gfn_sp(vcpu->kvm, sp, gfn, node) { + if (!need_sync && sp->unsync) + need_sync = true; + + if (sp->role.word != role.word) + continue; + + if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) + break; + + mmu_page_add_parent_pte(vcpu, sp, parent_pte); + if (sp->unsync_children) { + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); + kvm_mmu_mark_parents_unsync(sp); + } else if (sp->unsync) + kvm_mmu_mark_parents_unsync(sp); + + trace_kvm_mmu_get_page(sp, false); + return sp; + } + ++vcpu->kvm->stat.mmu_cache_miss; + sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct); + if (!sp) + return sp; + sp->gfn = gfn; + sp->role = role; + hlist_add_head(&sp->hash_link, + &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); + if (!direct) { + if (rmap_write_protect(vcpu->kvm, gfn)) + kvm_flush_remote_tlbs(vcpu->kvm); + if (level > PT_PAGE_TABLE_LEVEL && need_sync) + kvm_sync_pages(vcpu, gfn); + + account_shadowed(vcpu->kvm, gfn); + } + if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) + vcpu->arch.mmu.prefetch_page(vcpu, sp); + else + nonpaging_prefetch_page(vcpu, sp); + trace_kvm_mmu_get_page(sp, true); + return sp; +} + +static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, + struct kvm_vcpu *vcpu, u64 addr) +{ + iterator->addr = addr; + iterator->shadow_addr = vcpu->arch.mmu.root_hpa; + iterator->level = vcpu->arch.mmu.shadow_root_level; + + if (iterator->level == PT64_ROOT_LEVEL && + vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL && + !vcpu->arch.mmu.direct_map) + --iterator->level; + + if (iterator->level == PT32E_ROOT_LEVEL) { + iterator->shadow_addr + = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; + iterator->shadow_addr &= PT64_BASE_ADDR_MASK; + --iterator->level; + if (!iterator->shadow_addr) + iterator->level = 0; + } +} + +static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) +{ + if (iterator->level < PT_PAGE_TABLE_LEVEL) + return false; + + if (iterator->level == PT_PAGE_TABLE_LEVEL) + if (is_large_pte(*iterator->sptep)) + return false; + + iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); + iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; + return true; +} + +static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) +{ + iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK; + --iterator->level; +} + +static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) +{ + u64 spte; + + spte = __pa(sp->spt) + | PT_PRESENT_MASK | PT_ACCESSED_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; + __set_spte(sptep, spte); +} + +static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) +{ + if (is_large_pte(*sptep)) { + drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + } +} + +static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, + unsigned direct_access) +{ + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { + struct kvm_mmu_page *child; + + /* + * For the direct sp, if the guest pte's dirty bit + * changed form clean to dirty, it will corrupt the + * sp's access: allow writable in the read-only sp, + * so we should update the spte at this point to get + * a new sp with the correct access. + */ + child = page_header(*sptep & PT64_BASE_ADDR_MASK); + if (child->role.access == direct_access) + return; + + mmu_page_remove_parent_pte(child, sptep); + __set_spte(sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + } +} + +static void kvm_mmu_page_unlink_children(struct kvm *kvm, + struct kvm_mmu_page *sp) +{ + unsigned i; + u64 *pt; + u64 ent; + + pt = sp->spt; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + ent = pt[i]; + + if (is_shadow_present_pte(ent)) { + if (!is_last_spte(ent, sp->role.level)) { + ent &= PT64_BASE_ADDR_MASK; + mmu_page_remove_parent_pte(page_header(ent), + &pt[i]); + } else { + if (is_large_pte(ent)) + --kvm->stat.lpages; + drop_spte(kvm, &pt[i], + shadow_trap_nonpresent_pte); + } + } + pt[i] = shadow_trap_nonpresent_pte; + } +} + +static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) +{ + mmu_page_remove_parent_pte(sp, parent_pte); +} + +static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) +{ + int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) + vcpu->arch.last_pte_updated = NULL; +} + +static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + u64 *parent_pte; + + while (sp->multimapped || sp->parent_pte) { + if (!sp->multimapped) + parent_pte = sp->parent_pte; + else { + struct kvm_pte_chain *chain; + + chain = container_of(sp->parent_ptes.first, + struct kvm_pte_chain, link); + parent_pte = chain->parent_ptes[0]; + } + BUG_ON(!parent_pte); + kvm_mmu_put_page(sp, parent_pte); + __set_spte(parent_pte, shadow_trap_nonpresent_pte); + } +} + +static int mmu_zap_unsync_children(struct kvm *kvm, + struct kvm_mmu_page *parent, + struct list_head *invalid_list) +{ + int i, zapped = 0; + struct mmu_page_path parents; + struct kvm_mmu_pages pages; + + if (parent->role.level == PT_PAGE_TABLE_LEVEL) + return 0; + + kvm_mmu_pages_init(parent, &parents, &pages); + while (mmu_unsync_walk(parent, &pages)) { + struct kvm_mmu_page *sp; + + for_each_sp(pages, sp, parents, i) { + kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); + mmu_pages_clear_parents(&parents); + zapped++; + } + kvm_mmu_pages_init(parent, &parents, &pages); + } + + return zapped; +} + +static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, + struct list_head *invalid_list) +{ + int ret; + + trace_kvm_mmu_prepare_zap_page(sp); + ++kvm->stat.mmu_shadow_zapped; + ret = mmu_zap_unsync_children(kvm, sp, invalid_list); + kvm_mmu_page_unlink_children(kvm, sp); + kvm_mmu_unlink_parents(kvm, sp); + if (!sp->role.invalid && !sp->role.direct) + unaccount_shadowed(kvm, sp->gfn); + if (sp->unsync) + kvm_unlink_unsync_page(kvm, sp); + if (!sp->root_count) { + /* Count self */ + ret++; + list_move(&sp->link, invalid_list); + } else { + list_move(&sp->link, &kvm->arch.active_mmu_pages); + kvm_reload_remote_mmus(kvm); + } + + sp->role.invalid = 1; + kvm_mmu_reset_last_pte_updated(kvm); + return ret; +} + +static void kvm_mmu_commit_zap_page(struct kvm *kvm, + struct list_head *invalid_list) +{ + struct kvm_mmu_page *sp; + + if (list_empty(invalid_list)) + return; + + kvm_flush_remote_tlbs(kvm); + + do { + sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); + WARN_ON(!sp->role.invalid || sp->root_count); + kvm_mmu_free_page(kvm, sp); + } while (!list_empty(invalid_list)); + +} + +/* + * Changing the number of mmu pages allocated to the vm + * Note: if goal_nr_mmu_pages is too small, you will get dead lock + */ +void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) +{ + LIST_HEAD(invalid_list); + /* + * If we set the number of mmu pages to be smaller be than the + * number of actived pages , we must to free some mmu pages before we + * change the value + */ + + if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { + while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages && + !list_empty(&kvm->arch.active_mmu_pages)) { + struct kvm_mmu_page *page; + + page = container_of(kvm->arch.active_mmu_pages.prev, + struct kvm_mmu_page, link); + kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); + kvm_mmu_commit_zap_page(kvm, &invalid_list); + } + goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; + } + + kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; +} + +static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_mmu_page *sp; + struct hlist_node *node; + LIST_HEAD(invalid_list); + int r; + + pgprintk("%s: looking for gfn %llx\n", __func__, gfn); + r = 0; + + for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { + pgprintk("%s: gfn %llx role %x\n", __func__, gfn, + sp->role.word); + r = 1; + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + } + kvm_mmu_commit_zap_page(kvm, &invalid_list); + return r; +} + +static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_mmu_page *sp; + struct hlist_node *node; + LIST_HEAD(invalid_list); + + for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { + pgprintk("%s: zap %llx %x\n", + __func__, gfn, sp->role.word); + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + } + kvm_mmu_commit_zap_page(kvm, &invalid_list); +} + +static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) +{ + int slot = memslot_id(kvm, gfn); + struct kvm_mmu_page *sp = page_header(__pa(pte)); + + __set_bit(slot, sp->slot_bitmap); +} + +static void mmu_convert_notrap(struct kvm_mmu_page *sp) +{ + int i; + u64 *pt = sp->spt; + + if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte) + return; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (pt[i] == shadow_notrap_nonpresent_pte) + __set_spte(&pt[i], shadow_trap_nonpresent_pte); + } +} + +/* + * The function is based on mtrr_type_lookup() in + * arch/x86/kernel/cpu/mtrr/generic.c + */ +static int get_mtrr_type(struct mtrr_state_type *mtrr_state, + u64 start, u64 end) +{ + int i; + u64 base, mask; + u8 prev_match, curr_match; + int num_var_ranges = KVM_NR_VAR_MTRR; + + if (!mtrr_state->enabled) + return 0xFF; + + /* Make end inclusive end, instead of exclusive */ + end--; + + /* Look in fixed ranges. Just return the type as per start */ + if (mtrr_state->have_fixed && (start < 0x100000)) { + int idx; + + if (start < 0x80000) { + idx = 0; + idx += (start >> 16); + return mtrr_state->fixed_ranges[idx]; + } else if (start < 0xC0000) { + idx = 1 * 8; + idx += ((start - 0x80000) >> 14); + return mtrr_state->fixed_ranges[idx]; + } else if (start < 0x1000000) { + idx = 3 * 8; + idx += ((start - 0xC0000) >> 12); + return mtrr_state->fixed_ranges[idx]; + } + } + + /* + * Look in variable ranges + * Look of multiple ranges matching this address and pick type + * as per MTRR precedence + */ + if (!(mtrr_state->enabled & 2)) + return mtrr_state->def_type; + + prev_match = 0xFF; + for (i = 0; i < num_var_ranges; ++i) { + unsigned short start_state, end_state; + + if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11))) + continue; + + base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) + + (mtrr_state->var_ranges[i].base_lo & PAGE_MASK); + mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) + + (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK); + + start_state = ((start & mask) == (base & mask)); + end_state = ((end & mask) == (base & mask)); + if (start_state != end_state) + return 0xFE; + + if ((start & mask) != (base & mask)) + continue; + + curr_match = mtrr_state->var_ranges[i].base_lo & 0xff; + if (prev_match == 0xFF) { + prev_match = curr_match; + continue; + } + + if (prev_match == MTRR_TYPE_UNCACHABLE || + curr_match == MTRR_TYPE_UNCACHABLE) + return MTRR_TYPE_UNCACHABLE; + + if ((prev_match == MTRR_TYPE_WRBACK && + curr_match == MTRR_TYPE_WRTHROUGH) || + (prev_match == MTRR_TYPE_WRTHROUGH && + curr_match == MTRR_TYPE_WRBACK)) { + prev_match = MTRR_TYPE_WRTHROUGH; + curr_match = MTRR_TYPE_WRTHROUGH; + } + + if (prev_match != curr_match) + return MTRR_TYPE_UNCACHABLE; + } + + if (prev_match != 0xFF) + return prev_match; + + return mtrr_state->def_type; +} + +u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + u8 mtrr; + + mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT, + (gfn << PAGE_SHIFT) + PAGE_SIZE); + if (mtrr == 0xfe || mtrr == 0xff) + mtrr = MTRR_TYPE_WRBACK; + return mtrr; +} +EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); + +static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + trace_kvm_mmu_unsync_page(sp); + ++vcpu->kvm->stat.mmu_unsync; + sp->unsync = 1; + + kvm_mmu_mark_parents_unsync(sp); + mmu_convert_notrap(sp); +} + +static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + struct kvm_mmu_page *s; + struct hlist_node *node; + + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { + if (s->unsync) + continue; + WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); + __kvm_unsync_page(vcpu, s); + } +} + +static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, + bool can_unsync) +{ + struct kvm_mmu_page *s; + struct hlist_node *node; + bool need_unsync = false; + + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { + if (!can_unsync) + return 1; + + if (s->role.level != PT_PAGE_TABLE_LEVEL) + return 1; + + if (!need_unsync && !s->unsync) { + if (!oos_shadow) + return 1; + need_unsync = true; + } + } + if (need_unsync) + kvm_unsync_pages(vcpu, gfn); + return 0; +} + +static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, + unsigned pte_access, int user_fault, + int write_fault, int dirty, int level, + gfn_t gfn, pfn_t pfn, bool speculative, + bool can_unsync, bool host_writable) +{ + u64 spte, entry = *sptep; + int ret = 0; + + /* + * We don't set the accessed bit, since we sometimes want to see + * whether the guest actually used the pte (in order to detect + * demand paging). + */ + spte = PT_PRESENT_MASK; + if (!speculative) + spte |= shadow_accessed_mask; + if (!dirty) + pte_access &= ~ACC_WRITE_MASK; + if (pte_access & ACC_EXEC_MASK) + spte |= shadow_x_mask; + else + spte |= shadow_nx_mask; + if (pte_access & ACC_USER_MASK) + spte |= shadow_user_mask; + if (level > PT_PAGE_TABLE_LEVEL) + spte |= PT_PAGE_SIZE_MASK; + if (tdp_enabled) + spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, + kvm_is_mmio_pfn(pfn)); + + if (host_writable) + spte |= SPTE_HOST_WRITEABLE; + else + pte_access &= ~ACC_WRITE_MASK; + + spte |= (u64)pfn << PAGE_SHIFT; + + if ((pte_access & ACC_WRITE_MASK) + || (!vcpu->arch.mmu.direct_map && write_fault + && !is_write_protection(vcpu) && !user_fault)) { + + if (level > PT_PAGE_TABLE_LEVEL && + has_wrprotected_page(vcpu->kvm, gfn, level)) { + ret = 1; + drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); + goto done; + } + + spte |= PT_WRITABLE_MASK; + + if (!vcpu->arch.mmu.direct_map + && !(pte_access & ACC_WRITE_MASK)) + spte &= ~PT_USER_MASK; + + /* + * Optimization: for pte sync, if spte was writable the hash + * lookup is unnecessary (and expensive). Write protection + * is responsibility of mmu_get_page / kvm_sync_page. + * Same reasoning can be applied to dirty page accounting. + */ + if (!can_unsync && is_writable_pte(*sptep)) + goto set_pte; + + if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { + pgprintk("%s: found shadow page for %llx, marking ro\n", + __func__, gfn); + ret = 1; + pte_access &= ~ACC_WRITE_MASK; + if (is_writable_pte(spte)) + spte &= ~PT_WRITABLE_MASK; + } + } + + if (pte_access & ACC_WRITE_MASK) + mark_page_dirty(vcpu->kvm, gfn); + +set_pte: + update_spte(sptep, spte); + /* + * If we overwrite a writable spte with a read-only one we + * should flush remote TLBs. Otherwise rmap_write_protect + * will find a read-only spte, even though the writable spte + * might be cached on a CPU's TLB. + */ + if (is_writable_pte(entry) && !is_writable_pte(*sptep)) + kvm_flush_remote_tlbs(vcpu->kvm); +done: + return ret; +} + +static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, + unsigned pt_access, unsigned pte_access, + int user_fault, int write_fault, int dirty, + int *ptwrite, int level, gfn_t gfn, + pfn_t pfn, bool speculative, + bool host_writable) +{ + int was_rmapped = 0; + int rmap_count; + + pgprintk("%s: spte %llx access %x write_fault %d" + " user_fault %d gfn %llx\n", + __func__, *sptep, pt_access, + write_fault, user_fault, gfn); + + if (is_rmap_spte(*sptep)) { + /* + * If we overwrite a PTE page pointer with a 2MB PMD, unlink + * the parent of the now unreachable PTE. + */ + if (level > PT_PAGE_TABLE_LEVEL && + !is_large_pte(*sptep)) { + struct kvm_mmu_page *child; + u64 pte = *sptep; + + child = page_header(pte & PT64_BASE_ADDR_MASK); + mmu_page_remove_parent_pte(child, sptep); + __set_spte(sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + } else if (pfn != spte_to_pfn(*sptep)) { + pgprintk("hfn old %llx new %llx\n", + spte_to_pfn(*sptep), pfn); + drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + } else + was_rmapped = 1; + } + + if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, + dirty, level, gfn, pfn, speculative, true, + host_writable)) { + if (write_fault) + *ptwrite = 1; + kvm_mmu_flush_tlb(vcpu); + } + + pgprintk("%s: setting spte %llx\n", __func__, *sptep); + pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", + is_large_pte(*sptep)? "2MB" : "4kB", + *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, + *sptep, sptep); + if (!was_rmapped && is_large_pte(*sptep)) + ++vcpu->kvm->stat.lpages; + + page_header_update_slot(vcpu->kvm, sptep, gfn); + if (!was_rmapped) { + rmap_count = rmap_add(vcpu, sptep, gfn); + if (rmap_count > RMAP_RECYCLE_THRESHOLD) + rmap_recycle(vcpu, sptep, gfn); + } + kvm_release_pfn_clean(pfn); + if (speculative) { + vcpu->arch.last_pte_updated = sptep; + vcpu->arch.last_pte_gfn = gfn; + } +} + +static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) +{ +} + +static struct kvm_memory_slot * +pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) +{ + struct kvm_memory_slot *slot; + + slot = gfn_to_memslot(vcpu->kvm, gfn); + if (!slot || slot->flags & KVM_MEMSLOT_INVALID || + (no_dirty_log && slot->dirty_bitmap)) + slot = NULL; + + return slot; +} + +static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, + bool no_dirty_log) +{ + struct kvm_memory_slot *slot; + unsigned long hva; + + slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log); + if (!slot) { + get_page(bad_page); + return page_to_pfn(bad_page); + } + + hva = gfn_to_hva_memslot(slot, gfn); + + return hva_to_pfn_atomic(vcpu->kvm, hva); +} + +static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, + u64 *start, u64 *end) +{ + struct page *pages[PTE_PREFETCH_NUM]; + unsigned access = sp->role.access; + int i, ret; + gfn_t gfn; + + gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); + if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK)) + return -1; + + ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); + if (ret <= 0) + return -1; + + for (i = 0; i < ret; i++, gfn++, start++) + mmu_set_spte(vcpu, start, ACC_ALL, + access, 0, 0, 1, NULL, + sp->role.level, gfn, + page_to_pfn(pages[i]), true, true); + + return 0; +} + +static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *sptep) +{ + u64 *spte, *start = NULL; + int i; + + WARN_ON(!sp->role.direct); + + i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); + spte = sp->spt + i; + + for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { + if (*spte != shadow_trap_nonpresent_pte || spte == sptep) { + if (!start) + continue; + if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) + break; + start = NULL; + } else if (!start) + start = spte; + } +} + +static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) +{ + struct kvm_mmu_page *sp; + + /* + * Since it's no accessed bit on EPT, it's no way to + * distinguish between actually accessed translations + * and prefetched, so disable pte prefetch if EPT is + * enabled. + */ + if (!shadow_accessed_mask) + return; + + sp = page_header(__pa(sptep)); + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + return; + + __direct_pte_prefetch(vcpu, sp, sptep); +} + +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, + int map_writable, int level, gfn_t gfn, pfn_t pfn, + bool prefault) +{ + struct kvm_shadow_walk_iterator iterator; + struct kvm_mmu_page *sp; + int pt_write = 0; + gfn_t pseudo_gfn; + + for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { + if (iterator.level == level) { + unsigned pte_access = ACC_ALL; + + mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, + 0, write, 1, &pt_write, + level, gfn, pfn, prefault, map_writable); + direct_pte_prefetch(vcpu, iterator.sptep); + ++vcpu->stat.pf_fixed; + break; + } + + if (*iterator.sptep == shadow_trap_nonpresent_pte) { + u64 base_addr = iterator.addr; + + base_addr &= PT64_LVL_ADDR_MASK(iterator.level); + pseudo_gfn = base_addr >> PAGE_SHIFT; + sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, + iterator.level - 1, + 1, ACC_ALL, iterator.sptep); + if (!sp) { + pgprintk("nonpaging_map: ENOMEM\n"); + kvm_release_pfn_clean(pfn); + return -ENOMEM; + } + + __set_spte(iterator.sptep, + __pa(sp->spt) + | PT_PRESENT_MASK | PT_WRITABLE_MASK + | shadow_user_mask | shadow_x_mask + | shadow_accessed_mask); + } + } + return pt_write; +} + +static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) +{ + kvm_siginfo_t info; + + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_MCEERR_AR; + info.si_addr = (void *)address; + info.si_addr_lsb = PAGE_SHIFT; + + send_sig_info(SIGBUS, (siginfo_t *)&info, tsk); +} + +static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) +{ + kvm_release_pfn_clean(pfn); + if (is_hwpoison_pfn(pfn)) { + kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current); + return 0; + } else if (is_fault_pfn(pfn)) + return -EFAULT; + + return 1; +} + +static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, + gfn_t *gfnp, pfn_t *pfnp, int *levelp) +{ + pfn_t pfn = *pfnp; + gfn_t gfn = *gfnp; + int level = *levelp; + + /* + * Check if it's a transparent hugepage. If this would be an + * hugetlbfs page, level wouldn't be set to + * PT_PAGE_TABLE_LEVEL and there would be no adjustment done + * here. + */ + if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && + level == PT_PAGE_TABLE_LEVEL && + PageTransCompound(pfn_to_page(pfn)) && + !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { + unsigned long mask; + /* + * mmu_notifier_retry was successful and we hold the + * mmu_lock here, so the pmd can't become splitting + * from under us, and in turn + * __split_huge_page_refcount() can't run from under + * us and we can safely transfer the refcount from + * PG_tail to PG_head as we switch the pfn to tail to + * head. + */ + *levelp = level = PT_DIRECTORY_LEVEL; + mask = KVM_PAGES_PER_HPAGE(level) - 1; + VM_BUG_ON((gfn & mask) != (pfn & mask)); + if (pfn & mask) { + gfn &= ~mask; + *gfnp = gfn; + kvm_release_pfn_clean(pfn); + pfn &= ~mask; + if (!get_page_unless_zero(pfn_to_page(pfn))) + BUG(); + *pfnp = pfn; + } + } +} + +static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, + gva_t gva, pfn_t *pfn, bool write, bool *writable); + +static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, + bool prefault) +{ + int r; + int level; + int force_pt_level; + pfn_t pfn; + unsigned long mmu_seq; + bool map_writable; + + force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + if (likely(!force_pt_level)) { + level = mapping_level(vcpu, gfn); + /* + * This path builds a PAE pagetable - so we can map + * 2mb pages at maximum. Therefore check if the level + * is larger than that. + */ + if (level > PT_DIRECTORY_LEVEL) + level = PT_DIRECTORY_LEVEL; + + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + } else + level = PT_PAGE_TABLE_LEVEL; + + mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); + + if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) + return 0; + + /* mmio */ + if (is_error_pfn(pfn)) + return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); + + spin_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu, mmu_seq)) + goto out_unlock; + kvm_mmu_free_some_pages(vcpu); + if (likely(!force_pt_level)) + transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); + r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, + prefault); + spin_unlock(&vcpu->kvm->mmu_lock); + + + return r; + +out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + return 0; +} + + +static void mmu_free_roots(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_mmu_page *sp; + LIST_HEAD(invalid_list); + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + spin_lock(&vcpu->kvm->mmu_lock); + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && + (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || + vcpu->arch.mmu.direct_map)) { + hpa_t root = vcpu->arch.mmu.root_hpa; + + sp = page_header(root); + --sp->root_count; + if (!sp->root_count && sp->role.invalid) { + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + } + vcpu->arch.mmu.root_hpa = INVALID_PAGE; + spin_unlock(&vcpu->kvm->mmu_lock); + return; + } + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + --sp->root_count; + if (!sp->root_count && sp->role.invalid) + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + &invalid_list); + } + vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; + } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + spin_unlock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.root_hpa = INVALID_PAGE; +} + +static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) +{ + int ret = 0; + + if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + ret = 1; + } + + return ret; +} + +static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *sp; + unsigned i; + + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, + 1, ACC_ALL, NULL); + ++sp->root_count; + spin_unlock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.root_hpa = __pa(sp->spt); + } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) { + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + ASSERT(!VALID_PAGE(root)); + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), + i << 30, + PT32_ROOT_LEVEL, 1, ACC_ALL, + NULL); + root = __pa(sp->spt); + ++sp->root_count; + spin_unlock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; + } + vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); + } else + BUG(); + + return 0; +} + +static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *sp; + u64 pdptr, pm_mask; + gfn_t root_gfn; + int i; + + root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT; + + if (mmu_check_root(vcpu, root_gfn)) + return 1; + + /* + * Do we shadow a long mode page table? If so we need to + * write-protect the guests page table root. + */ + if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + + ASSERT(!VALID_PAGE(root)); + + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, + 0, ACC_ALL, NULL); + root = __pa(sp->spt); + ++sp->root_count; + spin_unlock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.root_hpa = root; + return 0; + } + + /* + * We shadow a 32 bit page table. This may be a legacy 2-level + * or a PAE 3-level page table. In either case we need to be aware that + * the shadow page table may be a PAE or a long mode page table. + */ + pm_mask = PT_PRESENT_MASK; + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) + pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; + + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + ASSERT(!VALID_PAGE(root)); + if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { + pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i); + if (!is_present_gpte(pdptr)) { + vcpu->arch.mmu.pae_root[i] = 0; + continue; + } + root_gfn = pdptr >> PAGE_SHIFT; + if (mmu_check_root(vcpu, root_gfn)) + return 1; + } + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, + PT32_ROOT_LEVEL, 0, + ACC_ALL, NULL); + root = __pa(sp->spt); + ++sp->root_count; + spin_unlock(&vcpu->kvm->mmu_lock); + + vcpu->arch.mmu.pae_root[i] = root | pm_mask; + } + vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); + + /* + * If we shadow a 32 bit page table with a long mode page + * table we enter this path. + */ + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + if (vcpu->arch.mmu.lm_root == NULL) { + /* + * The additional page necessary for this is only + * allocated on demand. + */ + + u64 *lm_root; + + lm_root = (void*)get_zeroed_page(GFP_KERNEL); + if (lm_root == NULL) + return 1; + + lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask; + + vcpu->arch.mmu.lm_root = lm_root; + } + + vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root); + } + + return 0; +} + +static int mmu_alloc_roots(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.mmu.direct_map) + return mmu_alloc_direct_roots(vcpu); + else + return mmu_alloc_shadow_roots(vcpu); +} + +static void mmu_sync_roots(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_mmu_page *sp; + + if (vcpu->arch.mmu.direct_map) + return; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + + trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); + if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + sp = page_header(root); + mmu_sync_children(vcpu, sp); + trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); + return; + } + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root && VALID_PAGE(root)) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + mmu_sync_children(vcpu, sp); + } + } + trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); +} + +void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) +{ + spin_lock(&vcpu->kvm->mmu_lock); + mmu_sync_roots(vcpu); + spin_unlock(&vcpu->kvm->mmu_lock); +} + +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, + u32 access, struct x86_exception *exception) +{ + if (exception) + exception->error_code = 0; + return vaddr; +} + +static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, + u32 access, + struct x86_exception *exception) +{ + if (exception) + exception->error_code = 0; + return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); +} + +static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, + u32 error_code, bool prefault) +{ + gfn_t gfn; + int r; + + pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; + + ASSERT(vcpu); + ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + gfn = gva >> PAGE_SHIFT; + + return nonpaging_map(vcpu, gva & PAGE_MASK, + error_code & PFERR_WRITE_MASK, gfn, prefault); +} + +static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) +{ + struct kvm_arch_async_pf arch; + + arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; + arch.gfn = gfn; + arch.direct_map = vcpu->arch.mmu.direct_map; + arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); + + return kvm_setup_async_pf(vcpu, gva, gfn, &arch); +} + +static bool can_do_async_pf(struct kvm_vcpu *vcpu) +{ + if (unlikely(!irqchip_in_kernel(vcpu->kvm) || + kvm_event_needs_reinjection(vcpu))) + return false; + + return kvm_x86_ops->interrupt_allowed(vcpu); +} + +static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, + gva_t gva, pfn_t *pfn, bool write, bool *writable) +{ + bool async; + + *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable); + + if (!async) + return false; /* *pfn has correct page already */ + + put_page(pfn_to_page(*pfn)); + + if (!prefault && can_do_async_pf(vcpu)) { + trace_kvm_try_async_get_page(gva, gfn); + if (kvm_find_async_pf_gfn(vcpu, gfn)) { + trace_kvm_async_pf_doublefault(gva, gfn); + kvm_make_request(KVM_REQ_APF_HALT, vcpu); + return true; + } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn)) + return true; + } + + *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable); + + return false; +} + +static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, + bool prefault) +{ + pfn_t pfn; + int r; + int level; + int force_pt_level; + gfn_t gfn = gpa >> PAGE_SHIFT; + unsigned long mmu_seq; + int write = error_code & PFERR_WRITE_MASK; + bool map_writable; + + ASSERT(vcpu); + ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; + + force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + if (likely(!force_pt_level)) { + level = mapping_level(vcpu, gfn); + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + } else + level = PT_PAGE_TABLE_LEVEL; + + mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); + + if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) + return 0; + + /* mmio */ + if (is_error_pfn(pfn)) + return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); + spin_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu, mmu_seq)) + goto out_unlock; + kvm_mmu_free_some_pages(vcpu); + if (likely(!force_pt_level)) + transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); + r = __direct_map(vcpu, gpa, write, map_writable, + level, gfn, pfn, prefault); + spin_unlock(&vcpu->kvm->mmu_lock); + + return r; + +out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + return 0; +} + +static void nonpaging_free(struct kvm_vcpu *vcpu) +{ + mmu_free_roots(vcpu); +} + +static int nonpaging_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) +{ + context->new_cr3 = nonpaging_new_cr3; + context->page_fault = nonpaging_page_fault; + context->gva_to_gpa = nonpaging_gva_to_gpa; + context->free = nonpaging_free; + context->prefetch_page = nonpaging_prefetch_page; + context->sync_page = nonpaging_sync_page; + context->invlpg = nonpaging_invlpg; + context->root_level = 0; + context->shadow_root_level = PT32E_ROOT_LEVEL; + context->root_hpa = INVALID_PAGE; + context->direct_map = true; + context->nx = false; + return 0; +} + +void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.tlb_flush; + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); +} + +static void paging_new_cr3(struct kvm_vcpu *vcpu) +{ + pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu)); + mmu_free_roots(vcpu); +} + +static unsigned long get_cr3(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr3(vcpu); +} + +static void inject_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault) +{ + vcpu->arch.mmu.inject_page_fault(vcpu, fault); +} + +static void paging_free(struct kvm_vcpu *vcpu) +{ + nonpaging_free(vcpu); +} + +static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) +{ + int bit7; + + bit7 = (gpte >> 7) & 1; + return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; +} + +#define PTTYPE 64 +#include "paging_tmpl.h" +#undef PTTYPE + +#define PTTYPE 32 +#include "paging_tmpl.h" +#undef PTTYPE + +static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, + int level) +{ + int maxphyaddr = cpuid_maxphyaddr(vcpu); + u64 exb_bit_rsvd = 0; + + if (!context->nx) + exb_bit_rsvd = rsvd_bits(63, 63); + switch (level) { + case PT32_ROOT_LEVEL: + /* no rsvd bits for 2 level 4K page table entries */ + context->rsvd_bits_mask[0][1] = 0; + context->rsvd_bits_mask[0][0] = 0; + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + + if (!is_pse(vcpu)) { + context->rsvd_bits_mask[1][1] = 0; + break; + } + + if (is_cpuid_PSE36()) + /* 36bits PSE 4MB page */ + context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); + else + /* 32 bits PSE 4MB page */ + context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); + break; + case PT32E_ROOT_LEVEL: + context->rsvd_bits_mask[0][2] = + rsvd_bits(maxphyaddr, 63) | + rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */ + context->rsvd_bits_mask[0][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 62); /* PDE */ + context->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 62); /* PTE */ + context->rsvd_bits_mask[1][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 62) | + rsvd_bits(13, 20); /* large page */ + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + break; + case PT64_ROOT_LEVEL: + context->rsvd_bits_mask[0][3] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); + context->rsvd_bits_mask[0][2] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); + context->rsvd_bits_mask[0][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51); + context->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51); + context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; + context->rsvd_bits_mask[1][2] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | + rsvd_bits(13, 29); + context->rsvd_bits_mask[1][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | + rsvd_bits(13, 20); /* large page */ + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + break; + } +} + +static int paging64_init_context_common(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, + int level) +{ + context->nx = is_nx(vcpu); + + reset_rsvds_bits_mask(vcpu, context, level); + + ASSERT(is_pae(vcpu)); + context->new_cr3 = paging_new_cr3; + context->page_fault = paging64_page_fault; + context->gva_to_gpa = paging64_gva_to_gpa; + context->prefetch_page = paging64_prefetch_page; + context->sync_page = paging64_sync_page; + context->invlpg = paging64_invlpg; + context->free = paging_free; + context->root_level = level; + context->shadow_root_level = level; + context->root_hpa = INVALID_PAGE; + context->direct_map = false; + return 0; +} + +static int paging64_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) +{ + return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL); +} + +static int paging32_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) +{ + context->nx = false; + + reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); + + context->new_cr3 = paging_new_cr3; + context->page_fault = paging32_page_fault; + context->gva_to_gpa = paging32_gva_to_gpa; + context->free = paging_free; + context->prefetch_page = paging32_prefetch_page; + context->sync_page = paging32_sync_page; + context->invlpg = paging32_invlpg; + context->root_level = PT32_ROOT_LEVEL; + context->shadow_root_level = PT32E_ROOT_LEVEL; + context->root_hpa = INVALID_PAGE; + context->direct_map = false; + return 0; +} + +static int paging32E_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) +{ + return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); +} + +static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = vcpu->arch.walk_mmu; + + context->base_role.word = 0; + context->new_cr3 = nonpaging_new_cr3; + context->page_fault = tdp_page_fault; + context->free = nonpaging_free; + context->prefetch_page = nonpaging_prefetch_page; + context->sync_page = nonpaging_sync_page; + context->invlpg = nonpaging_invlpg; + context->shadow_root_level = kvm_x86_ops->get_tdp_level(); + context->root_hpa = INVALID_PAGE; + context->direct_map = true; + context->set_cr3 = kvm_x86_ops->set_tdp_cr3; + context->get_cr3 = get_cr3; + context->inject_page_fault = kvm_inject_page_fault; + context->nx = is_nx(vcpu); + + if (!is_paging(vcpu)) { + context->nx = false; + context->gva_to_gpa = nonpaging_gva_to_gpa; + context->root_level = 0; + } else if (is_long_mode(vcpu)) { + context->nx = is_nx(vcpu); + reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL); + context->gva_to_gpa = paging64_gva_to_gpa; + context->root_level = PT64_ROOT_LEVEL; + } else if (is_pae(vcpu)) { + context->nx = is_nx(vcpu); + reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL); + context->gva_to_gpa = paging64_gva_to_gpa; + context->root_level = PT32E_ROOT_LEVEL; + } else { + context->nx = false; + reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); + context->gva_to_gpa = paging32_gva_to_gpa; + context->root_level = PT32_ROOT_LEVEL; + } + + return 0; +} + +int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) +{ + int r; + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + if (!is_paging(vcpu)) + r = nonpaging_init_context(vcpu, context); + else if (is_long_mode(vcpu)) + r = paging64_init_context(vcpu, context); + else if (is_pae(vcpu)) + r = paging32E_init_context(vcpu, context); + else + r = paging32_init_context(vcpu, context); + + vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); + vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); + + return r; +} +EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); + +static int init_kvm_softmmu(struct kvm_vcpu *vcpu) +{ + int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); + + vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; + vcpu->arch.walk_mmu->get_cr3 = get_cr3; + vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; + + return r; +} + +static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; + + g_context->get_cr3 = get_cr3; + g_context->inject_page_fault = kvm_inject_page_fault; + + /* + * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The + * translation of l2_gpa to l1_gpa addresses is done using the + * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa + * functions between mmu and nested_mmu are swapped. + */ + if (!is_paging(vcpu)) { + g_context->nx = false; + g_context->root_level = 0; + g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; + } else if (is_long_mode(vcpu)) { + g_context->nx = is_nx(vcpu); + reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL); + g_context->root_level = PT64_ROOT_LEVEL; + g_context->gva_to_gpa = paging64_gva_to_gpa_nested; + } else if (is_pae(vcpu)) { + g_context->nx = is_nx(vcpu); + reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL); + g_context->root_level = PT32E_ROOT_LEVEL; + g_context->gva_to_gpa = paging64_gva_to_gpa_nested; + } else { + g_context->nx = false; + reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL); + g_context->root_level = PT32_ROOT_LEVEL; + g_context->gva_to_gpa = paging32_gva_to_gpa_nested; + } + + return 0; +} + +static int init_kvm_mmu(struct kvm_vcpu *vcpu) +{ + vcpu->arch.update_pte.pfn = bad_pfn; + + if (mmu_is_nested(vcpu)) + return init_kvm_nested_mmu(vcpu); + else if (tdp_enabled) + return init_kvm_tdp_mmu(vcpu); + else + return init_kvm_softmmu(vcpu); +} + +static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) + /* mmu.free() should set root_hpa = INVALID_PAGE */ + vcpu->arch.mmu.free(vcpu); +} + +int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) +{ + destroy_kvm_mmu(vcpu); + return init_kvm_mmu(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); + +int kvm_mmu_load(struct kvm_vcpu *vcpu) +{ + int r; + + r = mmu_topup_memory_caches(vcpu); + if (r) + goto out; + r = mmu_alloc_roots(vcpu); + spin_lock(&vcpu->kvm->mmu_lock); + mmu_sync_roots(vcpu); + spin_unlock(&vcpu->kvm->mmu_lock); + if (r) + goto out; + /* set_cr3() should ensure TLB has been flushed */ + vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa); +out: + return r; +} +EXPORT_SYMBOL_GPL(kvm_mmu_load); + +void kvm_mmu_unload(struct kvm_vcpu *vcpu) +{ + mmu_free_roots(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_mmu_unload); + +static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, + u64 *spte) +{ + u64 pte; + struct kvm_mmu_page *child; + + pte = *spte; + if (is_shadow_present_pte(pte)) { + if (is_last_spte(pte, sp->role.level)) + drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte); + else { + child = page_header(pte & PT64_BASE_ADDR_MASK); + mmu_page_remove_parent_pte(child, spte); + } + } + __set_spte(spte, shadow_trap_nonpresent_pte); + if (is_large_pte(pte)) + --vcpu->kvm->stat.lpages; +} + +static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, + u64 *spte, + const void *new) +{ + if (sp->role.level != PT_PAGE_TABLE_LEVEL) { + ++vcpu->kvm->stat.mmu_pde_zapped; + return; + } + + ++vcpu->kvm->stat.mmu_pte_updated; + if (!sp->role.cr4_pae) + paging32_update_pte(vcpu, sp, spte, new); + else + paging64_update_pte(vcpu, sp, spte, new); +} + +static bool need_remote_flush(u64 old, u64 new) +{ + if (!is_shadow_present_pte(old)) + return false; + if (!is_shadow_present_pte(new)) + return true; + if ((old ^ new) & PT64_BASE_ADDR_MASK) + return true; + old ^= PT64_NX_MASK; + new ^= PT64_NX_MASK; + return (old & ~new & PT64_PERM_MASK) != 0; +} + +static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, + bool remote_flush, bool local_flush) +{ + if (zap_page) + return; + + if (remote_flush) + kvm_flush_remote_tlbs(vcpu->kvm); + else if (local_flush) + kvm_mmu_flush_tlb(vcpu); +} + +static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) +{ + u64 *spte = vcpu->arch.last_pte_updated; + + return !!(spte && (*spte & shadow_accessed_mask)); +} + +static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + u64 gpte) +{ + gfn_t gfn; + pfn_t pfn; + + if (!is_present_gpte(gpte)) + return; + gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; + + vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); + pfn = gfn_to_pfn(vcpu->kvm, gfn); + + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + return; + } + vcpu->arch.update_pte.gfn = gfn; + vcpu->arch.update_pte.pfn = pfn; +} + +static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + u64 *spte = vcpu->arch.last_pte_updated; + + if (spte + && vcpu->arch.last_pte_gfn == gfn + && shadow_accessed_mask + && !(*spte & shadow_accessed_mask) + && is_shadow_present_pte(*spte)) + set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); +} + +void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *new, int bytes, + bool guest_initiated) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + union kvm_mmu_page_role mask = { .word = 0 }; + struct kvm_mmu_page *sp; + struct hlist_node *node; + LIST_HEAD(invalid_list); + u64 entry, gentry; + u64 *spte; + unsigned offset = offset_in_page(gpa); + unsigned pte_size; + unsigned page_offset; + unsigned misaligned; + unsigned quadrant; + int level; + int flooded = 0; + int npte; + int r; + int invlpg_counter; + bool remote_flush, local_flush, zap_page; + + zap_page = remote_flush = local_flush = false; + + pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); + + invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter); + + /* + * Assume that the pte write on a page table of the same type + * as the current vcpu paging mode. This is nearly always true + * (might be false while changing modes). Note it is verified later + * by update_pte(). + */ + if ((is_pae(vcpu) && bytes == 4) || !new) { + /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ + if (is_pae(vcpu)) { + gpa &= ~(gpa_t)7; + bytes = 8; + } + r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8)); + if (r) + gentry = 0; + new = (const u8 *)&gentry; + } + + switch (bytes) { + case 4: + gentry = *(const u32 *)new; + break; + case 8: + gentry = *(const u64 *)new; + break; + default: + gentry = 0; + break; + } + + mmu_guess_page_from_pte_write(vcpu, gpa, gentry); + spin_lock(&vcpu->kvm->mmu_lock); + if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) + gentry = 0; + kvm_mmu_access_page(vcpu, gfn); + kvm_mmu_free_some_pages(vcpu); + ++vcpu->kvm->stat.mmu_pte_write; + trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); + if (guest_initiated) { + if (gfn == vcpu->arch.last_pt_write_gfn + && !last_updated_pte_accessed(vcpu)) { + ++vcpu->arch.last_pt_write_count; + if (vcpu->arch.last_pt_write_count >= 3) + flooded = 1; + } else { + vcpu->arch.last_pt_write_gfn = gfn; + vcpu->arch.last_pt_write_count = 1; + vcpu->arch.last_pte_updated = NULL; + } + } + + mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; + for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { + pte_size = sp->role.cr4_pae ? 8 : 4; + misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); + misaligned |= bytes < 4; + if (misaligned || flooded) { + /* + * Misaligned accesses are too much trouble to fix + * up; also, they usually indicate a page is not used + * as a page table. + * + * If we're seeing too many writes to a page, + * it may no longer be a page table, or we may be + * forking, in which case it is better to unmap the + * page. + */ + pgprintk("misaligned: gpa %llx bytes %d role %x\n", + gpa, bytes, sp->role.word); + zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + &invalid_list); + ++vcpu->kvm->stat.mmu_flooded; + continue; + } + page_offset = offset; + level = sp->role.level; + npte = 1; + if (!sp->role.cr4_pae) { + page_offset <<= 1; /* 32->64 */ + /* + * A 32-bit pde maps 4MB while the shadow pdes map + * only 2MB. So we need to double the offset again + * and zap two pdes instead of one. + */ + if (level == PT32_ROOT_LEVEL) { + page_offset &= ~7; /* kill rounding error */ + page_offset <<= 1; + npte = 2; + } + quadrant = page_offset >> PAGE_SHIFT; + page_offset &= ~PAGE_MASK; + if (quadrant != sp->role.quadrant) + continue; + } + local_flush = true; + spte = &sp->spt[page_offset / sizeof(*spte)]; + while (npte--) { + entry = *spte; + mmu_pte_write_zap_pte(vcpu, sp, spte); + if (gentry && + !((sp->role.word ^ vcpu->arch.mmu.base_role.word) + & mask.word)) + mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); + if (!remote_flush && need_remote_flush(entry, *spte)) + remote_flush = true; + ++spte; + } + } + mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); + spin_unlock(&vcpu->kvm->mmu_lock); + if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { + kvm_release_pfn_clean(vcpu->arch.update_pte.pfn); + vcpu->arch.update_pte.pfn = bad_pfn; + } +} + +int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa; + int r; + + if (vcpu->arch.mmu.direct_map) + return 0; + + gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); + + spin_lock(&vcpu->kvm->mmu_lock); + r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); + spin_unlock(&vcpu->kvm->mmu_lock); + return r; +} +EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); + +void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) +{ + LIST_HEAD(invalid_list); + + while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES && + !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { + struct kvm_mmu_page *sp; + + sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, + struct kvm_mmu_page, link); + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + ++vcpu->kvm->stat.mmu_recycled; + } +} + +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, + void *insn, int insn_len) +{ + int r; + enum emulation_result er; + + r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); + if (r < 0) + goto out; + + if (!r) { + r = 1; + goto out; + } + + r = mmu_topup_memory_caches(vcpu); + if (r) + goto out; + + er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len); + + switch (er) { + case EMULATE_DONE: + return 1; + case EMULATE_DO_MMIO: + ++vcpu->stat.mmio_exits; + /* fall through */ + case EMULATE_FAIL: + return 0; + default: + BUG(); + } +out: + return r; +} +EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); + +void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +{ + vcpu->arch.mmu.invlpg(vcpu, gva); + kvm_mmu_flush_tlb(vcpu); + ++vcpu->stat.invlpg; +} +EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); + +void kvm_enable_tdp(void) +{ + tdp_enabled = true; +} +EXPORT_SYMBOL_GPL(kvm_enable_tdp); + +void kvm_disable_tdp(void) +{ + tdp_enabled = false; +} +EXPORT_SYMBOL_GPL(kvm_disable_tdp); + +static void free_mmu_pages(struct kvm_vcpu *vcpu) +{ + free_page((unsigned long)vcpu->arch.mmu.pae_root); + if (vcpu->arch.mmu.lm_root != NULL) + free_page((unsigned long)vcpu->arch.mmu.lm_root); +} + +static int alloc_mmu_pages(struct kvm_vcpu *vcpu) +{ + struct page *page; + int i; + + ASSERT(vcpu); + + /* + * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. + * Therefore we need to allocate shadow page tables in the first + * 4GB of memory, which happens to fit the DMA32 zone. + */ + page = alloc_page(GFP_KERNEL | __GFP_DMA32); + if (!page) + return -ENOMEM; + + vcpu->arch.mmu.pae_root = page_address(page); + for (i = 0; i < 4; ++i) + vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; + + return 0; +} + +int kvm_mmu_create(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + return alloc_mmu_pages(vcpu); +} + +int kvm_mmu_setup(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + return init_kvm_mmu(vcpu); +} + +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) +{ + struct kvm_mmu_page *sp; + + list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { + int i; + u64 *pt; + + if (!test_bit(slot, sp->slot_bitmap)) + continue; + + if (sp->role.level != PT_PAGE_TABLE_LEVEL) + continue; + + pt = sp->spt; + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + /* avoid RMW */ + if (is_writable_pte(pt[i])) + update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); + } + kvm_flush_remote_tlbs(kvm); +} + +void kvm_mmu_zap_all(struct kvm *kvm) +{ + struct kvm_mmu_page *sp, *node; + LIST_HEAD(invalid_list); + + spin_lock(&kvm->mmu_lock); +restart: + list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) + goto restart; + + kvm_mmu_commit_zap_page(kvm, &invalid_list); + spin_unlock(&kvm->mmu_lock); +} + +static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, + struct list_head *invalid_list) +{ + struct kvm_mmu_page *page; + + page = container_of(kvm->arch.active_mmu_pages.prev, + struct kvm_mmu_page, link); + return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35) +static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) +#else +static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) +#endif +{ + struct kvm *kvm; + struct kvm *kvm_freed = NULL; + + if (nr_to_scan == 0) + goto out; + + spin_lock(&kvm_lock); + + list_for_each_entry(kvm, &vm_list, vm_list) { + int idx, freed_pages; + LIST_HEAD(invalid_list); + + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + if (!kvm_freed && nr_to_scan > 0 && + kvm->arch.n_used_mmu_pages > 0) { + freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, + &invalid_list); + kvm_freed = kvm; + } + nr_to_scan--; + + kvm_mmu_commit_zap_page(kvm, &invalid_list); + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); + } + if (kvm_freed) + list_move_tail(&kvm_freed->vm_list, &vm_list); + + spin_unlock(&kvm_lock); + +out: + return percpu_counter_read_positive(&kvm_total_used_mmu_pages); +} + +static struct shrinker mmu_shrinker = { + .shrink = mmu_shrink, + .seeks = DEFAULT_SEEKS * 10, +}; + +static void mmu_destroy_caches(void) +{ + if (pte_chain_cache) + kmem_cache_destroy(pte_chain_cache); + if (rmap_desc_cache) + kmem_cache_destroy(rmap_desc_cache); + if (mmu_page_header_cache) + kmem_cache_destroy(mmu_page_header_cache); +} + +int kvm_mmu_module_init(void) +{ + pte_chain_cache = kmem_cache_create("kvm_pte_chain", + sizeof(struct kvm_pte_chain), + 0, 0, NULL); + if (!pte_chain_cache) + goto nomem; + rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", + sizeof(struct kvm_rmap_desc), + 0, 0, NULL); + if (!rmap_desc_cache) + goto nomem; + + mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", + sizeof(struct kvm_mmu_page), + 0, 0, NULL); + if (!mmu_page_header_cache) + goto nomem; + + if (percpu_counter_init(&kvm_total_used_mmu_pages, 0)) + goto nomem; + + register_shrinker(&mmu_shrinker); + + return 0; + +nomem: + mmu_destroy_caches(); + return -ENOMEM; +} + +/* + * Caculate mmu pages needed for kvm. + */ +unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) +{ + int i; + unsigned int nr_mmu_pages; + unsigned int nr_pages = 0; + struct kvm_memslots *slots; + + slots = kvm_memslots(kvm); + + for (i = 0; i < slots->nmemslots; i++) + nr_pages += slots->memslots[i].npages; + + nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; + nr_mmu_pages = max(nr_mmu_pages, + (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); + + return nr_mmu_pages; +} + +static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer, + unsigned len) +{ + if (len > buffer->len) + return NULL; + return buffer->ptr; +} + +static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer, + unsigned len) +{ + void *ret; + + ret = pv_mmu_peek_buffer(buffer, len); + if (!ret) + return ret; + buffer->ptr += len; + buffer->len -= len; + buffer->processed += len; + return ret; +} + +static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, + gpa_t addr, gpa_t value) +{ + int bytes = 8; + int r; + + if (!is_long_mode(vcpu) && !is_pae(vcpu)) + bytes = 4; + + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; + + if (!emulator_write_phys(vcpu, addr, &value, bytes)) + return -EFAULT; + + return 1; +} + +static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) +{ + (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu)); + return 1; +} + +static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr) +{ + spin_lock(&vcpu->kvm->mmu_lock); + mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT); + spin_unlock(&vcpu->kvm->mmu_lock); + return 1; +} + +static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu, + struct kvm_pv_mmu_op_buffer *buffer) +{ + struct kvm_mmu_op_header *header; + + header = pv_mmu_peek_buffer(buffer, sizeof *header); + if (!header) + return 0; + switch (header->op) { + case KVM_MMU_OP_WRITE_PTE: { + struct kvm_mmu_op_write_pte *wpte; + + wpte = pv_mmu_read_buffer(buffer, sizeof *wpte); + if (!wpte) + return 0; + return kvm_pv_mmu_write(vcpu, wpte->pte_phys, + wpte->pte_val); + } + case KVM_MMU_OP_FLUSH_TLB: { + struct kvm_mmu_op_flush_tlb *ftlb; + + ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb); + if (!ftlb) + return 0; + return kvm_pv_mmu_flush_tlb(vcpu); + } + case KVM_MMU_OP_RELEASE_PT: { + struct kvm_mmu_op_release_pt *rpt; + + rpt = pv_mmu_read_buffer(buffer, sizeof *rpt); + if (!rpt) + return 0; + return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys); + } + default: return 0; + } +} + +int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, + gpa_t addr, unsigned long *ret) +{ + int r; + struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer; + + buffer->ptr = buffer->buf; + buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf); + buffer->processed = 0; + + r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len); + if (r) + goto out; + + while (buffer->len) { + r = kvm_pv_mmu_op_one(vcpu, buffer); + if (r < 0) + goto out; + if (r == 0) + break; + } + + r = 1; +out: + *ret = buffer->processed; + return r; +} + +int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) +{ + struct kvm_shadow_walk_iterator iterator; + int nr_sptes = 0; + + spin_lock(&vcpu->kvm->mmu_lock); + for_each_shadow_entry(vcpu, addr, iterator) { + sptes[iterator.level-1] = *iterator.sptep; + nr_sptes++; + if (!is_shadow_present_pte(*iterator.sptep)) + break; + } + spin_unlock(&vcpu->kvm->mmu_lock); + + return nr_sptes; +} +EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); + +void kvm_mmu_destroy(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + + destroy_kvm_mmu(vcpu); + free_mmu_pages(vcpu); + mmu_free_memory_caches(vcpu); +} + +#ifdef CONFIG_KVM_MMU_AUDIT +#include "mmu_audit.c" +#else +static void mmu_audit_disable(void) { } +#endif + +void kvm_mmu_module_exit(void) +{ + mmu_destroy_caches(); + percpu_counter_destroy(&kvm_total_used_mmu_pages); + unregister_shrinker(&mmu_shrinker); + mmu_audit_disable(); +} diff --git a/linux/x86/mmu.h b/linux/x86/mmu.h new file mode 100644 index 0000000..7086ca8 --- /dev/null +++ b/linux/x86/mmu.h @@ -0,0 +1,79 @@ +#ifndef __KVM_X86_MMU_H +#define __KVM_X86_MMU_H + +#include <linux/kvm_host.h> +#include "kvm_cache_regs.h" + +#define PT64_PT_BITS 9 +#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) +#define PT32_PT_BITS 10 +#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) + +#define PT_WRITABLE_SHIFT 1 + +#define PT_PRESENT_MASK (1ULL << 0) +#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) +#define PT_USER_MASK (1ULL << 2) +#define PT_PWT_MASK (1ULL << 3) +#define PT_PCD_MASK (1ULL << 4) +#define PT_ACCESSED_SHIFT 5 +#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT) +#define PT_DIRTY_MASK (1ULL << 6) +#define PT_PAGE_SIZE_MASK (1ULL << 7) +#define PT_PAT_MASK (1ULL << 7) +#define PT_GLOBAL_MASK (1ULL << 8) +#define PT64_NX_SHIFT 63 +#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT) + +#define PT_PAT_SHIFT 7 +#define PT_DIR_PAT_SHIFT 12 +#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) + +#define PT32_DIR_PSE36_SIZE 4 +#define PT32_DIR_PSE36_SHIFT 13 +#define PT32_DIR_PSE36_MASK \ + (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) + +#define PT64_ROOT_LEVEL 4 +#define PT32_ROOT_LEVEL 2 +#define PT32E_ROOT_LEVEL 3 + +#define PT_PDPE_LEVEL 3 +#define PT_DIRECTORY_LEVEL 2 +#define PT_PAGE_TABLE_LEVEL 1 + +#define PFERR_PRESENT_MASK (1U << 0) +#define PFERR_WRITE_MASK (1U << 1) +#define PFERR_USER_MASK (1U << 2) +#define PFERR_RSVD_MASK (1U << 3) +#define PFERR_FETCH_MASK (1U << 4) + +int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); +int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); + +static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) +{ + return kvm->arch.n_max_mmu_pages - + kvm->arch.n_used_mmu_pages; +} + +static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) +{ + if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES)) + __kvm_mmu_free_some_pages(vcpu); +} + +static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) +{ + if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE)) + return 0; + + return kvm_mmu_load(vcpu); +} + +static inline int is_present_gpte(unsigned long pte) +{ + return pte & PT_PRESENT_MASK; +} + +#endif diff --git a/linux/x86/mmu_audit.c b/linux/x86/mmu_audit.c new file mode 100644 index 0000000..3b2d201 --- /dev/null +++ b/linux/x86/mmu_audit.c @@ -0,0 +1,344 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +/* + * mmu_audit.c: + * + * Audit code for KVM MMU + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Yaniv Kamay <yaniv@qumranet.com> + * Avi Kivity <avi@qumranet.com> + * Marcelo Tosatti <mtosatti@redhat.com> + * Xiao Guangrong <xiaoguangrong@cn.fujitsu.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include <linux/ratelimit.h> + +#define audit_printk(kvm, fmt, args...) \ + printk(KERN_ERR "audit: (%s) error: " \ + fmt, audit_point_name[kvm->arch.audit_point], ##args) + +typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); + +static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + inspect_spte_fn fn, int level) +{ + int i; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + u64 *ent = sp->spt; + + fn(vcpu, ent + i, level); + + if (is_shadow_present_pte(ent[i]) && + !is_last_spte(ent[i], level)) { + struct kvm_mmu_page *child; + + child = page_header(ent[i] & PT64_BASE_ADDR_MASK); + __mmu_spte_walk(vcpu, child, fn, level - 1); + } + } +} + +static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) +{ + int i; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + + if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + + sp = page_header(root); + __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL); + return; + } + + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root && VALID_PAGE(root)) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + __mmu_spte_walk(vcpu, sp, fn, 2); + } + } + + return; +} + +typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp); + +static void walk_all_active_sps(struct kvm *kvm, sp_handler fn) +{ + struct kvm_mmu_page *sp; + + list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) + fn(kvm, sp); +} + +static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) +{ + struct kvm_mmu_page *sp; + gfn_t gfn; + pfn_t pfn; + hpa_t hpa; + + sp = page_header(__pa(sptep)); + + if (sp->unsync) { + if (level != PT_PAGE_TABLE_LEVEL) { + audit_printk(vcpu->kvm, "unsync sp: %p " + "level = %d\n", sp, level); + return; + } + + if (*sptep == shadow_notrap_nonpresent_pte) { + audit_printk(vcpu->kvm, "notrap spte in unsync " + "sp: %p\n", sp); + return; + } + } + + if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { + audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n", + sp); + return; + } + + if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level)) + return; + + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); + + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + return; + } + + hpa = pfn << PAGE_SHIFT; + if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) + audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx " + "ent %llxn", vcpu->arch.mmu.root_level, pfn, + hpa, *sptep); +} + +static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) +{ + unsigned long *rmapp; + struct kvm_mmu_page *rev_sp; + gfn_t gfn; + + + rev_sp = page_header(__pa(sptep)); + gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); + + if (!gfn_to_memslot(kvm, gfn)) { + if (!printk_ratelimit()) + return; + audit_printk(kvm, "no memslot for gfn %llx\n", gfn); + audit_printk(kvm, "index %ld of sp (gfn=%llx)\n", + (long int)(sptep - rev_sp->spt), rev_sp->gfn); + dump_stack(); + return; + } + + rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); + if (!*rmapp) { + if (!printk_ratelimit()) + return; + audit_printk(kvm, "no rmap for writable spte %llx\n", + *sptep); + dump_stack(); + } +} + +static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level) +{ + if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level)) + inspect_spte_has_rmap(vcpu->kvm, sptep); +} + +static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level) +{ + struct kvm_mmu_page *sp = page_header(__pa(sptep)); + + if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync) + audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync " + "root.\n", sp); +} + +static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + int i; + + if (sp->role.level != PT_PAGE_TABLE_LEVEL) + return; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (!is_rmap_spte(sp->spt[i])) + continue; + + inspect_spte_has_rmap(kvm, sp->spt + i); + } +} + +static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + struct kvm_memory_slot *slot; + unsigned long *rmapp; + u64 *spte; + + if (sp->role.direct || sp->unsync || sp->role.invalid) + return; + + slot = gfn_to_memslot(kvm, sp->gfn); + rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + if (is_writable_pte(*spte)) + audit_printk(kvm, "shadow page has writable " + "mappings: gfn %llx role %x\n", + sp->gfn, sp->role.word); + spte = rmap_next(kvm, rmapp, spte); + } +} + +static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + check_mappings_rmap(kvm, sp); + audit_write_protection(kvm, sp); +} + +static void audit_all_active_sps(struct kvm *kvm) +{ + walk_all_active_sps(kvm, audit_sp); +} + +static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level) +{ + audit_sptes_have_rmaps(vcpu, sptep, level); + audit_mappings(vcpu, sptep, level); + audit_spte_after_sync(vcpu, sptep, level); +} + +static void audit_vcpu_spte(struct kvm_vcpu *vcpu) +{ + mmu_spte_walk(vcpu, audit_spte); +} + +static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point) +{ + static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); + + if (!__ratelimit(&ratelimit_state)) + return; + + vcpu->kvm->arch.audit_point = point; + audit_all_active_sps(vcpu->kvm); + audit_vcpu_spte(vcpu); +} + +static bool mmu_audit; + +static void mmu_audit_enable(void) +{ + int ret; + + if (mmu_audit) + return; + + ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); + WARN_ON(ret); + + mmu_audit = true; +} + +static void mmu_audit_disable(void) +{ + if (!mmu_audit) + return; + + unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); + tracepoint_synchronize_unregister(); + mmu_audit = false; +} + +static int mmu_audit_set(const char *val, const struct kernel_param *kp) +{ + int ret; + unsigned long enable; + + ret = strict_strtoul(val, 10, &enable); + if (ret < 0) + return -EINVAL; + + switch (enable) { + case 0: + mmu_audit_disable(); + break; + case 1: + mmu_audit_enable(); + break; + default: + return -EINVAL; + } + + return 0; +} + +static struct kernel_param_ops audit_param_ops = { + .set = mmu_audit_set, + .get = param_get_bool, +}; + +module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); diff --git a/linux/x86/mmutrace.h b/linux/x86/mmutrace.h new file mode 100644 index 0000000..b60b4fd --- /dev/null +++ b/linux/x86/mmutrace.h @@ -0,0 +1,225 @@ +#if !defined(_TRACE_KVMMMU_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVMMMU_H + +#include <linux/tracepoint.h> +#include <linux/ftrace_event.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvmmmu + +#define KVM_MMU_PAGE_FIELDS \ + __field(__u64, gfn) \ + __field(__u32, role) \ + __field(__u32, root_count) \ + __field(bool, unsync) + +#define KVM_MMU_PAGE_ASSIGN(sp) \ + __entry->gfn = sp->gfn; \ + __entry->role = sp->role.word; \ + __entry->root_count = sp->root_count; \ + __entry->unsync = sp->unsync; + +#define KVM_MMU_PAGE_PRINTK() ({ \ + const char *ret = p->buffer + p->len; \ + static const char *access_str[] = { \ + "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \ + }; \ + union kvm_mmu_page_role role; \ + \ + role.word = __entry->role; \ + \ + trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" \ + " %snxe root %u %s%c", \ + __entry->gfn, role.level, \ + role.cr4_pae ? " pae" : "", \ + role.quadrant, \ + role.direct ? " direct" : "", \ + access_str[role.access], \ + role.invalid ? " invalid" : "", \ + role.nxe ? "" : "!", \ + __entry->root_count, \ + __entry->unsync ? "unsync" : "sync", 0); \ + ret; \ + }) + +#define kvm_mmu_trace_pferr_flags \ + { PFERR_PRESENT_MASK, "P" }, \ + { PFERR_WRITE_MASK, "W" }, \ + { PFERR_USER_MASK, "U" }, \ + { PFERR_RSVD_MASK, "RSVD" }, \ + { PFERR_FETCH_MASK, "F" } + +/* + * A pagetable walk has started + */ +TRACE_EVENT( + kvm_mmu_pagetable_walk, + TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault), + TP_ARGS(addr, write_fault, user_fault, fetch_fault), + + TP_STRUCT__entry( + __field(__u64, addr) + __field(__u32, pferr) + ), + + TP_fast_assign( + __entry->addr = addr; + __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2) + | (!!fetch_fault << 4); + ), + + TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr, + __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags)) +); + + +/* We just walked a paging element */ +TRACE_EVENT( + kvm_mmu_paging_element, + TP_PROTO(u64 pte, int level), + TP_ARGS(pte, level), + + TP_STRUCT__entry( + __field(__u64, pte) + __field(__u32, level) + ), + + TP_fast_assign( + __entry->pte = pte; + __entry->level = level; + ), + + TP_printk("pte %llx level %u", __entry->pte, __entry->level) +); + +DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class, + + TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), + + TP_ARGS(table_gfn, index, size), + + TP_STRUCT__entry( + __field(__u64, gpa) + ), + + TP_fast_assign( + __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) + + index * size; + ), + + TP_printk("gpa %llx", __entry->gpa) +); + +/* We set a pte accessed bit */ +DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_accessed_bit, + + TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), + + TP_ARGS(table_gfn, index, size) +); + +/* We set a pte dirty bit */ +DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_dirty_bit, + + TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), + + TP_ARGS(table_gfn, index, size) +); + +TRACE_EVENT( + kvm_mmu_walker_error, + TP_PROTO(u32 pferr), + TP_ARGS(pferr), + + TP_STRUCT__entry( + __field(__u32, pferr) + ), + + TP_fast_assign( + __entry->pferr = pferr; + ), + + TP_printk("pferr %x %s", __entry->pferr, + __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags)) +); + +TRACE_EVENT( + kvm_mmu_get_page, + TP_PROTO(struct kvm_mmu_page *sp, bool created), + TP_ARGS(sp, created), + + TP_STRUCT__entry( + KVM_MMU_PAGE_FIELDS + __field(bool, created) + ), + + TP_fast_assign( + KVM_MMU_PAGE_ASSIGN(sp) + __entry->created = created; + ), + + TP_printk("%s %s", KVM_MMU_PAGE_PRINTK(), + __entry->created ? "new" : "existing") +); + +DECLARE_EVENT_CLASS(kvm_mmu_page_class, + + TP_PROTO(struct kvm_mmu_page *sp), + TP_ARGS(sp), + + TP_STRUCT__entry( + KVM_MMU_PAGE_FIELDS + ), + + TP_fast_assign( + KVM_MMU_PAGE_ASSIGN(sp) + ), + + TP_printk("%s", KVM_MMU_PAGE_PRINTK()) +); + +DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_sync_page, + TP_PROTO(struct kvm_mmu_page *sp), + + TP_ARGS(sp) +); + +DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page, + TP_PROTO(struct kvm_mmu_page *sp), + + TP_ARGS(sp) +); + +DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, + TP_PROTO(struct kvm_mmu_page *sp), + + TP_ARGS(sp) +); + +TRACE_EVENT( + kvm_mmu_audit, + TP_PROTO(struct kvm_vcpu *vcpu, int audit_point), + TP_ARGS(vcpu, audit_point), + + TP_STRUCT__entry( + __field(struct kvm_vcpu *, vcpu) + __field(int, audit_point) + ), + + TP_fast_assign( + __entry->vcpu = vcpu; + __entry->audit_point = audit_point; + ), + + TP_printk("vcpu:%d %s", __entry->vcpu->cpu, + audit_point_name[__entry->audit_point]) +); +#endif /* _TRACE_KVMMMU_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE mmutrace + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/linux/x86/paging_tmpl.h b/linux/x86/paging_tmpl.h new file mode 100644 index 0000000..6bccc24 --- /dev/null +++ b/linux/x86/paging_tmpl.h @@ -0,0 +1,839 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * MMU support + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Yaniv Kamay <yaniv@qumranet.com> + * Avi Kivity <avi@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +/* + * We need the mmu code to access both 32-bit and 64-bit guest ptes, + * so the code in this file is compiled twice, once per pte size. + */ + +#if PTTYPE == 64 + #define pt_element_t u64 + #define guest_walker guest_walker64 + #define FNAME(name) paging##64_##name + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) + #define PT_LEVEL_BITS PT64_LEVEL_BITS + #ifdef CONFIG_X86_64 + #define PT_MAX_FULL_LEVELS 4 + #define CMPXCHG cmpxchg + #else + #define CMPXCHG cmpxchg64 + #define PT_MAX_FULL_LEVELS 2 + #endif +#elif PTTYPE == 32 + #define pt_element_t u32 + #define guest_walker guest_walker32 + #define FNAME(name) paging##32_##name + #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK + #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) + #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) + #define PT_INDEX(addr, level) PT32_INDEX(addr, level) + #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) + #define PT_LEVEL_BITS PT32_LEVEL_BITS + #define PT_MAX_FULL_LEVELS 2 + #define CMPXCHG cmpxchg +#else + #error Invalid PTTYPE value +#endif + +#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl) +#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL) + +/* + * The guest_walker structure emulates the behavior of the hardware page + * table walker. + */ +struct guest_walker { + int level; + gfn_t table_gfn[PT_MAX_FULL_LEVELS]; + pt_element_t ptes[PT_MAX_FULL_LEVELS]; + pt_element_t prefetch_ptes[PTE_PREFETCH_NUM]; + gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; + unsigned pt_access; + unsigned pte_access; + gfn_t gfn; + struct x86_exception fault; +}; + +static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) +{ + return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; +} + +static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, + gfn_t table_gfn, unsigned index, + pt_element_t orig_pte, pt_element_t new_pte) +{ + pt_element_t ret; + pt_element_t *table; + struct page *page; + + page = gfn_to_page(kvm, table_gfn); + + table = kmap_atomic(page, KM_USER0); + ret = CMPXCHG(&table[index], orig_pte, new_pte); + kunmap_atomic(table, KM_USER0); + + kvm_release_page_dirty(page); + + return (ret != orig_pte); +} + +static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) +{ + unsigned access; + + access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; +#if PTTYPE == 64 + if (vcpu->arch.mmu.nx) + access &= ~(gpte >> PT64_NX_SHIFT); +#endif + return access; +} + +/* + * Fetch a guest pte for a guest virtual address + */ +static int FNAME(walk_addr_generic)(struct guest_walker *walker, + struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gva_t addr, u32 access) +{ + pt_element_t pte; + gfn_t table_gfn; + unsigned index, pt_access, uninitialized_var(pte_access); + gpa_t pte_gpa; + bool eperm, present, rsvd_fault; + int offset, write_fault, user_fault, fetch_fault; + + write_fault = access & PFERR_WRITE_MASK; + user_fault = access & PFERR_USER_MASK; + fetch_fault = access & PFERR_FETCH_MASK; + + trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, + fetch_fault); +walk: + present = true; + eperm = rsvd_fault = false; + walker->level = mmu->root_level; + pte = mmu->get_cr3(vcpu); + +#if PTTYPE == 64 + if (walker->level == PT32E_ROOT_LEVEL) { + pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3); + trace_kvm_mmu_paging_element(pte, walker->level); + if (!is_present_gpte(pte)) { + present = false; + goto error; + } + --walker->level; + } +#endif + ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || + (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); + + pt_access = ACC_ALL; + + for (;;) { + index = PT_INDEX(addr, walker->level); + + table_gfn = gpte_to_gfn(pte); + offset = index * sizeof(pt_element_t); + pte_gpa = gfn_to_gpa(table_gfn) + offset; + walker->table_gfn[walker->level - 1] = table_gfn; + walker->pte_gpa[walker->level - 1] = pte_gpa; + + if (kvm_read_guest_page_mmu(vcpu, mmu, table_gfn, &pte, + offset, sizeof(pte), + PFERR_USER_MASK|PFERR_WRITE_MASK)) { + present = false; + break; + } + + trace_kvm_mmu_paging_element(pte, walker->level); + + if (!is_present_gpte(pte)) { + present = false; + break; + } + + if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) { + rsvd_fault = true; + break; + } + + if (write_fault && !is_writable_pte(pte)) + if (user_fault || is_write_protection(vcpu)) + eperm = true; + + if (user_fault && !(pte & PT_USER_MASK)) + eperm = true; + +#if PTTYPE == 64 + if (fetch_fault && (pte & PT64_NX_MASK)) + eperm = true; +#endif + + if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) { + trace_kvm_mmu_set_accessed_bit(table_gfn, index, + sizeof(pte)); + if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, + index, pte, pte|PT_ACCESSED_MASK)) + goto walk; + mark_page_dirty(vcpu->kvm, table_gfn); + pte |= PT_ACCESSED_MASK; + } + + pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); + + walker->ptes[walker->level - 1] = pte; + + if ((walker->level == PT_PAGE_TABLE_LEVEL) || + ((walker->level == PT_DIRECTORY_LEVEL) && + is_large_pte(pte) && + (PTTYPE == 64 || is_pse(vcpu))) || + ((walker->level == PT_PDPE_LEVEL) && + is_large_pte(pte) && + mmu->root_level == PT64_ROOT_LEVEL)) { + int lvl = walker->level; + gpa_t real_gpa; + gfn_t gfn; + u32 ac; + + gfn = gpte_to_gfn_lvl(pte, lvl); + gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; + + if (PTTYPE == 32 && + walker->level == PT_DIRECTORY_LEVEL && + is_cpuid_PSE36()) + gfn += pse36_gfn_delta(pte); + + ac = write_fault | fetch_fault | user_fault; + + real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), + ac); + if (real_gpa == UNMAPPED_GVA) + return 0; + + walker->gfn = real_gpa >> PAGE_SHIFT; + + break; + } + + pt_access = pte_access; + --walker->level; + } + + if (!present || eperm || rsvd_fault) + goto error; + + if (write_fault && !is_dirty_gpte(pte)) { + bool ret; + + trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); + ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, + pte|PT_DIRTY_MASK); + if (ret) + goto walk; + mark_page_dirty(vcpu->kvm, table_gfn); + pte |= PT_DIRTY_MASK; + walker->ptes[walker->level - 1] = pte; + } + + walker->pt_access = pt_access; + walker->pte_access = pte_access; + pgprintk("%s: pte %llx pte_access %x pt_access %x\n", + __func__, (u64)pte, pte_access, pt_access); + return 1; + +error: + walker->fault.vector = PF_VECTOR; + walker->fault.error_code_valid = true; + walker->fault.error_code = 0; + if (present) + walker->fault.error_code |= PFERR_PRESENT_MASK; + + walker->fault.error_code |= write_fault | user_fault; + + if (fetch_fault && mmu->nx) + walker->fault.error_code |= PFERR_FETCH_MASK; + if (rsvd_fault) + walker->fault.error_code |= PFERR_RSVD_MASK; + + walker->fault.address = addr; + walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; + + trace_kvm_mmu_walker_error(walker->fault.error_code); + return 0; +} + +static int FNAME(walk_addr)(struct guest_walker *walker, + struct kvm_vcpu *vcpu, gva_t addr, u32 access) +{ + return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr, + access); +} + +static int FNAME(walk_addr_nested)(struct guest_walker *walker, + struct kvm_vcpu *vcpu, gva_t addr, + u32 access) +{ + return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, + addr, access); +} + +static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *spte, + pt_element_t gpte) +{ + u64 nonpresent = shadow_trap_nonpresent_pte; + + if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + goto no_present; + + if (!is_present_gpte(gpte)) { + if (!sp->unsync) + nonpresent = shadow_notrap_nonpresent_pte; + goto no_present; + } + + if (!(gpte & PT_ACCESSED_MASK)) + goto no_present; + + return false; + +no_present: + drop_spte(vcpu->kvm, spte, nonpresent); + return true; +} + +static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + u64 *spte, const void *pte) +{ + pt_element_t gpte; + unsigned pte_access; + pfn_t pfn; + + gpte = *(const pt_element_t *)pte; + if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) + return; + + pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) + return; + pfn = vcpu->arch.update_pte.pfn; + if (is_error_pfn(pfn)) + return; + if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq)) + return; + kvm_get_pfn(pfn); + /* + * we call mmu_set_spte() with host_writable = true beacuse that + * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). + */ + mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, + is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL, + gpte_to_gfn(gpte), pfn, true, true); +} + +static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, + struct guest_walker *gw, int level) +{ + pt_element_t curr_pte; + gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1]; + u64 mask; + int r, index; + + if (level == PT_PAGE_TABLE_LEVEL) { + mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1; + base_gpa = pte_gpa & ~mask; + index = (pte_gpa - base_gpa) / sizeof(pt_element_t); + + r = kvm_read_guest_atomic(vcpu->kvm, base_gpa, + gw->prefetch_ptes, sizeof(gw->prefetch_ptes)); + curr_pte = gw->prefetch_ptes[index]; + } else + r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, + &curr_pte, sizeof(curr_pte)); + + return r || curr_pte != gw->ptes[level - 1]; +} + +static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, + u64 *sptep) +{ + struct kvm_mmu_page *sp; + pt_element_t *gptep = gw->prefetch_ptes; + u64 *spte; + int i; + + sp = page_header(__pa(sptep)); + + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + return; + + if (sp->role.direct) + return __direct_pte_prefetch(vcpu, sp, sptep); + + i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); + spte = sp->spt + i; + + for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { + pt_element_t gpte; + unsigned pte_access; + gfn_t gfn; + pfn_t pfn; + bool dirty; + + if (spte == sptep) + continue; + + if (*spte != shadow_trap_nonpresent_pte) + continue; + + gpte = gptep[i]; + + if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) + continue; + + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + gfn = gpte_to_gfn(gpte); + dirty = is_dirty_gpte(gpte); + pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, + (pte_access & ACC_WRITE_MASK) && dirty); + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + break; + } + + mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, + dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn, + pfn, true, true); + } +} + +/* + * Fetch a shadow pte for a specific level in the paging hierarchy. + */ +static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + struct guest_walker *gw, + int user_fault, int write_fault, int hlevel, + int *ptwrite, pfn_t pfn, bool map_writable, + bool prefault) +{ + unsigned access = gw->pt_access; + struct kvm_mmu_page *sp = NULL; + bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]); + int top_level; + unsigned direct_access; + struct kvm_shadow_walk_iterator it; + + if (!is_present_gpte(gw->ptes[gw->level - 1])) + return NULL; + + direct_access = gw->pt_access & gw->pte_access; + if (!dirty) + direct_access &= ~ACC_WRITE_MASK; + + top_level = vcpu->arch.mmu.root_level; + if (top_level == PT32E_ROOT_LEVEL) + top_level = PT32_ROOT_LEVEL; + /* + * Verify that the top-level gpte is still there. Since the page + * is a root page, it is either write protected (and cannot be + * changed from now on) or it is invalid (in which case, we don't + * really care if it changes underneath us after this point). + */ + if (FNAME(gpte_changed)(vcpu, gw, top_level)) + goto out_gpte_changed; + + for (shadow_walk_init(&it, vcpu, addr); + shadow_walk_okay(&it) && it.level > gw->level; + shadow_walk_next(&it)) { + gfn_t table_gfn; + + drop_large_spte(vcpu, it.sptep); + + sp = NULL; + if (!is_shadow_present_pte(*it.sptep)) { + table_gfn = gw->table_gfn[it.level - 2]; + sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1, + false, access, it.sptep); + } + + /* + * Verify that the gpte in the page we've just write + * protected is still there. + */ + if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) + goto out_gpte_changed; + + if (sp) + link_shadow_page(it.sptep, sp); + } + + for (; + shadow_walk_okay(&it) && it.level > hlevel; + shadow_walk_next(&it)) { + gfn_t direct_gfn; + + validate_direct_spte(vcpu, it.sptep, direct_access); + + drop_large_spte(vcpu, it.sptep); + + if (is_shadow_present_pte(*it.sptep)) + continue; + + direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + + sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, + true, direct_access, it.sptep); + link_shadow_page(it.sptep, sp); + } + + mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, + user_fault, write_fault, dirty, ptwrite, it.level, + gw->gfn, pfn, prefault, map_writable); + FNAME(pte_prefetch)(vcpu, gw, it.sptep); + + return it.sptep; + +out_gpte_changed: + if (sp) + kvm_mmu_put_page(sp, it.sptep); + kvm_release_pfn_clean(pfn); + return NULL; +} + +/* + * Page fault handler. There are several causes for a page fault: + * - there is no shadow pte for the guest pte + * - write access through a shadow pte marked read only so that we can set + * the dirty bit + * - write access to a shadow pte marked read only so we can update the page + * dirty bitmap, when userspace requests it + * - mmio access; in this case we will never install a present shadow pte + * - normal guest page fault due to the guest pte marked not present, not + * writable, or not executable + * + * Returns: 1 if we need to emulate the instruction, 0 otherwise, or + * a negative value on error. + */ +static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, + bool prefault) +{ + int write_fault = error_code & PFERR_WRITE_MASK; + int user_fault = error_code & PFERR_USER_MASK; + struct guest_walker walker; + u64 *sptep; + int write_pt = 0; + int r; + pfn_t pfn; + int level = PT_PAGE_TABLE_LEVEL; + int force_pt_level; + unsigned long mmu_seq; + bool map_writable; + + pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); + + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; + + /* + * Look up the guest pte for the faulting address. + */ + r = FNAME(walk_addr)(&walker, vcpu, addr, error_code); + + /* + * The page is not mapped by the guest. Let the guest handle it. + */ + if (!r) { + pgprintk("%s: guest page fault\n", __func__); + if (!prefault) { + inject_page_fault(vcpu, &walker.fault); + /* reset fork detector */ + vcpu->arch.last_pt_write_count = 0; + } + return 0; + } + + if (walker.level >= PT_DIRECTORY_LEVEL) + force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn); + else + force_pt_level = 1; + if (!force_pt_level) { + level = min(walker.level, mapping_level(vcpu, walker.gfn)); + walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); + } + + mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); + + if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault, + &map_writable)) + return 0; + + /* mmio */ + if (is_error_pfn(pfn)) + return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn); + + spin_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu, mmu_seq)) + goto out_unlock; + + trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); + kvm_mmu_free_some_pages(vcpu); + if (!force_pt_level) + transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); + sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, + level, &write_pt, pfn, map_writable, prefault); + (void)sptep; + pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, + sptep, *sptep, write_pt); + + if (!write_pt) + vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ + + ++vcpu->stat.pf_fixed; + trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); + spin_unlock(&vcpu->kvm->mmu_lock); + + return write_pt; + +out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + return 0; +} + +static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) +{ + struct kvm_shadow_walk_iterator iterator; + struct kvm_mmu_page *sp; + gpa_t pte_gpa = -1; + int level; + u64 *sptep; + int need_flush = 0; + + spin_lock(&vcpu->kvm->mmu_lock); + + for_each_shadow_entry(vcpu, gva, iterator) { + level = iterator.level; + sptep = iterator.sptep; + + sp = page_header(__pa(sptep)); + if (is_last_spte(*sptep, level)) { + int offset, shift; + + if (!sp->unsync) + break; + + shift = PAGE_SHIFT - + (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level; + offset = sp->role.quadrant << shift; + + pte_gpa = (sp->gfn << PAGE_SHIFT) + offset; + pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); + + if (is_shadow_present_pte(*sptep)) { + if (is_large_pte(*sptep)) + --vcpu->kvm->stat.lpages; + drop_spte(vcpu->kvm, sptep, + shadow_trap_nonpresent_pte); + need_flush = 1; + } else + __set_spte(sptep, shadow_trap_nonpresent_pte); + break; + } + + if (!is_shadow_present_pte(*sptep) || !sp->unsync_children) + break; + } + + if (need_flush) + kvm_flush_remote_tlbs(vcpu->kvm); + + atomic_inc(&vcpu->kvm->arch.invlpg_counter); + + spin_unlock(&vcpu->kvm->mmu_lock); + + if (pte_gpa == -1) + return; + + if (mmu_topup_memory_caches(vcpu)) + return; + kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0); +} + +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, + struct x86_exception *exception) +{ + struct guest_walker walker; + gpa_t gpa = UNMAPPED_GVA; + int r; + + r = FNAME(walk_addr)(&walker, vcpu, vaddr, access); + + if (r) { + gpa = gfn_to_gpa(walker.gfn); + gpa |= vaddr & ~PAGE_MASK; + } else if (exception) + *exception = walker.fault; + + return gpa; +} + +static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, + u32 access, + struct x86_exception *exception) +{ + struct guest_walker walker; + gpa_t gpa = UNMAPPED_GVA; + int r; + + r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access); + + if (r) { + gpa = gfn_to_gpa(walker.gfn); + gpa |= vaddr & ~PAGE_MASK; + } else if (exception) + *exception = walker.fault; + + return gpa; +} + +static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + int i, j, offset, r; + pt_element_t pt[256 / sizeof(pt_element_t)]; + gpa_t pte_gpa; + + if (sp->role.direct + || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { + nonpaging_prefetch_page(vcpu, sp); + return; + } + + pte_gpa = gfn_to_gpa(sp->gfn); + if (PTTYPE == 32) { + offset = sp->role.quadrant << PT64_LEVEL_BITS; + pte_gpa += offset * sizeof(pt_element_t); + } + + for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) { + r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt); + pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t); + for (j = 0; j < ARRAY_SIZE(pt); ++j) + if (r || is_present_gpte(pt[j])) + sp->spt[i+j] = shadow_trap_nonpresent_pte; + else + sp->spt[i+j] = shadow_notrap_nonpresent_pte; + } +} + +/* + * Using the cached information from sp->gfns is safe because: + * - The spte has a reference to the struct page, so the pfn for a given gfn + * can't change unless all sptes pointing to it are nuked first. + * + * Note: + * We should flush all tlbs if spte is dropped even though guest is + * responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page + * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't + * used by guest then tlbs are not flushed, so guest is allowed to access the + * freed pages. + * And we increase kvm->tlbs_dirty to delay tlbs flush in this case. + */ +static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + int i, offset, nr_present; + bool host_writable; + gpa_t first_pte_gpa; + + offset = nr_present = 0; + + /* direct kvm_mmu_page can not be unsync. */ + BUG_ON(sp->role.direct); + + if (PTTYPE == 32) + offset = sp->role.quadrant << PT64_LEVEL_BITS; + + first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); + + for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + unsigned pte_access; + pt_element_t gpte; + gpa_t pte_gpa; + gfn_t gfn; + + if (!is_shadow_present_pte(sp->spt[i])) + continue; + + pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); + + if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, + sizeof(pt_element_t))) + return -EINVAL; + + gfn = gpte_to_gfn(gpte); + + if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { + vcpu->kvm->tlbs_dirty++; + continue; + } + + if (gfn != sp->gfns[i]) { + drop_spte(vcpu->kvm, &sp->spt[i], + shadow_trap_nonpresent_pte); + vcpu->kvm->tlbs_dirty++; + continue; + } + + nr_present++; + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; + + set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, + is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, + spte_to_pfn(sp->spt[i]), true, false, + host_writable); + } + + return !nr_present; +} + +#undef pt_element_t +#undef guest_walker +#undef FNAME +#undef PT_BASE_ADDR_MASK +#undef PT_INDEX +#undef PT_LEVEL_MASK +#undef PT_LVL_ADDR_MASK +#undef PT_LVL_OFFSET_MASK +#undef PT_LEVEL_BITS +#undef PT_MAX_FULL_LEVELS +#undef gpte_to_gfn +#undef gpte_to_gfn_lvl +#undef CMPXCHG diff --git a/linux/x86/svm.c b/linux/x86/svm.c new file mode 100644 index 0000000..bd6a099 --- /dev/null +++ b/linux/x86/svm.c @@ -0,0 +1,3993 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +/* + * Kernel-based Virtual Machine driver for Linux + * + * AMD SVM support + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Yaniv Kamay <yaniv@qumranet.com> + * Avi Kivity <avi@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ +#include <linux/kvm_host.h> + +#include "irq.h" +#include "mmu.h" +#include "kvm_cache_regs.h" +#include "x86.h" + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/vmalloc.h> +#include <linux/highmem.h> +#include <linux/sched.h> +#include <linux/ftrace_event.h> +#include <linux/slab.h> + +#include <asm/tlbflush.h> +#include <asm/desc.h> +#include <asm/kvm_para.h> + +#include <asm/virtext.h> +#include "trace.h" + +#define __ex(x) __kvm_handle_fault_on_reboot(x) + +MODULE_INFO(version, "kvm-kmod-2.6.38-rc7"); +MODULE_AUTHOR("Qumranet"); +MODULE_LICENSE("GPL"); + +#define IOPM_ALLOC_ORDER 2 +#define MSRPM_ALLOC_ORDER 1 + +#define SEG_TYPE_LDT 2 +#define SEG_TYPE_BUSY_TSS16 3 + +#define SVM_FEATURE_NPT (1 << 0) +#define SVM_FEATURE_LBRV (1 << 1) +#define SVM_FEATURE_SVML (1 << 2) +#define SVM_FEATURE_NRIP (1 << 3) +#define SVM_FEATURE_TSC_RATE (1 << 4) +#define SVM_FEATURE_VMCB_CLEAN (1 << 5) +#define SVM_FEATURE_FLUSH_ASID (1 << 6) +#define SVM_FEATURE_DECODE_ASSIST (1 << 7) +#define SVM_FEATURE_PAUSE_FILTER (1 << 10) + +#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ +#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ +#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */ + +#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) + +static bool erratum_383_found __read_mostly; + +static const u32 host_save_user_msrs[] = { +#ifdef CONFIG_X86_64 + MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, + MSR_FS_BASE, +#endif + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, +}; + +#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) + +struct kvm_vcpu; + +struct nested_state { + struct vmcb *hsave; + u64 hsave_msr; + u64 vm_cr_msr; + u64 vmcb; + + /* These are the merged vectors */ + u32 *msrpm; + + /* gpa pointers to the real vectors */ + u64 vmcb_msrpm; + u64 vmcb_iopm; + + /* A VMEXIT is required but not yet emulated */ + bool exit_required; + + /* + * If we vmexit during an instruction emulation we need this to restore + * the l1 guest rip after the emulation + */ + unsigned long vmexit_rip; + unsigned long vmexit_rsp; + unsigned long vmexit_rax; + + /* cache for intercepts of the guest */ + u32 intercept_cr; + u32 intercept_dr; + u32 intercept_exceptions; + u64 intercept; + + /* Nested Paging related state */ + u64 nested_cr3; +}; + +#define MSRPM_OFFSETS 16 +static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; + +struct vcpu_svm { + struct kvm_vcpu vcpu; + struct vmcb *vmcb; + unsigned long vmcb_pa; + struct svm_cpu_data *svm_data; + uint64_t asid_generation; + uint64_t sysenter_esp; + uint64_t sysenter_eip; + + u64 next_rip; + + u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; + struct { + u16 fs; + u16 gs; + u16 ldt; + u64 gs_base; + } host; + + u32 *msrpm; + + struct nested_state nested; + + bool nmi_singlestep; + + unsigned int3_injected; + unsigned long int3_rip; + u32 apf_reason; +}; + +#define MSR_INVALID 0xffffffffU + +static struct svm_direct_access_msrs { + u32 index; /* Index of the MSR */ + bool always; /* True if intercept is always on */ +} direct_access_msrs[] = { + { .index = MSR_STAR, .always = true }, + { .index = MSR_IA32_SYSENTER_CS, .always = true }, +#ifdef CONFIG_X86_64 + { .index = MSR_GS_BASE, .always = true }, + { .index = MSR_FS_BASE, .always = true }, + { .index = MSR_KERNEL_GS_BASE, .always = true }, + { .index = MSR_LSTAR, .always = true }, + { .index = MSR_CSTAR, .always = true }, + { .index = MSR_SYSCALL_MASK, .always = true }, +#endif + { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, + { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, + { .index = MSR_IA32_LASTINTFROMIP, .always = false }, + { .index = MSR_IA32_LASTINTTOIP, .always = false }, + { .index = MSR_INVALID, .always = false }, +}; + +/* enable NPT for AMD64 and X86 with PAE */ +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +static bool npt_enabled = true; +#else +static bool npt_enabled; +#endif +static int npt = 1; + +module_param(npt, int, S_IRUGO); + +static int nested = 1; +module_param(nested, int, S_IRUGO); + +static void svm_flush_tlb(struct kvm_vcpu *vcpu); +static void svm_complete_interrupts(struct vcpu_svm *svm); + +static int nested_svm_exit_handled(struct vcpu_svm *svm); +static int nested_svm_intercept(struct vcpu_svm *svm); +static int nested_svm_vmexit(struct vcpu_svm *svm); +static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, + bool has_error_code, u32 error_code); + +enum { + VMCB_INTERCEPTS, /* Intercept vectors, TSC offset, + pause filter count */ + VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */ + VMCB_ASID, /* ASID */ + VMCB_INTR, /* int_ctl, int_vector */ + VMCB_NPT, /* npt_en, nCR3, gPAT */ + VMCB_CR, /* CR0, CR3, CR4, EFER */ + VMCB_DR, /* DR6, DR7 */ + VMCB_DT, /* GDT, IDT */ + VMCB_SEG, /* CS, DS, SS, ES, CPL */ + VMCB_CR2, /* CR2 only */ + VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */ + VMCB_DIRTY_MAX, +}; + +/* TPR and CR2 are always written before VMRUN */ +#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2)) + +static inline void mark_all_dirty(struct vmcb *vmcb) +{ + vmcb->control.clean = 0; +} + +static inline void mark_all_clean(struct vmcb *vmcb) +{ + vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1) + & ~VMCB_ALWAYS_DIRTY_MASK; +} + +static inline void mark_dirty(struct vmcb *vmcb, int bit) +{ + vmcb->control.clean &= ~(1 << bit); +} + +static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct vcpu_svm, vcpu); +} + +static void recalc_intercepts(struct vcpu_svm *svm) +{ + struct vmcb_control_area *c, *h; + struct nested_state *g; + + mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + + if (!is_guest_mode(&svm->vcpu)) + return; + + c = &svm->vmcb->control; + h = &svm->nested.hsave->control; + g = &svm->nested; + + c->intercept_cr = h->intercept_cr | g->intercept_cr; + c->intercept_dr = h->intercept_dr | g->intercept_dr; + c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions; + c->intercept = h->intercept | g->intercept; +} + +static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm) +{ + if (is_guest_mode(&svm->vcpu)) + return svm->nested.hsave; + else + return svm->vmcb; +} + +static inline void set_cr_intercept(struct vcpu_svm *svm, int bit) +{ + struct vmcb *vmcb = get_host_vmcb(svm); + + vmcb->control.intercept_cr |= (1U << bit); + + recalc_intercepts(svm); +} + +static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit) +{ + struct vmcb *vmcb = get_host_vmcb(svm); + + vmcb->control.intercept_cr &= ~(1U << bit); + + recalc_intercepts(svm); +} + +static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit) +{ + struct vmcb *vmcb = get_host_vmcb(svm); + + return vmcb->control.intercept_cr & (1U << bit); +} + +static inline void set_dr_intercept(struct vcpu_svm *svm, int bit) +{ + struct vmcb *vmcb = get_host_vmcb(svm); + + vmcb->control.intercept_dr |= (1U << bit); + + recalc_intercepts(svm); +} + +static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit) +{ + struct vmcb *vmcb = get_host_vmcb(svm); + + vmcb->control.intercept_dr &= ~(1U << bit); + + recalc_intercepts(svm); +} + +static inline void set_exception_intercept(struct vcpu_svm *svm, int bit) +{ + struct vmcb *vmcb = get_host_vmcb(svm); + + vmcb->control.intercept_exceptions |= (1U << bit); + + recalc_intercepts(svm); +} + +static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit) +{ + struct vmcb *vmcb = get_host_vmcb(svm); + + vmcb->control.intercept_exceptions &= ~(1U << bit); + + recalc_intercepts(svm); +} + +static inline void set_intercept(struct vcpu_svm *svm, int bit) +{ + struct vmcb *vmcb = get_host_vmcb(svm); + + vmcb->control.intercept |= (1ULL << bit); + + recalc_intercepts(svm); +} + +static inline void clr_intercept(struct vcpu_svm *svm, int bit) +{ + struct vmcb *vmcb = get_host_vmcb(svm); + + vmcb->control.intercept &= ~(1ULL << bit); + + recalc_intercepts(svm); +} + +static inline void enable_gif(struct vcpu_svm *svm) +{ + svm->vcpu.arch.hflags |= HF_GIF_MASK; +} + +static inline void disable_gif(struct vcpu_svm *svm) +{ + svm->vcpu.arch.hflags &= ~HF_GIF_MASK; +} + +static inline bool gif_set(struct vcpu_svm *svm) +{ + return !!(svm->vcpu.arch.hflags & HF_GIF_MASK); +} + +static unsigned long iopm_base; + +struct kvm_ldttss_desc { + u16 limit0; + u16 base0; + unsigned base1:8, type:5, dpl:2, p:1; + unsigned limit1:4, zero0:3, g:1, base2:8; + u32 base3; + u32 zero1; +} __attribute__((packed)); + +struct svm_cpu_data { + int cpu; + + u64 asid_generation; + u32 max_asid; + u32 next_asid; + struct kvm_ldttss_desc *tss_desc; + + struct page *save_area; +}; + +static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); +static uint32_t svm_features; + +struct svm_init_data { + int cpu; + int r; +}; + +static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; + +#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) +#define MSRS_RANGE_SIZE 2048 +#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) + +static u32 svm_msrpm_offset(u32 msr) +{ + u32 offset; + int i; + + for (i = 0; i < NUM_MSR_MAPS; i++) { + if (msr < msrpm_ranges[i] || + msr >= msrpm_ranges[i] + MSRS_IN_RANGE) + continue; + + offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */ + offset += (i * MSRS_RANGE_SIZE); /* add range offset */ + + /* Now we have the u8 offset - but need the u32 offset */ + return offset / 4; + } + + /* MSR not in any range */ + return MSR_INVALID; +} + +#define MAX_INST_SIZE 15 + +static inline void clgi(void) +{ + asm volatile (__ex(SVM_CLGI)); +} + +static inline void stgi(void) +{ + asm volatile (__ex(SVM_STGI)); +} + +static inline void invlpga(unsigned long addr, u32 asid) +{ + asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); +} + +static int get_npt_level(void) +{ +#ifdef CONFIG_X86_64 + return PT64_ROOT_LEVEL; +#else + return PT32E_ROOT_LEVEL; +#endif +} + +static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + vcpu->arch.efer = efer; + if (!npt_enabled && !(efer & EFER_LMA)) + efer &= ~EFER_LME; + + to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; + mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); +} + +static int is_external_interrupt(u32 info) +{ + info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; + return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); +} + +static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u32 ret = 0; + + if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) + ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS; + return ret & mask; +} + +static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (mask == 0) + svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; + else + svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK; + +} + +static void skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (svm->vmcb->control.next_rip != 0) + svm->next_rip = svm->vmcb->control.next_rip; + + if (!svm->next_rip) { + if (emulate_instruction(vcpu, EMULTYPE_SKIP) != + EMULATE_DONE) + printk(KERN_DEBUG "%s: NOP\n", __func__); + return; + } + if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) + printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n", + __func__, kvm_rip_read(vcpu), svm->next_rip); + + kvm_rip_write(vcpu, svm->next_rip); + svm_set_interrupt_shadow(vcpu, 0); +} + +static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, + bool has_error_code, u32 error_code, + bool reinject) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * If we are within a nested VM we'd better #VMEXIT and let the guest + * handle the exception + */ + if (!reinject && + nested_svm_check_exception(svm, nr, has_error_code, error_code)) + return; + + if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) { + unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); + + /* + * For guest debugging where we have to reinject #BP if some + * INT3 is guest-owned: + * Emulate nRIP by moving RIP forward. Will fail if injection + * raises a fault that is not intercepted. Still better than + * failing in all cases. + */ + skip_emulated_instruction(&svm->vcpu); + rip = kvm_rip_read(&svm->vcpu); + svm->int3_rip = rip + svm->vmcb->save.cs.base; + svm->int3_injected = rip - old_rip; + } + + svm->vmcb->control.event_inj = nr + | SVM_EVTINJ_VALID + | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0) + | SVM_EVTINJ_TYPE_EXEPT; + svm->vmcb->control.event_inj_err = error_code; +} + +static void svm_init_erratum_383(void) +{ + u32 low, high; + int err; + u64 val; + + if (!kvm_cpu_has_amd_erratum(kvm_amd_erratum_383)) + return; + + /* Use _safe variants to not break nested virtualization */ + val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err); + if (err) + return; + + val |= (1ULL << 47); + + low = lower_32_bits(val); + high = upper_32_bits(val); + + kvm_native_write_msr_safe(MSR_AMD64_DC_CFG, low, high); + + erratum_383_found = true; +} + +static int has_svm(void) +{ + const char *msg; + + if (!cpu_has_svm(&msg)) { + printk(KERN_INFO "has_svm: %s\n", msg); + return 0; + } + + return 1; +} + +static void svm_hardware_disable(void *garbage) +{ + cpu_svm_disable(); +} + +static int svm_hardware_enable(void *garbage) +{ + + struct svm_cpu_data *sd; + uint64_t efer; + struct kvm_desc_ptr gdt_descr; + struct kvm_desc_struct *gdt; + int me = raw_smp_processor_id(); + + rdmsrl(MSR_EFER, efer); + if (efer & EFER_SVME) + return -EBUSY; + + if (!has_svm()) { + printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n", + me); + return -EINVAL; + } + sd = per_cpu(svm_data, me); + + if (!sd) { + printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n", + me); + return -EINVAL; + } + + sd->asid_generation = 1; + sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; + sd->next_asid = sd->max_asid + 1; + + kvm_native_store_gdt(&gdt_descr); + gdt = (struct kvm_desc_struct *)gdt_descr.address; + sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); + + wrmsrl(MSR_EFER, efer | EFER_SVME); + + wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT); + + svm_init_erratum_383(); + + return 0; +} + +static void svm_cpu_uninit(int cpu) +{ + struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id()); + + if (!sd) + return; + + per_cpu(svm_data, raw_smp_processor_id()) = NULL; + __free_page(sd->save_area); + kfree(sd); +} + +static int svm_cpu_init(int cpu) +{ + struct svm_cpu_data *sd; + int r; + + sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); + if (!sd) + return -ENOMEM; + sd->cpu = cpu; + sd->save_area = alloc_page(GFP_KERNEL); + r = -ENOMEM; + if (!sd->save_area) + goto err_1; + + per_cpu(svm_data, cpu) = sd; + + return 0; + +err_1: + kfree(sd); + return r; + +} + +static bool valid_msr_intercept(u32 index) +{ + int i; + + for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) + if (direct_access_msrs[i].index == index) + return true; + + return false; +} + +static void set_msr_interception(u32 *msrpm, unsigned msr, + int read, int write) +{ + u8 bit_read, bit_write; + unsigned long tmp; + u32 offset; + + /* + * If this warning triggers extend the direct_access_msrs list at the + * beginning of the file + */ + WARN_ON(!valid_msr_intercept(msr)); + + offset = svm_msrpm_offset(msr); + bit_read = 2 * (msr & 0x0f); + bit_write = 2 * (msr & 0x0f) + 1; + tmp = msrpm[offset]; + + BUG_ON(offset == MSR_INVALID); + + read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp); + write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp); + + msrpm[offset] = tmp; +} + +static void svm_vcpu_init_msrpm(u32 *msrpm) +{ + int i; + + memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); + + for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { + if (!direct_access_msrs[i].always) + continue; + + set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1); + } +} + +static void add_msr_offset(u32 offset) +{ + int i; + + for (i = 0; i < MSRPM_OFFSETS; ++i) { + + /* Offset already in list? */ + if (msrpm_offsets[i] == offset) + return; + + /* Slot used by another offset? */ + if (msrpm_offsets[i] != MSR_INVALID) + continue; + + /* Add offset to list */ + msrpm_offsets[i] = offset; + + return; + } + + /* + * If this BUG triggers the msrpm_offsets table has an overflow. Just + * increase MSRPM_OFFSETS in this case. + */ + BUG(); +} + +static void init_msrpm_offsets(void) +{ + int i; + + memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets)); + + for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { + u32 offset; + + offset = svm_msrpm_offset(direct_access_msrs[i].index); + BUG_ON(offset == MSR_INVALID); + + add_msr_offset(offset); + } +} + +static void svm_enable_lbrv(struct vcpu_svm *svm) +{ + u32 *msrpm = svm->msrpm; + + svm->vmcb->control.lbr_ctl = 1; + set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); + set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); + set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); + set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1); +} + +static void svm_disable_lbrv(struct vcpu_svm *svm) +{ + u32 *msrpm = svm->msrpm; + + svm->vmcb->control.lbr_ctl = 0; + set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); + set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); + set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); + set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); +} + +static __init int svm_hardware_setup(void) +{ + int cpu; + struct page *iopm_pages; + void *iopm_va; + int r; + + iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER); + + if (!iopm_pages) + return -ENOMEM; + + iopm_va = page_address(iopm_pages); + memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); + iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; + + init_msrpm_offsets(); + + if (boot_cpu_has(X86_FEATURE_NX)) + kvm_enable_efer_bits(EFER_NX); + + if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) + kvm_enable_efer_bits(EFER_FFXSR); + + if (nested) { + printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); + kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); + } + + for_each_possible_cpu(cpu) { + r = svm_cpu_init(cpu); + if (r) + goto err; + } + + svm_features = cpuid_edx(SVM_CPUID_FUNC); + + if (!boot_cpu_has(X86_FEATURE_NPT)) + npt_enabled = false; + + if (npt_enabled && !npt) { + printk(KERN_INFO "kvm: Nested Paging disabled\n"); + npt_enabled = false; + } + + if (npt_enabled) { + printk(KERN_INFO "kvm: Nested Paging enabled\n"); + kvm_enable_tdp(); + } else + kvm_disable_tdp(); + + return 0; + +err: + __free_pages(iopm_pages, IOPM_ALLOC_ORDER); + iopm_base = 0; + return r; +} + +static __exit void svm_hardware_unsetup(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + svm_cpu_uninit(cpu); + + __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); + iopm_base = 0; +} + +static void init_seg(struct vmcb_seg *seg) +{ + seg->selector = 0; + seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK | + SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ + seg->limit = 0xffff; + seg->base = 0; +} + +static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) +{ + seg->selector = 0; + seg->attrib = SVM_SELECTOR_P_MASK | type; + seg->limit = 0xffff; + seg->base = 0; +} + +static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 g_tsc_offset = 0; + + if (is_guest_mode(vcpu)) { + g_tsc_offset = svm->vmcb->control.tsc_offset - + svm->nested.hsave->control.tsc_offset; + svm->nested.hsave->control.tsc_offset = offset; + } + + svm->vmcb->control.tsc_offset = offset + g_tsc_offset; + + mark_dirty(svm->vmcb, VMCB_INTERCEPTS); +} + +static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.tsc_offset += adjustment; + if (is_guest_mode(vcpu)) + svm->nested.hsave->control.tsc_offset += adjustment; + mark_dirty(svm->vmcb, VMCB_INTERCEPTS); +} + +static void init_vmcb(struct vcpu_svm *svm) +{ + struct vmcb_control_area *control = &svm->vmcb->control; + struct vmcb_save_area *save = &svm->vmcb->save; + + svm->vcpu.fpu_active = 1; + svm->vcpu.arch.hflags = 0; + + set_cr_intercept(svm, INTERCEPT_CR0_READ); + set_cr_intercept(svm, INTERCEPT_CR3_READ); + set_cr_intercept(svm, INTERCEPT_CR4_READ); + set_cr_intercept(svm, INTERCEPT_CR0_WRITE); + set_cr_intercept(svm, INTERCEPT_CR3_WRITE); + set_cr_intercept(svm, INTERCEPT_CR4_WRITE); + set_cr_intercept(svm, INTERCEPT_CR8_WRITE); + + set_dr_intercept(svm, INTERCEPT_DR0_READ); + set_dr_intercept(svm, INTERCEPT_DR1_READ); + set_dr_intercept(svm, INTERCEPT_DR2_READ); + set_dr_intercept(svm, INTERCEPT_DR3_READ); + set_dr_intercept(svm, INTERCEPT_DR4_READ); + set_dr_intercept(svm, INTERCEPT_DR5_READ); + set_dr_intercept(svm, INTERCEPT_DR6_READ); + set_dr_intercept(svm, INTERCEPT_DR7_READ); + + set_dr_intercept(svm, INTERCEPT_DR0_WRITE); + set_dr_intercept(svm, INTERCEPT_DR1_WRITE); + set_dr_intercept(svm, INTERCEPT_DR2_WRITE); + set_dr_intercept(svm, INTERCEPT_DR3_WRITE); + set_dr_intercept(svm, INTERCEPT_DR4_WRITE); + set_dr_intercept(svm, INTERCEPT_DR5_WRITE); + set_dr_intercept(svm, INTERCEPT_DR6_WRITE); + set_dr_intercept(svm, INTERCEPT_DR7_WRITE); + + set_exception_intercept(svm, PF_VECTOR); + set_exception_intercept(svm, UD_VECTOR); + set_exception_intercept(svm, MC_VECTOR); + + set_intercept(svm, INTERCEPT_INTR); + set_intercept(svm, INTERCEPT_NMI); + set_intercept(svm, INTERCEPT_SMI); + set_intercept(svm, INTERCEPT_SELECTIVE_CR0); + set_intercept(svm, INTERCEPT_CPUID); + set_intercept(svm, INTERCEPT_INVD); + set_intercept(svm, INTERCEPT_HLT); + set_intercept(svm, INTERCEPT_INVLPG); + set_intercept(svm, INTERCEPT_INVLPGA); + set_intercept(svm, INTERCEPT_IOIO_PROT); + set_intercept(svm, INTERCEPT_MSR_PROT); + set_intercept(svm, INTERCEPT_TASK_SWITCH); + set_intercept(svm, INTERCEPT_SHUTDOWN); + set_intercept(svm, INTERCEPT_VMRUN); + set_intercept(svm, INTERCEPT_VMMCALL); + set_intercept(svm, INTERCEPT_VMLOAD); + set_intercept(svm, INTERCEPT_VMSAVE); + set_intercept(svm, INTERCEPT_STGI); + set_intercept(svm, INTERCEPT_CLGI); + set_intercept(svm, INTERCEPT_SKINIT); + set_intercept(svm, INTERCEPT_WBINVD); + set_intercept(svm, INTERCEPT_MONITOR); + set_intercept(svm, INTERCEPT_MWAIT); + set_intercept(svm, INTERCEPT_XSETBV); + + control->iopm_base_pa = iopm_base; + control->msrpm_base_pa = __pa(svm->msrpm); + control->int_ctl = V_INTR_MASKING_MASK; + + init_seg(&save->es); + init_seg(&save->ss); + init_seg(&save->ds); + init_seg(&save->fs); + init_seg(&save->gs); + + save->cs.selector = 0xf000; + /* Executable/Readable Code Segment */ + save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK | + SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; + save->cs.limit = 0xffff; + /* + * cs.base should really be 0xffff0000, but vmx can't handle that, so + * be consistent with it. + * + * Replace when we have real mode working for vmx. + */ + save->cs.base = 0xf0000; + + save->gdtr.limit = 0xffff; + save->idtr.limit = 0xffff; + + init_sys_seg(&save->ldtr, SEG_TYPE_LDT); + init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); + + svm_set_efer(&svm->vcpu, 0); + save->dr6 = 0xffff0ff0; + save->dr7 = 0x400; + save->rflags = 2; + save->rip = 0x0000fff0; + svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; + + /* + * This is the guest-visible cr0 value. + * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. + */ + svm->vcpu.arch.cr0 = 0; + (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); + + save->cr4 = X86_CR4_PAE; + /* rdx = ?? */ + + if (npt_enabled) { + /* Setup VMCB for Nested Paging */ + control->nested_ctl = 1; + clr_intercept(svm, INTERCEPT_TASK_SWITCH); + clr_intercept(svm, INTERCEPT_INVLPG); + clr_exception_intercept(svm, PF_VECTOR); + clr_cr_intercept(svm, INTERCEPT_CR3_READ); + clr_cr_intercept(svm, INTERCEPT_CR3_WRITE); + save->g_pat = 0x0007040600070406ULL; + save->cr3 = 0; + save->cr4 = 0; + } + svm->asid_generation = 0; + + svm->nested.vmcb = 0; + svm->vcpu.arch.hflags = 0; + + if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { + control->pause_filter_count = 3000; + set_intercept(svm, INTERCEPT_PAUSE); + } + + mark_all_dirty(svm->vmcb); + + enable_gif(svm); +} + +static int svm_vcpu_reset(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + init_vmcb(svm); + + if (!kvm_vcpu_is_bsp(vcpu)) { + kvm_rip_write(vcpu, 0); + svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; + svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; + } + vcpu->arch.regs_avail = ~0; + vcpu->arch.regs_dirty = ~0; + + return 0; +} + +static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) +{ + struct vcpu_svm *svm; + struct page *page; + struct page *msrpm_pages; + struct page *hsave_page; + struct page *nested_msrpm_pages; + int err; + + svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + if (!svm) { + err = -ENOMEM; + goto out; + } + + err = kvm_vcpu_init(&svm->vcpu, kvm, id); + if (err) + goto free_svm; + + err = -ENOMEM; + page = alloc_page(GFP_KERNEL); + if (!page) + goto uninit; + + msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); + if (!msrpm_pages) + goto free_page1; + + nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); + if (!nested_msrpm_pages) + goto free_page2; + + hsave_page = alloc_page(GFP_KERNEL); + if (!hsave_page) + goto free_page3; + + svm->nested.hsave = page_address(hsave_page); + + svm->msrpm = page_address(msrpm_pages); + svm_vcpu_init_msrpm(svm->msrpm); + + svm->nested.msrpm = page_address(nested_msrpm_pages); + svm_vcpu_init_msrpm(svm->nested.msrpm); + + svm->vmcb = page_address(page); + clear_page(svm->vmcb); + svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; + svm->asid_generation = 0; + init_vmcb(svm); + kvm_write_tsc(&svm->vcpu, 0); + + err = fx_init(&svm->vcpu); + if (err) + goto free_page4; + + svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + if (kvm_vcpu_is_bsp(&svm->vcpu)) + svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; + + return &svm->vcpu; + +free_page4: + __free_page(hsave_page); +free_page3: + __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); +free_page2: + __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); +free_page1: + __free_page(page); +uninit: + kvm_vcpu_uninit(&svm->vcpu); +free_svm: + kmem_cache_free(kvm_vcpu_cache, svm); +out: + return ERR_PTR(err); +} + +static void svm_free_vcpu(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); + __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); + __free_page(virt_to_page(svm->nested.hsave)); + __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); + kvm_vcpu_uninit(vcpu); + kmem_cache_free(kvm_vcpu_cache, svm); +} + +static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + int i; + + if (unlikely(cpu != vcpu->cpu)) { + svm->asid_generation = 0; + mark_all_dirty(svm->vmcb); + } + +#ifdef CONFIG_X86_64 + rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base); +#endif + savesegment(fs, svm->host.fs); + savesegment(gs, svm->host.gs); + svm->host.ldt = kvm_read_ldt(); + + for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) + rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); +} + +static void svm_vcpu_put(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + int i; + + ++vcpu->stat.host_state_reload; + kvm_load_ldt(svm->host.ldt); +#ifdef CONFIG_X86_64 + loadsegment(fs, svm->host.fs); + wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); + load_gs_index(svm->host.gs); +#else + loadsegment(gs, svm->host.gs); +#endif + for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) + wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); +} + +static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) +{ + return to_svm(vcpu)->vmcb->save.rflags; +} + +static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + to_svm(vcpu)->vmcb->save.rflags = rflags; +} + +static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) +{ + switch (reg) { + case VCPU_EXREG_PDPTR: + BUG_ON(!npt_enabled); + load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); + break; + default: + BUG(); + } +} + +static void svm_set_vintr(struct vcpu_svm *svm) +{ + set_intercept(svm, INTERCEPT_VINTR); +} + +static void svm_clear_vintr(struct vcpu_svm *svm) +{ + clr_intercept(svm, INTERCEPT_VINTR); +} + +static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) +{ + struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; + + switch (seg) { + case VCPU_SREG_CS: return &save->cs; + case VCPU_SREG_DS: return &save->ds; + case VCPU_SREG_ES: return &save->es; + case VCPU_SREG_FS: return &save->fs; + case VCPU_SREG_GS: return &save->gs; + case VCPU_SREG_SS: return &save->ss; + case VCPU_SREG_TR: return &save->tr; + case VCPU_SREG_LDTR: return &save->ldtr; + } + BUG(); + return NULL; +} + +static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + struct vmcb_seg *s = svm_seg(vcpu, seg); + + return s->base; +} + +static void svm_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct vmcb_seg *s = svm_seg(vcpu, seg); + + var->base = s->base; + var->limit = s->limit; + var->selector = s->selector; + var->type = s->attrib & SVM_SELECTOR_TYPE_MASK; + var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1; + var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; + var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1; + var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1; + var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; + var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; + var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; + + /* + * AMD's VMCB does not have an explicit unusable field, so emulate it + * for cross vendor migration purposes by "not present" + */ + var->unusable = !var->present || (var->type == 0); + + switch (seg) { + case VCPU_SREG_CS: + /* + * SVM always stores 0 for the 'G' bit in the CS selector in + * the VMCB on a VMEXIT. This hurts cross-vendor migration: + * Intel's VMENTRY has a check on the 'G' bit. + */ + var->g = s->limit > 0xfffff; + break; + case VCPU_SREG_TR: + /* + * Work around a bug where the busy flag in the tr selector + * isn't exposed + */ + var->type |= 0x2; + break; + case VCPU_SREG_DS: + case VCPU_SREG_ES: + case VCPU_SREG_FS: + case VCPU_SREG_GS: + /* + * The accessed bit must always be set in the segment + * descriptor cache, although it can be cleared in the + * descriptor, the cached bit always remains at 1. Since + * Intel has a check on this, set it here to support + * cross-vendor migration. + */ + if (!var->unusable) + var->type |= 0x1; + break; + case VCPU_SREG_SS: + /* + * On AMD CPUs sometimes the DB bit in the segment + * descriptor is left as 1, although the whole segment has + * been made unusable. Clear it here to pass an Intel VMX + * entry check when cross vendor migrating. + */ + if (var->unusable) + var->db = 0; + break; + } +} + +static int svm_get_cpl(struct kvm_vcpu *vcpu) +{ + struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; + + return save->cpl; +} + +static void svm_get_idt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + dt->size = svm->vmcb->save.idtr.limit; + dt->address = svm->vmcb->save.idtr.base; +} + +static void svm_set_idt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->save.idtr.limit = dt->size; + svm->vmcb->save.idtr.base = dt->address ; + mark_dirty(svm->vmcb, VMCB_DT); +} + +static void svm_get_gdt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + dt->size = svm->vmcb->save.gdtr.limit; + dt->address = svm->vmcb->save.gdtr.base; +} + +static void svm_set_gdt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->save.gdtr.limit = dt->size; + svm->vmcb->save.gdtr.base = dt->address ; + mark_dirty(svm->vmcb, VMCB_DT); +} + +static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) +{ +} + +static void svm_decache_cr3(struct kvm_vcpu *vcpu) +{ +} + +static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) +{ +} + +static void update_cr0_intercept(struct vcpu_svm *svm) +{ + ulong gcr0 = svm->vcpu.arch.cr0; + u64 *hcr0 = &svm->vmcb->save.cr0; + + if (!svm->vcpu.fpu_active) + *hcr0 |= SVM_CR0_SELECTIVE_MASK; + else + *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) + | (gcr0 & SVM_CR0_SELECTIVE_MASK); + + mark_dirty(svm->vmcb, VMCB_CR); + + if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { + clr_cr_intercept(svm, INTERCEPT_CR0_READ); + clr_cr_intercept(svm, INTERCEPT_CR0_WRITE); + } else { + set_cr_intercept(svm, INTERCEPT_CR0_READ); + set_cr_intercept(svm, INTERCEPT_CR0_WRITE); + } +} + +static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (is_guest_mode(vcpu)) { + /* + * We are here because we run in nested mode, the host kvm + * intercepts cr0 writes but the l1 hypervisor does not. + * But the L1 hypervisor may intercept selective cr0 writes. + * This needs to be checked here. + */ + unsigned long old, new; + + /* Remove bits that would trigger a real cr0 write intercept */ + old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK; + new = cr0 & SVM_CR0_SELECTIVE_MASK; + + if (old == new) { + /* cr0 write with ts and mp unchanged */ + svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; + if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) { + svm->nested.vmexit_rip = kvm_rip_read(vcpu); + svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); + svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX); + return; + } + } + } + +#ifdef CONFIG_X86_64 + if (vcpu->arch.efer & EFER_LME) { + if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { + vcpu->arch.efer |= EFER_LMA; + svm->vmcb->save.efer |= EFER_LMA | EFER_LME; + } + + if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { + vcpu->arch.efer &= ~EFER_LMA; + svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); + } + } +#endif + vcpu->arch.cr0 = cr0; + + if (!npt_enabled) + cr0 |= X86_CR0_PG | X86_CR0_WP; + + if (!vcpu->fpu_active) + cr0 |= X86_CR0_TS; + /* + * re-enable caching here because the QEMU bios + * does not do it - this results in some delay at + * reboot + */ + cr0 &= ~(X86_CR0_CD | X86_CR0_NW); + svm->vmcb->save.cr0 = cr0; + mark_dirty(svm->vmcb, VMCB_CR); + update_cr0_intercept(svm); +} + +static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; + unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; + + if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) + svm_flush_tlb(vcpu); + + vcpu->arch.cr4 = cr4; + if (!npt_enabled) + cr4 |= X86_CR4_PAE; + cr4 |= host_cr4_mce; + to_svm(vcpu)->vmcb->save.cr4 = cr4; + mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); +} + +static void svm_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_seg *s = svm_seg(vcpu, seg); + + s->base = var->base; + s->limit = var->limit; + s->selector = var->selector; + if (var->unusable) + s->attrib = 0; + else { + s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK); + s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT; + s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT; + s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT; + s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT; + s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT; + s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; + s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; + } + if (seg == VCPU_SREG_CS) + svm->vmcb->save.cpl + = (svm->vmcb->save.cs.attrib + >> SVM_SELECTOR_DPL_SHIFT) & 3; + + mark_dirty(svm->vmcb, VMCB_SEG); +} + +static void update_db_intercept(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + clr_exception_intercept(svm, DB_VECTOR); + clr_exception_intercept(svm, BP_VECTOR); + + if (svm->nmi_singlestep) + set_exception_intercept(svm, DB_VECTOR); + + if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { + if (vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + set_exception_intercept(svm, DB_VECTOR); + if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + set_exception_intercept(svm, BP_VECTOR); + } else + vcpu->guest_debug = 0; +} + +static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; + else + svm->vmcb->save.dr7 = vcpu->arch.dr7; + + mark_dirty(svm->vmcb, VMCB_DR); + + update_db_intercept(vcpu); +} + +static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) +{ + if (sd->next_asid > sd->max_asid) { + ++sd->asid_generation; + sd->next_asid = 1; + svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; + } + + svm->asid_generation = sd->asid_generation; + svm->vmcb->control.asid = sd->next_asid++; + + mark_dirty(svm->vmcb, VMCB_ASID); +} + +static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->save.dr7 = value; + mark_dirty(svm->vmcb, VMCB_DR); +} + +static int pf_interception(struct vcpu_svm *svm) +{ + u64 fault_address = svm->vmcb->control.exit_info_2; + u32 error_code; + int r = 1; + + switch (svm->apf_reason) { + default: + error_code = svm->vmcb->control.exit_info_1; + + trace_kvm_page_fault(fault_address, error_code); + if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) + kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); + r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, + svm->vmcb->control.insn_bytes, + svm->vmcb->control.insn_len); + break; + case KVM_PV_REASON_PAGE_NOT_PRESENT: + svm->apf_reason = 0; + local_irq_disable(); + kvm_async_pf_task_wait(fault_address); + local_irq_enable(); + break; + case KVM_PV_REASON_PAGE_READY: + svm->apf_reason = 0; + local_irq_disable(); + kvm_async_pf_task_wake(fault_address); + local_irq_enable(); + break; + } + return r; +} + +static int db_interception(struct vcpu_svm *svm) +{ + struct kvm_run *kvm_run = svm->vcpu.run; + + if (!(svm->vcpu.guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && + !svm->nmi_singlestep) { + kvm_queue_exception(&svm->vcpu, DB_VECTOR); + return 1; + } + + if (svm->nmi_singlestep) { + svm->nmi_singlestep = false; + if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) + svm->vmcb->save.rflags &= + ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + update_db_intercept(&svm->vcpu); + } + + if (svm->vcpu.guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) { + kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->debug.arch.pc = + svm->vmcb->save.cs.base + svm->vmcb->save.rip; + kvm_run->debug.arch.exception = DB_VECTOR; + return 0; + } + + return 1; +} + +static int bp_interception(struct vcpu_svm *svm) +{ + struct kvm_run *kvm_run = svm->vcpu.run; + + kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; + kvm_run->debug.arch.exception = BP_VECTOR; + return 0; +} + +static int ud_interception(struct vcpu_svm *svm) +{ + int er; + + er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD); + if (er != EMULATE_DONE) + kvm_queue_exception(&svm->vcpu, UD_VECTOR); + return 1; +} + +static void svm_fpu_activate(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + clr_exception_intercept(svm, NM_VECTOR); + + svm->vcpu.fpu_active = 1; + update_cr0_intercept(svm); +} + +static int nm_interception(struct vcpu_svm *svm) +{ + svm_fpu_activate(&svm->vcpu); + return 1; +} + +static bool is_erratum_383(void) +{ + int err, i; + u64 value; + + if (!erratum_383_found) + return false; + + value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err); + if (err) + return false; + + /* Bit 62 may or may not be set for this mce */ + value &= ~(1ULL << 62); + + if (value != 0xb600000000010015ULL) + return false; + + /* Clear MCi_STATUS registers */ + for (i = 0; i < 6; ++i) + kvm_native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0); + + value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err); + if (!err) { + u32 low, high; + + value &= ~(1ULL << 2); + low = lower_32_bits(value); + high = upper_32_bits(value); + + kvm_native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high); + } + + /* Flush tlb to evict multi-match entries */ + __flush_tlb_all(); + + return true; +} + +static void svm_handle_mce(struct vcpu_svm *svm) +{ + if (is_erratum_383()) { + /* + * Erratum 383 triggered. Guest state is corrupt so kill the + * guest. + */ + pr_err("KVM: Guest triggered AMD Erratum 383\n"); + + kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu); + + return; + } + + /* + * On an #MC intercept the MCE handler is not called automatically in + * the host. So do it by hand here. + */ + asm volatile ( + "int $0x12\n"); + /* not sure if we ever come back to this point */ + + return; +} + +static int mc_interception(struct vcpu_svm *svm) +{ + return 1; +} + +static int shutdown_interception(struct vcpu_svm *svm) +{ + struct kvm_run *kvm_run = svm->vcpu.run; + + /* + * VMCB is undefined after a SHUTDOWN intercept + * so reinitialize it. + */ + clear_page(svm->vmcb); + init_vmcb(svm); + + kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; + return 0; +} + +static int io_interception(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ + int size, in, string; + unsigned port; + + ++svm->vcpu.stat.io_exits; + string = (io_info & SVM_IOIO_STR_MASK) != 0; + in = (io_info & SVM_IOIO_TYPE_MASK) != 0; + if (string || in) + return emulate_instruction(vcpu, 0) == EMULATE_DONE; + + port = io_info >> 16; + size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; + svm->next_rip = svm->vmcb->control.exit_info_2; + skip_emulated_instruction(&svm->vcpu); + + return kvm_fast_pio_out(vcpu, size, port); +} + +static int nmi_interception(struct vcpu_svm *svm) +{ + return 1; +} + +static int intr_interception(struct vcpu_svm *svm) +{ + ++svm->vcpu.stat.irq_exits; + return 1; +} + +static int nop_on_interception(struct vcpu_svm *svm) +{ + return 1; +} + +static int halt_interception(struct vcpu_svm *svm) +{ + svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; + skip_emulated_instruction(&svm->vcpu); + return kvm_emulate_halt(&svm->vcpu); +} + +static int vmmcall_interception(struct vcpu_svm *svm) +{ + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + kvm_emulate_hypercall(&svm->vcpu); + return 1; +} + +static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + return svm->nested.nested_cr3; +} + +static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, + unsigned long root) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.nested_cr3 = root; + mark_dirty(svm->vmcb, VMCB_NPT); + svm_flush_tlb(vcpu); +} + +static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, + struct x86_exception *fault) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.exit_code = SVM_EXIT_NPF; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = fault->error_code; + svm->vmcb->control.exit_info_2 = fault->address; + + nested_svm_vmexit(svm); +} + +static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) +{ + int r; + + r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); + + vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; + vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; + vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; + vcpu->arch.mmu.shadow_root_level = get_npt_level(); + vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; + + return r; +} + +static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) +{ + vcpu->arch.walk_mmu = &vcpu->arch.mmu; +} + +static int nested_svm_check_permissions(struct vcpu_svm *svm) +{ + if (!(svm->vcpu.arch.efer & EFER_SVME) + || !is_paging(&svm->vcpu)) { + kvm_queue_exception(&svm->vcpu, UD_VECTOR); + return 1; + } + + if (svm->vmcb->save.cpl) { + kvm_inject_gp(&svm->vcpu, 0); + return 1; + } + + return 0; +} + +static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, + bool has_error_code, u32 error_code) +{ + int vmexit; + + if (!is_guest_mode(&svm->vcpu)) + return 0; + + svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = error_code; + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; + + vmexit = nested_svm_intercept(svm); + if (vmexit == NESTED_EXIT_DONE) + svm->nested.exit_required = true; + + return vmexit; +} + +/* This function returns true if it is save to enable the irq window */ +static inline bool nested_svm_intr(struct vcpu_svm *svm) +{ + if (!is_guest_mode(&svm->vcpu)) + return true; + + if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) + return true; + + if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) + return false; + + /* + * if vmexit was already requested (by intercepted exception + * for instance) do not overwrite it with "external interrupt" + * vmexit. + */ + if (svm->nested.exit_required) + return false; + + svm->vmcb->control.exit_code = SVM_EXIT_INTR; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + + if (svm->nested.intercept & 1ULL) { + /* + * The #vmexit can't be emulated here directly because this + * code path runs with irqs and preemtion disabled. A + * #vmexit emulation might sleep. Only signal request for + * the #vmexit here. + */ + svm->nested.exit_required = true; + trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); + return false; + } + + return true; +} + +/* This function returns true if it is save to enable the nmi window */ +static inline bool nested_svm_nmi(struct vcpu_svm *svm) +{ + if (!is_guest_mode(&svm->vcpu)) + return true; + + if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) + return true; + + svm->vmcb->control.exit_code = SVM_EXIT_NMI; + svm->nested.exit_required = true; + + return false; +} + +static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page) +{ + struct page *page; + + might_sleep(); + + page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); + if (is_error_page(page)) + goto error; + + *_page = page; + + return kmap(page); + +error: + kvm_release_page_clean(page); + kvm_inject_gp(&svm->vcpu, 0); + + return NULL; +} + +static void nested_svm_unmap(struct page *page) +{ + kunmap(page); + kvm_release_page_dirty(page); +} + +static int nested_svm_intercept_ioio(struct vcpu_svm *svm) +{ + unsigned port; + u8 val, bit; + u64 gpa; + + if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT))) + return NESTED_EXIT_HOST; + + port = svm->vmcb->control.exit_info_1 >> 16; + gpa = svm->nested.vmcb_iopm + (port / 8); + bit = port % 8; + val = 0; + + if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1)) + val &= (1 << bit); + + return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; +} + +static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) +{ + u32 offset, msr, value; + int write, mask; + + if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) + return NESTED_EXIT_HOST; + + msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; + offset = svm_msrpm_offset(msr); + write = svm->vmcb->control.exit_info_1 & 1; + mask = 1 << ((2 * (msr & 0xf)) + write); + + if (offset == MSR_INVALID) + return NESTED_EXIT_DONE; + + /* Offset is in 32 bit units but need in 8 bit units */ + offset *= 4; + + if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4)) + return NESTED_EXIT_DONE; + + return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; +} + +static int nested_svm_exit_special(struct vcpu_svm *svm) +{ + u32 exit_code = svm->vmcb->control.exit_code; + + switch (exit_code) { + case SVM_EXIT_INTR: + case SVM_EXIT_NMI: + case SVM_EXIT_EXCP_BASE + MC_VECTOR: + return NESTED_EXIT_HOST; + case SVM_EXIT_NPF: + /* For now we are always handling NPFs when using them */ + if (npt_enabled) + return NESTED_EXIT_HOST; + break; + case SVM_EXIT_EXCP_BASE + PF_VECTOR: + /* When we're shadowing, trap PFs, but not async PF */ + if (!npt_enabled && svm->apf_reason == 0) + return NESTED_EXIT_HOST; + break; + case SVM_EXIT_EXCP_BASE + NM_VECTOR: + nm_interception(svm); + break; + default: + break; + } + + return NESTED_EXIT_CONTINUE; +} + +/* + * If this function returns true, this #vmexit was already handled + */ +static int nested_svm_intercept(struct vcpu_svm *svm) +{ + u32 exit_code = svm->vmcb->control.exit_code; + int vmexit = NESTED_EXIT_HOST; + + switch (exit_code) { + case SVM_EXIT_MSR: + vmexit = nested_svm_exit_handled_msr(svm); + break; + case SVM_EXIT_IOIO: + vmexit = nested_svm_intercept_ioio(svm); + break; + case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: { + u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0); + if (svm->nested.intercept_cr & bit) + vmexit = NESTED_EXIT_DONE; + break; + } + case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { + u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0); + if (svm->nested.intercept_dr & bit) + vmexit = NESTED_EXIT_DONE; + break; + } + case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { + u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); + if (svm->nested.intercept_exceptions & excp_bits) + vmexit = NESTED_EXIT_DONE; + /* async page fault always cause vmexit */ + else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && + svm->apf_reason != 0) + vmexit = NESTED_EXIT_DONE; + break; + } + case SVM_EXIT_ERR: { + vmexit = NESTED_EXIT_DONE; + break; + } + default: { + u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); + if (svm->nested.intercept & exit_bits) + vmexit = NESTED_EXIT_DONE; + } + } + + return vmexit; +} + +static int nested_svm_exit_handled(struct vcpu_svm *svm) +{ + int vmexit; + + vmexit = nested_svm_intercept(svm); + + if (vmexit == NESTED_EXIT_DONE) + nested_svm_vmexit(svm); + + return vmexit; +} + +static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb) +{ + struct vmcb_control_area *dst = &dst_vmcb->control; + struct vmcb_control_area *from = &from_vmcb->control; + + dst->intercept_cr = from->intercept_cr; + dst->intercept_dr = from->intercept_dr; + dst->intercept_exceptions = from->intercept_exceptions; + dst->intercept = from->intercept; + dst->iopm_base_pa = from->iopm_base_pa; + dst->msrpm_base_pa = from->msrpm_base_pa; + dst->tsc_offset = from->tsc_offset; + dst->asid = from->asid; + dst->tlb_ctl = from->tlb_ctl; + dst->int_ctl = from->int_ctl; + dst->int_vector = from->int_vector; + dst->int_state = from->int_state; + dst->exit_code = from->exit_code; + dst->exit_code_hi = from->exit_code_hi; + dst->exit_info_1 = from->exit_info_1; + dst->exit_info_2 = from->exit_info_2; + dst->exit_int_info = from->exit_int_info; + dst->exit_int_info_err = from->exit_int_info_err; + dst->nested_ctl = from->nested_ctl; + dst->event_inj = from->event_inj; + dst->event_inj_err = from->event_inj_err; + dst->nested_cr3 = from->nested_cr3; + dst->lbr_ctl = from->lbr_ctl; +} + +static int nested_svm_vmexit(struct vcpu_svm *svm) +{ + struct vmcb *nested_vmcb; + struct vmcb *hsave = svm->nested.hsave; + struct vmcb *vmcb = svm->vmcb; + struct page *page; + + trace_kvm_nested_vmexit_inject(vmcb->control.exit_code, + vmcb->control.exit_info_1, + vmcb->control.exit_info_2, + vmcb->control.exit_int_info, + vmcb->control.exit_int_info_err); + + nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page); + if (!nested_vmcb) + return 1; + + /* Exit Guest-Mode */ + leave_guest_mode(&svm->vcpu); + svm->nested.vmcb = 0; + + /* Give the current vmcb to the guest */ + disable_gif(svm); + + nested_vmcb->save.es = vmcb->save.es; + nested_vmcb->save.cs = vmcb->save.cs; + nested_vmcb->save.ss = vmcb->save.ss; + nested_vmcb->save.ds = vmcb->save.ds; + nested_vmcb->save.gdtr = vmcb->save.gdtr; + nested_vmcb->save.idtr = vmcb->save.idtr; + nested_vmcb->save.efer = svm->vcpu.arch.efer; + nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); + nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu); + nested_vmcb->save.cr2 = vmcb->save.cr2; + nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; + nested_vmcb->save.rflags = vmcb->save.rflags; + nested_vmcb->save.rip = vmcb->save.rip; + nested_vmcb->save.rsp = vmcb->save.rsp; + nested_vmcb->save.rax = vmcb->save.rax; + nested_vmcb->save.dr7 = vmcb->save.dr7; + nested_vmcb->save.dr6 = vmcb->save.dr6; + nested_vmcb->save.cpl = vmcb->save.cpl; + + nested_vmcb->control.int_ctl = vmcb->control.int_ctl; + nested_vmcb->control.int_vector = vmcb->control.int_vector; + nested_vmcb->control.int_state = vmcb->control.int_state; + nested_vmcb->control.exit_code = vmcb->control.exit_code; + nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi; + nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1; + nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; + nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; + nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; + nested_vmcb->control.next_rip = vmcb->control.next_rip; + + /* + * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have + * to make sure that we do not lose injected events. So check event_inj + * here and copy it to exit_int_info if it is valid. + * Exit_int_info and event_inj can't be both valid because the case + * below only happens on a VMRUN instruction intercept which has + * no valid exit_int_info set. + */ + if (vmcb->control.event_inj & SVM_EVTINJ_VALID) { + struct vmcb_control_area *nc = &nested_vmcb->control; + + nc->exit_int_info = vmcb->control.event_inj; + nc->exit_int_info_err = vmcb->control.event_inj_err; + } + + nested_vmcb->control.tlb_ctl = 0; + nested_vmcb->control.event_inj = 0; + nested_vmcb->control.event_inj_err = 0; + + /* We always set V_INTR_MASKING and remember the old value in hflags */ + if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) + nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; + + /* Restore the original control entries */ + copy_vmcb_control_area(vmcb, hsave); + + kvm_clear_exception_queue(&svm->vcpu); + kvm_clear_interrupt_queue(&svm->vcpu); + + svm->nested.nested_cr3 = 0; + + /* Restore selected save entries */ + svm->vmcb->save.es = hsave->save.es; + svm->vmcb->save.cs = hsave->save.cs; + svm->vmcb->save.ss = hsave->save.ss; + svm->vmcb->save.ds = hsave->save.ds; + svm->vmcb->save.gdtr = hsave->save.gdtr; + svm->vmcb->save.idtr = hsave->save.idtr; + svm->vmcb->save.rflags = hsave->save.rflags; + svm_set_efer(&svm->vcpu, hsave->save.efer); + svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); + svm_set_cr4(&svm->vcpu, hsave->save.cr4); + if (npt_enabled) { + svm->vmcb->save.cr3 = hsave->save.cr3; + svm->vcpu.arch.cr3 = hsave->save.cr3; + } else { + (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3); + } + kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax); + kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp); + kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip); + svm->vmcb->save.dr7 = 0; + svm->vmcb->save.cpl = 0; + svm->vmcb->control.exit_int_info = 0; + + mark_all_dirty(svm->vmcb); + + nested_svm_unmap(page); + + nested_svm_uninit_mmu_context(&svm->vcpu); + kvm_mmu_reset_context(&svm->vcpu); + kvm_mmu_load(&svm->vcpu); + + return 0; +} + +static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) +{ + /* + * This function merges the msr permission bitmaps of kvm and the + * nested vmcb. It is omptimized in that it only merges the parts where + * the kvm msr permission bitmap may contain zero bits + */ + int i; + + if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) + return true; + + for (i = 0; i < MSRPM_OFFSETS; i++) { + u32 value, p; + u64 offset; + + if (msrpm_offsets[i] == 0xffffffff) + break; + + p = msrpm_offsets[i]; + offset = svm->nested.vmcb_msrpm + (p * 4); + + if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4)) + return false; + + svm->nested.msrpm[p] = svm->msrpm[p] | value; + } + + svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); + + return true; +} + +static bool nested_vmcb_checks(struct vmcb *vmcb) +{ + if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0) + return false; + + if (vmcb->control.asid == 0) + return false; + + if (vmcb->control.nested_ctl && !npt_enabled) + return false; + + return true; +} + +static bool nested_svm_vmrun(struct vcpu_svm *svm) +{ + struct vmcb *nested_vmcb; + struct vmcb *hsave = svm->nested.hsave; + struct vmcb *vmcb = svm->vmcb; + struct page *page; + u64 vmcb_gpa; + + vmcb_gpa = svm->vmcb->save.rax; + + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); + if (!nested_vmcb) + return false; + + if (!nested_vmcb_checks(nested_vmcb)) { + nested_vmcb->control.exit_code = SVM_EXIT_ERR; + nested_vmcb->control.exit_code_hi = 0; + nested_vmcb->control.exit_info_1 = 0; + nested_vmcb->control.exit_info_2 = 0; + + nested_svm_unmap(page); + + return false; + } + + trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, + nested_vmcb->save.rip, + nested_vmcb->control.int_ctl, + nested_vmcb->control.event_inj, + nested_vmcb->control.nested_ctl); + + trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff, + nested_vmcb->control.intercept_cr >> 16, + nested_vmcb->control.intercept_exceptions, + nested_vmcb->control.intercept); + + /* Clear internal status */ + kvm_clear_exception_queue(&svm->vcpu); + kvm_clear_interrupt_queue(&svm->vcpu); + + /* + * Save the old vmcb, so we don't need to pick what we save, but can + * restore everything when a VMEXIT occurs + */ + hsave->save.es = vmcb->save.es; + hsave->save.cs = vmcb->save.cs; + hsave->save.ss = vmcb->save.ss; + hsave->save.ds = vmcb->save.ds; + hsave->save.gdtr = vmcb->save.gdtr; + hsave->save.idtr = vmcb->save.idtr; + hsave->save.efer = svm->vcpu.arch.efer; + hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); + hsave->save.cr4 = svm->vcpu.arch.cr4; + hsave->save.rflags = vmcb->save.rflags; + hsave->save.rip = kvm_rip_read(&svm->vcpu); + hsave->save.rsp = vmcb->save.rsp; + hsave->save.rax = vmcb->save.rax; + if (npt_enabled) + hsave->save.cr3 = vmcb->save.cr3; + else + hsave->save.cr3 = kvm_read_cr3(&svm->vcpu); + + copy_vmcb_control_area(hsave, vmcb); + + if (svm->vmcb->save.rflags & X86_EFLAGS_IF) + svm->vcpu.arch.hflags |= HF_HIF_MASK; + else + svm->vcpu.arch.hflags &= ~HF_HIF_MASK; + + if (nested_vmcb->control.nested_ctl) { + kvm_mmu_unload(&svm->vcpu); + svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3; + nested_svm_init_mmu_context(&svm->vcpu); + } + + /* Load the nested guest state */ + svm->vmcb->save.es = nested_vmcb->save.es; + svm->vmcb->save.cs = nested_vmcb->save.cs; + svm->vmcb->save.ss = nested_vmcb->save.ss; + svm->vmcb->save.ds = nested_vmcb->save.ds; + svm->vmcb->save.gdtr = nested_vmcb->save.gdtr; + svm->vmcb->save.idtr = nested_vmcb->save.idtr; + svm->vmcb->save.rflags = nested_vmcb->save.rflags; + svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); + svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); + svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); + if (npt_enabled) { + svm->vmcb->save.cr3 = nested_vmcb->save.cr3; + svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; + } else + (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); + + /* Guest paging mode is active - reset mmu */ + kvm_mmu_reset_context(&svm->vcpu); + + svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; + kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); + kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); + kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); + + /* In case we don't even reach vcpu_run, the fields are not updated */ + svm->vmcb->save.rax = nested_vmcb->save.rax; + svm->vmcb->save.rsp = nested_vmcb->save.rsp; + svm->vmcb->save.rip = nested_vmcb->save.rip; + svm->vmcb->save.dr7 = nested_vmcb->save.dr7; + svm->vmcb->save.dr6 = nested_vmcb->save.dr6; + svm->vmcb->save.cpl = nested_vmcb->save.cpl; + + svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL; + svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL; + + /* cache intercepts */ + svm->nested.intercept_cr = nested_vmcb->control.intercept_cr; + svm->nested.intercept_dr = nested_vmcb->control.intercept_dr; + svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; + svm->nested.intercept = nested_vmcb->control.intercept; + + svm_flush_tlb(&svm->vcpu); + svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; + if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) + svm->vcpu.arch.hflags |= HF_VINTR_MASK; + else + svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; + + if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { + /* We only want the cr8 intercept bits of the guest */ + clr_cr_intercept(svm, INTERCEPT_CR8_READ); + clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); + } + + /* We don't want to see VMMCALLs from a nested guest */ + clr_intercept(svm, INTERCEPT_VMMCALL); + + svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl; + svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; + svm->vmcb->control.int_state = nested_vmcb->control.int_state; + svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; + svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; + svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; + + nested_svm_unmap(page); + + /* Enter Guest-Mode */ + enter_guest_mode(&svm->vcpu); + + /* + * Merge guest and host intercepts - must be called with vcpu in + * guest-mode to take affect here + */ + recalc_intercepts(svm); + + svm->nested.vmcb = vmcb_gpa; + + enable_gif(svm); + + mark_all_dirty(svm->vmcb); + + return true; +} + +static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) +{ + to_vmcb->save.fs = from_vmcb->save.fs; + to_vmcb->save.gs = from_vmcb->save.gs; + to_vmcb->save.tr = from_vmcb->save.tr; + to_vmcb->save.ldtr = from_vmcb->save.ldtr; + to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base; + to_vmcb->save.star = from_vmcb->save.star; + to_vmcb->save.lstar = from_vmcb->save.lstar; + to_vmcb->save.cstar = from_vmcb->save.cstar; + to_vmcb->save.sfmask = from_vmcb->save.sfmask; + to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs; + to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp; + to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; +} + +static int vmload_interception(struct vcpu_svm *svm) +{ + struct vmcb *nested_vmcb; + struct page *page; + + if (nested_svm_check_permissions(svm)) + return 1; + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); + if (!nested_vmcb) + return 1; + + nested_svm_vmloadsave(nested_vmcb, svm->vmcb); + nested_svm_unmap(page); + + return 1; +} + +static int vmsave_interception(struct vcpu_svm *svm) +{ + struct vmcb *nested_vmcb; + struct page *page; + + if (nested_svm_check_permissions(svm)) + return 1; + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); + if (!nested_vmcb) + return 1; + + nested_svm_vmloadsave(svm->vmcb, nested_vmcb); + nested_svm_unmap(page); + + return 1; +} + +static int vmrun_interception(struct vcpu_svm *svm) +{ + if (nested_svm_check_permissions(svm)) + return 1; + + /* Save rip after vmrun instruction */ + kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3); + + if (!nested_svm_vmrun(svm)) + return 1; + + if (!nested_svm_vmrun_msrpm(svm)) + goto failed; + + return 1; + +failed: + + svm->vmcb->control.exit_code = SVM_EXIT_ERR; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + + nested_svm_vmexit(svm); + + return 1; +} + +static int stgi_interception(struct vcpu_svm *svm) +{ + if (nested_svm_check_permissions(svm)) + return 1; + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + + enable_gif(svm); + + return 1; +} + +static int clgi_interception(struct vcpu_svm *svm) +{ + if (nested_svm_check_permissions(svm)) + return 1; + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + + disable_gif(svm); + + /* After a CLGI no interrupts should come */ + svm_clear_vintr(svm); + svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; + + mark_dirty(svm->vmcb, VMCB_INTR); + + return 1; +} + +static int invlpga_interception(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + + trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX], + vcpu->arch.regs[VCPU_REGS_RAX]); + + /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ + kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]); + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + return 1; +} + +static int skinit_interception(struct vcpu_svm *svm) +{ + trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]); + + kvm_queue_exception(&svm->vcpu, UD_VECTOR); + return 1; +} + +static int xsetbv_interception(struct vcpu_svm *svm) +{ + u64 new_bv = kvm_read_edx_eax(&svm->vcpu); + u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); + + if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) { + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + } + + return 1; +} + +static int invalid_op_interception(struct vcpu_svm *svm) +{ + kvm_queue_exception(&svm->vcpu, UD_VECTOR); + return 1; +} + +static int task_switch_interception(struct vcpu_svm *svm) +{ + u16 tss_selector; + int reason; + int int_type = svm->vmcb->control.exit_int_info & + SVM_EXITINTINFO_TYPE_MASK; + int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK; + uint32_t type = + svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK; + uint32_t idt_v = + svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID; + bool has_error_code = false; + u32 error_code = 0; + + tss_selector = (u16)svm->vmcb->control.exit_info_1; + + if (svm->vmcb->control.exit_info_2 & + (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET)) + reason = TASK_SWITCH_IRET; + else if (svm->vmcb->control.exit_info_2 & + (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) + reason = TASK_SWITCH_JMP; + else if (idt_v) + reason = TASK_SWITCH_GATE; + else + reason = TASK_SWITCH_CALL; + + if (reason == TASK_SWITCH_GATE) { + switch (type) { + case SVM_EXITINTINFO_TYPE_NMI: + svm->vcpu.arch.nmi_injected = false; + break; + case SVM_EXITINTINFO_TYPE_EXEPT: + if (svm->vmcb->control.exit_info_2 & + (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) { + has_error_code = true; + error_code = + (u32)svm->vmcb->control.exit_info_2; + } + kvm_clear_exception_queue(&svm->vcpu); + break; + case SVM_EXITINTINFO_TYPE_INTR: + kvm_clear_interrupt_queue(&svm->vcpu); + break; + default: + break; + } + } + + if (reason != TASK_SWITCH_GATE || + int_type == SVM_EXITINTINFO_TYPE_SOFT || + (int_type == SVM_EXITINTINFO_TYPE_EXEPT && + (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) + skip_emulated_instruction(&svm->vcpu); + + if (kvm_task_switch(&svm->vcpu, tss_selector, reason, + has_error_code, error_code) == EMULATE_FAIL) { + svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + svm->vcpu.run->internal.ndata = 0; + return 0; + } + return 1; +} + +static int cpuid_interception(struct vcpu_svm *svm) +{ + svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; + kvm_emulate_cpuid(&svm->vcpu); + return 1; +} + +static int iret_interception(struct vcpu_svm *svm) +{ + ++svm->vcpu.stat.nmi_window_exits; + clr_intercept(svm, INTERCEPT_IRET); + svm->vcpu.arch.hflags |= HF_IRET_MASK; + return 1; +} + +static int invlpg_interception(struct vcpu_svm *svm) +{ + if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) + return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; + + kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1); + skip_emulated_instruction(&svm->vcpu); + return 1; +} + +static int emulate_on_interception(struct vcpu_svm *svm) +{ + return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; +} + +#define CR_VALID (1ULL << 63) + +static int cr_interception(struct vcpu_svm *svm) +{ + int reg, cr; + unsigned long val; + int err; + + if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) + return emulate_on_interception(svm); + + if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0)) + return emulate_on_interception(svm); + + reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; + cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0; + + err = 0; + if (cr >= 16) { /* mov to cr */ + cr -= 16; + val = kvm_register_read(&svm->vcpu, reg); + switch (cr) { + case 0: + err = kvm_set_cr0(&svm->vcpu, val); + break; + case 3: + err = kvm_set_cr3(&svm->vcpu, val); + break; + case 4: + err = kvm_set_cr4(&svm->vcpu, val); + break; + case 8: + err = kvm_set_cr8(&svm->vcpu, val); + break; + default: + WARN(1, "unhandled write to CR%d", cr); + kvm_queue_exception(&svm->vcpu, UD_VECTOR); + return 1; + } + } else { /* mov from cr */ + switch (cr) { + case 0: + val = kvm_read_cr0(&svm->vcpu); + break; + case 2: + val = svm->vcpu.arch.cr2; + break; + case 3: + val = kvm_read_cr3(&svm->vcpu); + break; + case 4: + val = kvm_read_cr4(&svm->vcpu); + break; + case 8: + val = kvm_get_cr8(&svm->vcpu); + break; + default: + WARN(1, "unhandled read from CR%d", cr); + kvm_queue_exception(&svm->vcpu, UD_VECTOR); + return 1; + } + kvm_register_write(&svm->vcpu, reg, val); + } + kvm_complete_insn_gp(&svm->vcpu, err); + + return 1; +} + +static int cr0_write_interception(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + int r; + + r = cr_interception(svm); + + if (svm->nested.vmexit_rip) { + kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip); + kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp); + kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax); + svm->nested.vmexit_rip = 0; + } + + return r; +} + +static int dr_interception(struct vcpu_svm *svm) +{ + int reg, dr; + unsigned long val; + int err; + + if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) + return emulate_on_interception(svm); + + reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; + dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; + + if (dr >= 16) { /* mov to DRn */ + val = kvm_register_read(&svm->vcpu, reg); + kvm_set_dr(&svm->vcpu, dr - 16, val); + } else { + err = kvm_get_dr(&svm->vcpu, dr, &val); + if (!err) + kvm_register_write(&svm->vcpu, reg, val); + } + + skip_emulated_instruction(&svm->vcpu); + + return 1; +} + +static int cr8_write_interception(struct vcpu_svm *svm) +{ + struct kvm_run *kvm_run = svm->vcpu.run; + int r; + + u8 cr8_prev = kvm_get_cr8(&svm->vcpu); + /* instruction emulation calls kvm_set_cr8() */ + r = cr_interception(svm); + if (irqchip_in_kernel(svm->vcpu.kvm)) { + clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); + return r; + } + if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) + return r; + kvm_run->exit_reason = KVM_EXIT_SET_TPR; + return 0; +} + +static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + switch (ecx) { + case MSR_IA32_TSC: { + struct vmcb *vmcb = get_host_vmcb(svm); + + *data = vmcb->control.tsc_offset + kvm_native_read_tsc(); + break; + } + case MSR_STAR: + *data = svm->vmcb->save.star; + break; +#ifdef CONFIG_X86_64 + case MSR_LSTAR: + *data = svm->vmcb->save.lstar; + break; + case MSR_CSTAR: + *data = svm->vmcb->save.cstar; + break; + case MSR_KERNEL_GS_BASE: + *data = svm->vmcb->save.kernel_gs_base; + break; + case MSR_SYSCALL_MASK: + *data = svm->vmcb->save.sfmask; + break; +#endif + case MSR_IA32_SYSENTER_CS: + *data = svm->vmcb->save.sysenter_cs; + break; + case MSR_IA32_SYSENTER_EIP: + *data = svm->sysenter_eip; + break; + case MSR_IA32_SYSENTER_ESP: + *data = svm->sysenter_esp; + break; + /* + * Nobody will change the following 5 values in the VMCB so we can + * safely return them on rdmsr. They will always be 0 until LBRV is + * implemented. + */ + case MSR_IA32_DEBUGCTLMSR: + *data = svm->vmcb->save.dbgctl; + break; + case MSR_IA32_LASTBRANCHFROMIP: + *data = svm->vmcb->save.br_from; + break; + case MSR_IA32_LASTBRANCHTOIP: + *data = svm->vmcb->save.br_to; + break; + case MSR_IA32_LASTINTFROMIP: + *data = svm->vmcb->save.last_excp_from; + break; + case MSR_IA32_LASTINTTOIP: + *data = svm->vmcb->save.last_excp_to; + break; + case MSR_VM_HSAVE_PA: + *data = svm->nested.hsave_msr; + break; + case MSR_VM_CR: + *data = svm->nested.vm_cr_msr; + break; + case MSR_IA32_UCODE_REV: + *data = 0x01000065; + break; + default: + return kvm_get_msr_common(vcpu, ecx, data); + } + return 0; +} + +static int rdmsr_interception(struct vcpu_svm *svm) +{ + u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; + u64 data; + + if (svm_get_msr(&svm->vcpu, ecx, &data)) { + trace_kvm_msr_read_ex(ecx); + kvm_inject_gp(&svm->vcpu, 0); + } else { + trace_kvm_msr_read(ecx, data); + + svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; + svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; + skip_emulated_instruction(&svm->vcpu); + } + return 1; +} + +static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) +{ + struct vcpu_svm *svm = to_svm(vcpu); + int svm_dis, chg_mask; + + if (data & ~SVM_VM_CR_VALID_MASK) + return 1; + + chg_mask = SVM_VM_CR_VALID_MASK; + + if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK) + chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK); + + svm->nested.vm_cr_msr &= ~chg_mask; + svm->nested.vm_cr_msr |= (data & chg_mask); + + svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK; + + /* check for svm_disable while efer.svme is set */ + if (svm_dis && (vcpu->arch.efer & EFER_SVME)) + return 1; + + return 0; +} + +static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + switch (ecx) { + case MSR_IA32_TSC: + kvm_write_tsc(vcpu, data); + break; + case MSR_STAR: + svm->vmcb->save.star = data; + break; +#ifdef CONFIG_X86_64 + case MSR_LSTAR: + svm->vmcb->save.lstar = data; + break; + case MSR_CSTAR: + svm->vmcb->save.cstar = data; + break; + case MSR_KERNEL_GS_BASE: + svm->vmcb->save.kernel_gs_base = data; + break; + case MSR_SYSCALL_MASK: + svm->vmcb->save.sfmask = data; + break; +#endif + case MSR_IA32_SYSENTER_CS: + svm->vmcb->save.sysenter_cs = data; + break; + case MSR_IA32_SYSENTER_EIP: + svm->sysenter_eip = data; + svm->vmcb->save.sysenter_eip = data; + break; + case MSR_IA32_SYSENTER_ESP: + svm->sysenter_esp = data; + svm->vmcb->save.sysenter_esp = data; + break; + case MSR_IA32_DEBUGCTLMSR: + if (!boot_cpu_has(X86_FEATURE_LBRV)) { + pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", + __func__, data); + break; + } + if (data & DEBUGCTL_RESERVED_BITS) + return 1; + + svm->vmcb->save.dbgctl = data; + mark_dirty(svm->vmcb, VMCB_LBR); + if (data & (1ULL<<0)) + svm_enable_lbrv(svm); + else + svm_disable_lbrv(svm); + break; + case MSR_VM_HSAVE_PA: + svm->nested.hsave_msr = data; + break; + case MSR_VM_CR: + return svm_set_vm_cr(vcpu, data); + case MSR_VM_IGNNE: + pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); + break; + default: + return kvm_set_msr_common(vcpu, ecx, data); + } + return 0; +} + +static int wrmsr_interception(struct vcpu_svm *svm) +{ + u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; + u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) + | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); + + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; + if (svm_set_msr(&svm->vcpu, ecx, data)) { + trace_kvm_msr_write_ex(ecx, data); + kvm_inject_gp(&svm->vcpu, 0); + } else { + trace_kvm_msr_write(ecx, data); + skip_emulated_instruction(&svm->vcpu); + } + return 1; +} + +static int msr_interception(struct vcpu_svm *svm) +{ + if (svm->vmcb->control.exit_info_1) + return wrmsr_interception(svm); + else + return rdmsr_interception(svm); +} + +static int interrupt_window_interception(struct vcpu_svm *svm) +{ + struct kvm_run *kvm_run = svm->vcpu.run; + + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + svm_clear_vintr(svm); + svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; + mark_dirty(svm->vmcb, VMCB_INTR); + /* + * If the user space waits to inject interrupts, exit as soon as + * possible + */ + if (!irqchip_in_kernel(svm->vcpu.kvm) && + kvm_run->request_interrupt_window && + !kvm_cpu_has_interrupt(&svm->vcpu)) { + ++svm->vcpu.stat.irq_window_exits; + kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; + return 0; + } + + return 1; +} + +static int pause_interception(struct vcpu_svm *svm) +{ + kvm_vcpu_on_spin(&(svm->vcpu)); + return 1; +} + +static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { + [SVM_EXIT_READ_CR0] = cr_interception, + [SVM_EXIT_READ_CR3] = cr_interception, + [SVM_EXIT_READ_CR4] = cr_interception, + [SVM_EXIT_READ_CR8] = cr_interception, + [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, + [SVM_EXIT_WRITE_CR0] = cr0_write_interception, + [SVM_EXIT_WRITE_CR3] = cr_interception, + [SVM_EXIT_WRITE_CR4] = cr_interception, + [SVM_EXIT_WRITE_CR8] = cr8_write_interception, + [SVM_EXIT_READ_DR0] = dr_interception, + [SVM_EXIT_READ_DR1] = dr_interception, + [SVM_EXIT_READ_DR2] = dr_interception, + [SVM_EXIT_READ_DR3] = dr_interception, + [SVM_EXIT_READ_DR4] = dr_interception, + [SVM_EXIT_READ_DR5] = dr_interception, + [SVM_EXIT_READ_DR6] = dr_interception, + [SVM_EXIT_READ_DR7] = dr_interception, + [SVM_EXIT_WRITE_DR0] = dr_interception, + [SVM_EXIT_WRITE_DR1] = dr_interception, + [SVM_EXIT_WRITE_DR2] = dr_interception, + [SVM_EXIT_WRITE_DR3] = dr_interception, + [SVM_EXIT_WRITE_DR4] = dr_interception, + [SVM_EXIT_WRITE_DR5] = dr_interception, + [SVM_EXIT_WRITE_DR6] = dr_interception, + [SVM_EXIT_WRITE_DR7] = dr_interception, + [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, + [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, + [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, + [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, + [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, + [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, + [SVM_EXIT_INTR] = intr_interception, + [SVM_EXIT_NMI] = nmi_interception, + [SVM_EXIT_SMI] = nop_on_interception, + [SVM_EXIT_INIT] = nop_on_interception, + [SVM_EXIT_VINTR] = interrupt_window_interception, + [SVM_EXIT_CPUID] = cpuid_interception, + [SVM_EXIT_IRET] = iret_interception, + [SVM_EXIT_INVD] = emulate_on_interception, + [SVM_EXIT_PAUSE] = pause_interception, + [SVM_EXIT_HLT] = halt_interception, + [SVM_EXIT_INVLPG] = invlpg_interception, + [SVM_EXIT_INVLPGA] = invlpga_interception, + [SVM_EXIT_IOIO] = io_interception, + [SVM_EXIT_MSR] = msr_interception, + [SVM_EXIT_TASK_SWITCH] = task_switch_interception, + [SVM_EXIT_SHUTDOWN] = shutdown_interception, + [SVM_EXIT_VMRUN] = vmrun_interception, + [SVM_EXIT_VMMCALL] = vmmcall_interception, + [SVM_EXIT_VMLOAD] = vmload_interception, + [SVM_EXIT_VMSAVE] = vmsave_interception, + [SVM_EXIT_STGI] = stgi_interception, + [SVM_EXIT_CLGI] = clgi_interception, + [SVM_EXIT_SKINIT] = skinit_interception, + [SVM_EXIT_WBINVD] = emulate_on_interception, + [SVM_EXIT_MONITOR] = invalid_op_interception, + [SVM_EXIT_MWAIT] = invalid_op_interception, + [SVM_EXIT_XSETBV] = xsetbv_interception, + [SVM_EXIT_NPF] = pf_interception, +}; + +void dump_vmcb(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = &svm->vmcb->control; + struct vmcb_save_area *save = &svm->vmcb->save; + + pr_err("VMCB Control Area:\n"); + pr_err("cr_read: %04x\n", control->intercept_cr & 0xffff); + pr_err("cr_write: %04x\n", control->intercept_cr >> 16); + pr_err("dr_read: %04x\n", control->intercept_dr & 0xffff); + pr_err("dr_write: %04x\n", control->intercept_dr >> 16); + pr_err("exceptions: %08x\n", control->intercept_exceptions); + pr_err("intercepts: %016llx\n", control->intercept); + pr_err("pause filter count: %d\n", control->pause_filter_count); + pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa); + pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa); + pr_err("tsc_offset: %016llx\n", control->tsc_offset); + pr_err("asid: %d\n", control->asid); + pr_err("tlb_ctl: %d\n", control->tlb_ctl); + pr_err("int_ctl: %08x\n", control->int_ctl); + pr_err("int_vector: %08x\n", control->int_vector); + pr_err("int_state: %08x\n", control->int_state); + pr_err("exit_code: %08x\n", control->exit_code); + pr_err("exit_info1: %016llx\n", control->exit_info_1); + pr_err("exit_info2: %016llx\n", control->exit_info_2); + pr_err("exit_int_info: %08x\n", control->exit_int_info); + pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err); + pr_err("nested_ctl: %lld\n", control->nested_ctl); + pr_err("nested_cr3: %016llx\n", control->nested_cr3); + pr_err("event_inj: %08x\n", control->event_inj); + pr_err("event_inj_err: %08x\n", control->event_inj_err); + pr_err("lbr_ctl: %lld\n", control->lbr_ctl); + pr_err("next_rip: %016llx\n", control->next_rip); + pr_err("VMCB State Save Area:\n"); + pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n", + save->es.selector, save->es.attrib, + save->es.limit, save->es.base); + pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n", + save->cs.selector, save->cs.attrib, + save->cs.limit, save->cs.base); + pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n", + save->ss.selector, save->ss.attrib, + save->ss.limit, save->ss.base); + pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n", + save->ds.selector, save->ds.attrib, + save->ds.limit, save->ds.base); + pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n", + save->fs.selector, save->fs.attrib, + save->fs.limit, save->fs.base); + pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n", + save->gs.selector, save->gs.attrib, + save->gs.limit, save->gs.base); + pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n", + save->gdtr.selector, save->gdtr.attrib, + save->gdtr.limit, save->gdtr.base); + pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n", + save->ldtr.selector, save->ldtr.attrib, + save->ldtr.limit, save->ldtr.base); + pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n", + save->idtr.selector, save->idtr.attrib, + save->idtr.limit, save->idtr.base); + pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n", + save->tr.selector, save->tr.attrib, + save->tr.limit, save->tr.base); + pr_err("cpl: %d efer: %016llx\n", + save->cpl, save->efer); + pr_err("cr0: %016llx cr2: %016llx\n", + save->cr0, save->cr2); + pr_err("cr3: %016llx cr4: %016llx\n", + save->cr3, save->cr4); + pr_err("dr6: %016llx dr7: %016llx\n", + save->dr6, save->dr7); + pr_err("rip: %016llx rflags: %016llx\n", + save->rip, save->rflags); + pr_err("rsp: %016llx rax: %016llx\n", + save->rsp, save->rax); + pr_err("star: %016llx lstar: %016llx\n", + save->star, save->lstar); + pr_err("cstar: %016llx sfmask: %016llx\n", + save->cstar, save->sfmask); + pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n", + save->kernel_gs_base, save->sysenter_cs); + pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n", + save->sysenter_esp, save->sysenter_eip); + pr_err("gpat: %016llx dbgctl: %016llx\n", + save->g_pat, save->dbgctl); + pr_err("br_from: %016llx br_to: %016llx\n", + save->br_from, save->br_to); + pr_err("excp_from: %016llx excp_to: %016llx\n", + save->last_excp_from, save->last_excp_to); + +} + +static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) +{ + struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; + + *info1 = control->exit_info_1; + *info2 = control->exit_info_2; +} + +static int handle_exit(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_run *kvm_run = vcpu->run; + u32 exit_code = svm->vmcb->control.exit_code; + + trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); + + if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) + vcpu->arch.cr0 = svm->vmcb->save.cr0; + if (npt_enabled) + vcpu->arch.cr3 = svm->vmcb->save.cr3; + + if (unlikely(svm->nested.exit_required)) { + nested_svm_vmexit(svm); + svm->nested.exit_required = false; + + return 1; + } + + if (is_guest_mode(vcpu)) { + int vmexit; + + trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, + svm->vmcb->control.exit_info_1, + svm->vmcb->control.exit_info_2, + svm->vmcb->control.exit_int_info, + svm->vmcb->control.exit_int_info_err); + + vmexit = nested_svm_exit_special(svm); + + if (vmexit == NESTED_EXIT_CONTINUE) + vmexit = nested_svm_exit_handled(svm); + + if (vmexit == NESTED_EXIT_DONE) + return 1; + } + + svm_complete_interrupts(svm); + + if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { + kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; + kvm_run->fail_entry.hardware_entry_failure_reason + = svm->vmcb->control.exit_code; + pr_err("KVM: FAILED VMRUN WITH VMCB:\n"); + dump_vmcb(vcpu); + return 0; + } + + if (is_external_interrupt(svm->vmcb->control.exit_int_info) && + exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && + exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH && + exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI) + printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " + "exit_code 0x%x\n", + __func__, svm->vmcb->control.exit_int_info, + exit_code); + + if (exit_code >= ARRAY_SIZE(svm_exit_handlers) + || !svm_exit_handlers[exit_code]) { + kvm_run->exit_reason = KVM_EXIT_UNKNOWN; + kvm_run->hw.hardware_exit_reason = exit_code; + return 0; + } + + return svm_exit_handlers[exit_code](svm); +} + +static void reload_tss(struct kvm_vcpu *vcpu) +{ + int cpu = raw_smp_processor_id(); + + struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + sd->tss_desc->type = 9; /* available 32/64-bit TSS */ + load_TR_desc(); +} + +static void pre_svm_run(struct vcpu_svm *svm) +{ + int cpu = raw_smp_processor_id(); + + struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + + /* FIXME: handle wraparound of asid_generation */ + if (svm->asid_generation != sd->asid_generation) + new_asid(svm, sd); +} + +static void svm_inject_nmi(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; + vcpu->arch.hflags |= HF_NMI_MASK; + set_intercept(svm, INTERCEPT_IRET); + ++vcpu->stat.nmi_injections; +} + +static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) +{ + struct vmcb_control_area *control; + + control = &svm->vmcb->control; + control->int_vector = irq; + control->int_ctl &= ~V_INTR_PRIO_MASK; + control->int_ctl |= V_IRQ_MASK | + ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); + mark_dirty(svm->vmcb, VMCB_INTR); +} + +static void svm_set_irq(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + BUG_ON(!(gif_set(svm))); + + trace_kvm_inj_virq(vcpu->arch.interrupt.nr); + ++vcpu->stat.irq_injections; + + svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | + SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; +} + +static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) + return; + + if (irr == -1) + return; + + if (tpr >= irr) + set_cr_intercept(svm, INTERCEPT_CR8_WRITE); +} + +static int svm_nmi_allowed(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; + int ret; + ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && + !(svm->vcpu.arch.hflags & HF_NMI_MASK); + ret = ret && gif_set(svm) && nested_svm_nmi(svm); + + return ret; +} + +static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + return !!(svm->vcpu.arch.hflags & HF_NMI_MASK); +} + +static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (masked) { + svm->vcpu.arch.hflags |= HF_NMI_MASK; + set_intercept(svm, INTERCEPT_IRET); + } else { + svm->vcpu.arch.hflags &= ~HF_NMI_MASK; + clr_intercept(svm, INTERCEPT_IRET); + } +} + +static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; + int ret; + + if (!gif_set(svm) || + (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)) + return 0; + + ret = !!(vmcb->save.rflags & X86_EFLAGS_IF); + + if (is_guest_mode(vcpu)) + return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); + + return ret; +} + +static void enable_irq_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes + * 1, because that's a separate STGI/VMRUN intercept. The next time we + * get that intercept, this function will be called again though and + * we'll get the vintr intercept. + */ + if (gif_set(svm) && nested_svm_intr(svm)) { + svm_set_vintr(svm); + svm_inject_irq(svm, 0x0); + } +} + +static void enable_nmi_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) + == HF_NMI_MASK) + return; /* IRET will cause a vm exit */ + + /* + * Something prevents NMI from been injected. Single step over possible + * problem (IRET or exception injection or interrupt shadow) + */ + svm->nmi_singlestep = true; + svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); + update_db_intercept(vcpu); +} + +static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) +{ + return 0; +} + +static void svm_flush_tlb(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (static_cpu_has(X86_FEATURE_FLUSHBYASID)) + svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; + else + svm->asid_generation--; +} + +static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) +{ +} + +static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) + return; + + if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) { + int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; + kvm_set_cr8(vcpu, cr8); + } +} + +static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 cr8; + + if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) + return; + + cr8 = kvm_get_cr8(vcpu); + svm->vmcb->control.int_ctl &= ~V_TPR_MASK; + svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; +} + +static void svm_complete_interrupts(struct vcpu_svm *svm) +{ + u8 vector; + int type; + u32 exitintinfo = svm->vmcb->control.exit_int_info; + unsigned int3_injected = svm->int3_injected; + + svm->int3_injected = 0; + + if (svm->vcpu.arch.hflags & HF_IRET_MASK) { + svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + } + + svm->vcpu.arch.nmi_injected = false; + kvm_clear_exception_queue(&svm->vcpu); + kvm_clear_interrupt_queue(&svm->vcpu); + + if (!(exitintinfo & SVM_EXITINTINFO_VALID)) + return; + + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + + vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; + type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; + + switch (type) { + case SVM_EXITINTINFO_TYPE_NMI: + svm->vcpu.arch.nmi_injected = true; + break; + case SVM_EXITINTINFO_TYPE_EXEPT: + /* + * In case of software exceptions, do not reinject the vector, + * but re-execute the instruction instead. Rewind RIP first + * if we emulated INT3 before. + */ + if (kvm_exception_is_soft(vector)) { + if (vector == BP_VECTOR && int3_injected && + kvm_is_linear_rip(&svm->vcpu, svm->int3_rip)) + kvm_rip_write(&svm->vcpu, + kvm_rip_read(&svm->vcpu) - + int3_injected); + break; + } + if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { + u32 err = svm->vmcb->control.exit_int_info_err; + kvm_requeue_exception_e(&svm->vcpu, vector, err); + + } else + kvm_requeue_exception(&svm->vcpu, vector); + break; + case SVM_EXITINTINFO_TYPE_INTR: + kvm_queue_interrupt(&svm->vcpu, vector, false); + break; + default: + break; + } +} + +static void svm_cancel_injection(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = &svm->vmcb->control; + + control->exit_int_info = control->event_inj; + control->exit_int_info_err = control->event_inj_err; + control->event_inj = 0; + svm_complete_interrupts(svm); +} + +#ifdef CONFIG_X86_64 +#define R "r" +#else +#define R "e" +#endif + +static void svm_vcpu_run(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; + svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; + svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; + + /* + * A vmexit emulation is required before the vcpu can be executed + * again. + */ + if (unlikely(svm->nested.exit_required)) + return; + + pre_svm_run(svm); + + sync_lapic_to_cr8(vcpu); + + svm->vmcb->save.cr2 = vcpu->arch.cr2; + + clgi(); + + local_irq_enable(); + + asm volatile ( + "push %%"R"bp; \n\t" + "mov %c[rbx](%[svm]), %%"R"bx \n\t" + "mov %c[rcx](%[svm]), %%"R"cx \n\t" + "mov %c[rdx](%[svm]), %%"R"dx \n\t" + "mov %c[rsi](%[svm]), %%"R"si \n\t" + "mov %c[rdi](%[svm]), %%"R"di \n\t" + "mov %c[rbp](%[svm]), %%"R"bp \n\t" +#ifdef CONFIG_X86_64 + "mov %c[r8](%[svm]), %%r8 \n\t" + "mov %c[r9](%[svm]), %%r9 \n\t" + "mov %c[r10](%[svm]), %%r10 \n\t" + "mov %c[r11](%[svm]), %%r11 \n\t" + "mov %c[r12](%[svm]), %%r12 \n\t" + "mov %c[r13](%[svm]), %%r13 \n\t" + "mov %c[r14](%[svm]), %%r14 \n\t" + "mov %c[r15](%[svm]), %%r15 \n\t" +#endif + + /* Enter guest mode */ + "push %%"R"ax \n\t" + "mov %c[vmcb](%[svm]), %%"R"ax \n\t" + __ex(SVM_VMLOAD) "\n\t" + __ex(SVM_VMRUN) "\n\t" + __ex(SVM_VMSAVE) "\n\t" + "pop %%"R"ax \n\t" + + /* Save guest registers, load host registers */ + "mov %%"R"bx, %c[rbx](%[svm]) \n\t" + "mov %%"R"cx, %c[rcx](%[svm]) \n\t" + "mov %%"R"dx, %c[rdx](%[svm]) \n\t" + "mov %%"R"si, %c[rsi](%[svm]) \n\t" + "mov %%"R"di, %c[rdi](%[svm]) \n\t" + "mov %%"R"bp, %c[rbp](%[svm]) \n\t" +#ifdef CONFIG_X86_64 + "mov %%r8, %c[r8](%[svm]) \n\t" + "mov %%r9, %c[r9](%[svm]) \n\t" + "mov %%r10, %c[r10](%[svm]) \n\t" + "mov %%r11, %c[r11](%[svm]) \n\t" + "mov %%r12, %c[r12](%[svm]) \n\t" + "mov %%r13, %c[r13](%[svm]) \n\t" + "mov %%r14, %c[r14](%[svm]) \n\t" + "mov %%r15, %c[r15](%[svm]) \n\t" +#endif + "pop %%"R"bp" + : + : [svm]"a"(svm), + [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), + [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])), + [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])), + [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])), + [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])), + [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])), + [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP])) +#ifdef CONFIG_X86_64 + , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])), + [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])), + [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])), + [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])), + [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])), + [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])), + [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])), + [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) +#endif + : "cc", "memory" + , R"bx", R"cx", R"dx", R"si", R"di" +#ifdef CONFIG_X86_64 + , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" +#endif + ); + +#ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, svm->host.gs_base); +#else + loadsegment(fs, svm->host.fs); +#endif + + reload_tss(vcpu); + + local_irq_disable(); + + stgi(); + + vcpu->arch.cr2 = svm->vmcb->save.cr2; + vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; + vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; + vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; + + sync_cr8_to_lapic(vcpu); + + svm->next_rip = 0; + + svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; + + /* if exit due to PF check for async PF */ + if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) + svm->apf_reason = kvm_read_and_reset_pf_reason(); + + if (npt_enabled) { + vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); + vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); + } + + /* + * We need to handle MC intercepts here before the vcpu has a chance to + * change the physical cpu + */ + if (unlikely(svm->vmcb->control.exit_code == + SVM_EXIT_EXCP_BASE + MC_VECTOR)) + svm_handle_mce(svm); + + mark_all_clean(svm->vmcb); +} + +#undef R + +static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->save.cr3 = root; + mark_dirty(svm->vmcb, VMCB_CR); + svm_flush_tlb(vcpu); +} + +static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.nested_cr3 = root; + mark_dirty(svm->vmcb, VMCB_NPT); + + /* Also sync guest cr3 here in case we live migrate */ + svm->vmcb->save.cr3 = kvm_read_cr3(vcpu); + mark_dirty(svm->vmcb, VMCB_CR); + + svm_flush_tlb(vcpu); +} + +static int is_disabled(void) +{ + u64 vm_cr; + + rdmsrl(MSR_VM_CR, vm_cr); + if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) + return 1; + + return 0; +} + +static void +svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) +{ + /* + * Patch in the VMMCALL instruction: + */ + hypercall[0] = 0x0f; + hypercall[1] = 0x01; + hypercall[2] = 0xd9; +} + +static void svm_check_processor_compat(void *rtn) +{ + *(int *)rtn = 0; +} + +static bool svm_cpu_has_accelerated_tpr(void) +{ + return false; +} + +static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) +{ + return 0; +} + +static void svm_cpuid_update(struct kvm_vcpu *vcpu) +{ +} + +static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) +{ + switch (func) { + case 0x80000001: + if (nested) + entry->ecx |= (1 << 2); /* Set SVM bit */ + break; + case 0x8000000A: + entry->eax = 1; /* SVM revision 1 */ + entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper + ASID emulation to nested SVM */ + entry->ecx = 0; /* Reserved */ + entry->edx = 0; /* Per default do not support any + additional features */ + + /* Support next_rip if host supports it */ + if (boot_cpu_has(X86_FEATURE_NRIPS)) + entry->edx |= SVM_FEATURE_NRIP; + + /* Support NPT for the guest if enabled */ + if (npt_enabled) + entry->edx |= SVM_FEATURE_NPT; + + break; + } +} + +static const struct trace_print_flags svm_exit_reasons_str[] = { + { SVM_EXIT_READ_CR0, "read_cr0" }, + { SVM_EXIT_READ_CR3, "read_cr3" }, + { SVM_EXIT_READ_CR4, "read_cr4" }, + { SVM_EXIT_READ_CR8, "read_cr8" }, + { SVM_EXIT_WRITE_CR0, "write_cr0" }, + { SVM_EXIT_WRITE_CR3, "write_cr3" }, + { SVM_EXIT_WRITE_CR4, "write_cr4" }, + { SVM_EXIT_WRITE_CR8, "write_cr8" }, + { SVM_EXIT_READ_DR0, "read_dr0" }, + { SVM_EXIT_READ_DR1, "read_dr1" }, + { SVM_EXIT_READ_DR2, "read_dr2" }, + { SVM_EXIT_READ_DR3, "read_dr3" }, + { SVM_EXIT_WRITE_DR0, "write_dr0" }, + { SVM_EXIT_WRITE_DR1, "write_dr1" }, + { SVM_EXIT_WRITE_DR2, "write_dr2" }, + { SVM_EXIT_WRITE_DR3, "write_dr3" }, + { SVM_EXIT_WRITE_DR5, "write_dr5" }, + { SVM_EXIT_WRITE_DR7, "write_dr7" }, + { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, + { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, + { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, + { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, + { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, + { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, + { SVM_EXIT_INTR, "interrupt" }, + { SVM_EXIT_NMI, "nmi" }, + { SVM_EXIT_SMI, "smi" }, + { SVM_EXIT_INIT, "init" }, + { SVM_EXIT_VINTR, "vintr" }, + { SVM_EXIT_CPUID, "cpuid" }, + { SVM_EXIT_INVD, "invd" }, + { SVM_EXIT_HLT, "hlt" }, + { SVM_EXIT_INVLPG, "invlpg" }, + { SVM_EXIT_INVLPGA, "invlpga" }, + { SVM_EXIT_IOIO, "io" }, + { SVM_EXIT_MSR, "msr" }, + { SVM_EXIT_TASK_SWITCH, "task_switch" }, + { SVM_EXIT_SHUTDOWN, "shutdown" }, + { SVM_EXIT_VMRUN, "vmrun" }, + { SVM_EXIT_VMMCALL, "hypercall" }, + { SVM_EXIT_VMLOAD, "vmload" }, + { SVM_EXIT_VMSAVE, "vmsave" }, + { SVM_EXIT_STGI, "stgi" }, + { SVM_EXIT_CLGI, "clgi" }, + { SVM_EXIT_SKINIT, "skinit" }, + { SVM_EXIT_WBINVD, "wbinvd" }, + { SVM_EXIT_MONITOR, "monitor" }, + { SVM_EXIT_MWAIT, "mwait" }, + { SVM_EXIT_XSETBV, "xsetbv" }, + { SVM_EXIT_NPF, "npf" }, + { -1, NULL } +}; + +static int svm_get_lpage_level(void) +{ + return PT_PDPE_LEVEL; +} + +static bool svm_rdtscp_supported(void) +{ + return false; +} + +static bool svm_has_wbinvd_exit(void) +{ + return true; +} + +static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + set_exception_intercept(svm, NM_VECTOR); + update_cr0_intercept(svm); +} + +static struct kvm_x86_ops svm_x86_ops = { + .cpu_has_kvm_support = has_svm, + .disabled_by_bios = is_disabled, + .hardware_setup = svm_hardware_setup, + .hardware_unsetup = svm_hardware_unsetup, + .check_processor_compatibility = svm_check_processor_compat, + .hardware_enable = svm_hardware_enable, + .hardware_disable = svm_hardware_disable, + .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, + + .vcpu_create = svm_create_vcpu, + .vcpu_free = svm_free_vcpu, + .vcpu_reset = svm_vcpu_reset, + + .prepare_guest_switch = svm_prepare_guest_switch, + .vcpu_load = svm_vcpu_load, + .vcpu_put = svm_vcpu_put, + + .set_guest_debug = svm_guest_debug, + .get_msr = svm_get_msr, + .set_msr = svm_set_msr, + .get_segment_base = svm_get_segment_base, + .get_segment = svm_get_segment, + .set_segment = svm_set_segment, + .get_cpl = svm_get_cpl, + .get_cs_db_l_bits = kvm_get_cs_db_l_bits, + .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, + .decache_cr3 = svm_decache_cr3, + .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, + .set_cr0 = svm_set_cr0, + .set_cr3 = svm_set_cr3, + .set_cr4 = svm_set_cr4, + .set_efer = svm_set_efer, + .get_idt = svm_get_idt, + .set_idt = svm_set_idt, + .get_gdt = svm_get_gdt, + .set_gdt = svm_set_gdt, + .set_dr7 = svm_set_dr7, + .cache_reg = svm_cache_reg, + .get_rflags = svm_get_rflags, + .set_rflags = svm_set_rflags, + .fpu_activate = svm_fpu_activate, + .fpu_deactivate = svm_fpu_deactivate, + + .tlb_flush = svm_flush_tlb, + + .run = svm_vcpu_run, + .handle_exit = handle_exit, + .skip_emulated_instruction = skip_emulated_instruction, + .set_interrupt_shadow = svm_set_interrupt_shadow, + .get_interrupt_shadow = svm_get_interrupt_shadow, + .patch_hypercall = svm_patch_hypercall, + .set_irq = svm_set_irq, + .set_nmi = svm_inject_nmi, + .queue_exception = svm_queue_exception, + .cancel_injection = svm_cancel_injection, + .interrupt_allowed = svm_interrupt_allowed, + .nmi_allowed = svm_nmi_allowed, + .get_nmi_mask = svm_get_nmi_mask, + .set_nmi_mask = svm_set_nmi_mask, + .enable_nmi_window = enable_nmi_window, + .enable_irq_window = enable_irq_window, + .update_cr8_intercept = update_cr8_intercept, + + .set_tss_addr = svm_set_tss_addr, + .get_tdp_level = get_npt_level, + .get_mt_mask = svm_get_mt_mask, + + .get_exit_info = svm_get_exit_info, + .exit_reasons_str = svm_exit_reasons_str, + + .get_lpage_level = svm_get_lpage_level, + + .cpuid_update = svm_cpuid_update, + + .rdtscp_supported = svm_rdtscp_supported, + + .set_supported_cpuid = svm_set_supported_cpuid, + + .has_wbinvd_exit = svm_has_wbinvd_exit, + + .write_tsc_offset = svm_write_tsc_offset, + .adjust_tsc_offset = svm_adjust_tsc_offset, + + .set_tdp_cr3 = set_tdp_cr3, +}; + +static int __init svm_init(void) +{ + return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm), + __alignof__(struct vcpu_svm), THIS_MODULE); +} + +static void __exit svm_exit(void) +{ + kvm_exit(); +} + +module_init(svm_init) +module_exit(svm_exit) diff --git a/linux/x86/timer.c b/linux/x86/timer.c new file mode 100644 index 0000000..1fa2aa0 --- /dev/null +++ b/linux/x86/timer.c @@ -0,0 +1,105 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * timer support + * + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/hrtimer.h> +#include <asm/atomic.h> +#include "kvm_timer.h" + +static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) +{ + int restart_timer = 0; + wait_queue_head_t *q = &vcpu->wq; + + /* + * There is a race window between reading and incrementing, but we do + * not care about potentially loosing timer events in the !reinject + * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked + * in vcpu_enter_guest. + */ + if (ktimer->reinject || !atomic_read(&ktimer->pending)) { + atomic_inc(&ktimer->pending); + /* FIXME: this code should not know anything about vcpus */ + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + } + + if (waitqueue_active(q)) + wake_up_interruptible(q); + + if (ktimer->t_ops->is_periodic(ktimer)) { + kvm_hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); + restart_timer = 1; + } + + return restart_timer; +} + +enum hrtimer_restart kvm_timer_fn(struct hrtimer *data) +{ + int restart_timer; + struct kvm_vcpu *vcpu; + struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); + + vcpu = ktimer->vcpu; + if (!vcpu) + return HRTIMER_NORESTART; + + restart_timer = __kvm_timer_fn(vcpu, ktimer); + if (restart_timer) + return HRTIMER_RESTART; + else + return HRTIMER_NORESTART; +} + diff --git a/linux/x86/trace.h b/linux/x86/trace.h new file mode 100644 index 0000000..1357d7c --- /dev/null +++ b/linux/x86/trace.h @@ -0,0 +1,709 @@ +#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_H + +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm + +/* + * Tracepoint for guest mode entry. + */ +TRACE_EVENT(kvm_entry, + TP_PROTO(unsigned int vcpu_id), + TP_ARGS(vcpu_id), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + ), + + TP_printk("vcpu %u", __entry->vcpu_id) +); + +/* + * Tracepoint for hypercall. + */ +TRACE_EVENT(kvm_hypercall, + TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1, + unsigned long a2, unsigned long a3), + TP_ARGS(nr, a0, a1, a2, a3), + + TP_STRUCT__entry( + __field( unsigned long, nr ) + __field( unsigned long, a0 ) + __field( unsigned long, a1 ) + __field( unsigned long, a2 ) + __field( unsigned long, a3 ) + ), + + TP_fast_assign( + __entry->nr = nr; + __entry->a0 = a0; + __entry->a1 = a1; + __entry->a2 = a2; + __entry->a3 = a3; + ), + + TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx", + __entry->nr, __entry->a0, __entry->a1, __entry->a2, + __entry->a3) +); + +/* + * Tracepoint for hypercall. + */ +TRACE_EVENT(kvm_hv_hypercall, + TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx, + __u64 ingpa, __u64 outgpa), + TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa), + + TP_STRUCT__entry( + __field( __u16, code ) + __field( bool, fast ) + __field( __u16, rep_cnt ) + __field( __u16, rep_idx ) + __field( __u64, ingpa ) + __field( __u64, outgpa ) + ), + + TP_fast_assign( + __entry->code = code; + __entry->fast = fast; + __entry->rep_cnt = rep_cnt; + __entry->rep_idx = rep_idx; + __entry->ingpa = ingpa; + __entry->outgpa = outgpa; + ), + + TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", + __entry->code, __entry->fast ? "fast" : "slow", + __entry->rep_cnt, __entry->rep_idx, __entry->ingpa, + __entry->outgpa) +); + +/* + * Tracepoint for PIO. + */ +TRACE_EVENT(kvm_pio, + TP_PROTO(unsigned int rw, unsigned int port, unsigned int size, + unsigned int count), + TP_ARGS(rw, port, size, count), + + TP_STRUCT__entry( + __field( unsigned int, rw ) + __field( unsigned int, port ) + __field( unsigned int, size ) + __field( unsigned int, count ) + ), + + TP_fast_assign( + __entry->rw = rw; + __entry->port = port; + __entry->size = size; + __entry->count = count; + ), + + TP_printk("pio_%s at 0x%x size %d count %d", + __entry->rw ? "write" : "read", + __entry->port, __entry->size, __entry->count) +); + +/* + * Tracepoint for cpuid. + */ +TRACE_EVENT(kvm_cpuid, + TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx, + unsigned long rcx, unsigned long rdx), + TP_ARGS(function, rax, rbx, rcx, rdx), + + TP_STRUCT__entry( + __field( unsigned int, function ) + __field( unsigned long, rax ) + __field( unsigned long, rbx ) + __field( unsigned long, rcx ) + __field( unsigned long, rdx ) + ), + + TP_fast_assign( + __entry->function = function; + __entry->rax = rax; + __entry->rbx = rbx; + __entry->rcx = rcx; + __entry->rdx = rdx; + ), + + TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx", + __entry->function, __entry->rax, + __entry->rbx, __entry->rcx, __entry->rdx) +); + +#define AREG(x) { APIC_##x, "APIC_" #x } + +#define kvm_trace_symbol_apic \ + AREG(ID), AREG(LVR), AREG(TASKPRI), AREG(ARBPRI), AREG(PROCPRI), \ + AREG(EOI), AREG(RRR), AREG(LDR), AREG(DFR), AREG(SPIV), AREG(ISR), \ + AREG(TMR), AREG(IRR), AREG(ESR), AREG(ICR), AREG(ICR2), AREG(LVTT), \ + AREG(LVTTHMR), AREG(LVTPC), AREG(LVT0), AREG(LVT1), AREG(LVTERR), \ + AREG(TMICT), AREG(TMCCT), AREG(TDCR), AREG(SELF_IPI), AREG(EFEAT), \ + AREG(ECTRL) +/* + * Tracepoint for apic access. + */ +TRACE_EVENT(kvm_apic, + TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val), + TP_ARGS(rw, reg, val), + + TP_STRUCT__entry( + __field( unsigned int, rw ) + __field( unsigned int, reg ) + __field( unsigned int, val ) + ), + + TP_fast_assign( + __entry->rw = rw; + __entry->reg = reg; + __entry->val = val; + ), + + TP_printk("apic_%s %s = 0x%x", + __entry->rw ? "write" : "read", + __print_symbolic(__entry->reg, kvm_trace_symbol_apic), + __entry->val) +); + +#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) +#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) + +#define KVM_ISA_VMX 1 +#define KVM_ISA_SVM 2 + +/* + * Tracepoint for kvm guest exit: + */ +TRACE_EVENT(kvm_exit, + TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), + TP_ARGS(exit_reason, vcpu, isa), + + TP_STRUCT__entry( + __field( unsigned int, exit_reason ) + __field( unsigned long, guest_rip ) + __field( u32, isa ) + __field( u64, info1 ) + __field( u64, info2 ) + ), + + TP_fast_assign( + __entry->exit_reason = exit_reason; + __entry->guest_rip = kvm_rip_read(vcpu); + __entry->isa = isa; + kvm_x86_ops->get_exit_info(vcpu, &__entry->info1, + &__entry->info2); + ), + + TP_printk("reason %s rip 0x%lx info %llx %llx", + ftrace_print_symbols_seq(p, __entry->exit_reason, + kvm_x86_ops->exit_reasons_str), + __entry->guest_rip, __entry->info1, __entry->info2) +); + +/* + * Tracepoint for kvm interrupt injection: + */ +TRACE_EVENT(kvm_inj_virq, + TP_PROTO(unsigned int irq), + TP_ARGS(irq), + + TP_STRUCT__entry( + __field( unsigned int, irq ) + ), + + TP_fast_assign( + __entry->irq = irq; + ), + + TP_printk("irq %u", __entry->irq) +); + +#define EXS(x) { x##_VECTOR, "#" #x } + +#define kvm_trace_sym_exc \ + EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \ + EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \ + EXS(MF), EXS(MC) + +/* + * Tracepoint for kvm interrupt injection: + */ +TRACE_EVENT(kvm_inj_exception, + TP_PROTO(unsigned exception, bool has_error, unsigned error_code), + TP_ARGS(exception, has_error, error_code), + + TP_STRUCT__entry( + __field( u8, exception ) + __field( u8, has_error ) + __field( u32, error_code ) + ), + + TP_fast_assign( + __entry->exception = exception; + __entry->has_error = has_error; + __entry->error_code = error_code; + ), + + TP_printk("%s (0x%x)", + __print_symbolic(__entry->exception, kvm_trace_sym_exc), + /* FIXME: don't print error_code if not present */ + __entry->has_error ? __entry->error_code : 0) +); + +/* + * Tracepoint for page fault. + */ +TRACE_EVENT(kvm_page_fault, + TP_PROTO(unsigned long fault_address, unsigned int error_code), + TP_ARGS(fault_address, error_code), + + TP_STRUCT__entry( + __field( unsigned long, fault_address ) + __field( unsigned int, error_code ) + ), + + TP_fast_assign( + __entry->fault_address = fault_address; + __entry->error_code = error_code; + ), + + TP_printk("address %lx error_code %x", + __entry->fault_address, __entry->error_code) +); + +/* + * Tracepoint for guest MSR access. + */ +TRACE_EVENT(kvm_msr, + TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception), + TP_ARGS(write, ecx, data, exception), + + TP_STRUCT__entry( + __field( unsigned, write ) + __field( u32, ecx ) + __field( u64, data ) + __field( u8, exception ) + ), + + TP_fast_assign( + __entry->write = write; + __entry->ecx = ecx; + __entry->data = data; + __entry->exception = exception; + ), + + TP_printk("msr_%s %x = 0x%llx%s", + __entry->write ? "write" : "read", + __entry->ecx, __entry->data, + __entry->exception ? " (#GP)" : "") +); + +#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false) +#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false) +#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true) +#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true) + +/* + * Tracepoint for guest CR access. + */ +TRACE_EVENT(kvm_cr, + TP_PROTO(unsigned int rw, unsigned int cr, unsigned long val), + TP_ARGS(rw, cr, val), + + TP_STRUCT__entry( + __field( unsigned int, rw ) + __field( unsigned int, cr ) + __field( unsigned long, val ) + ), + + TP_fast_assign( + __entry->rw = rw; + __entry->cr = cr; + __entry->val = val; + ), + + TP_printk("cr_%s %x = 0x%lx", + __entry->rw ? "write" : "read", + __entry->cr, __entry->val) +); + +#define trace_kvm_cr_read(cr, val) trace_kvm_cr(0, cr, val) +#define trace_kvm_cr_write(cr, val) trace_kvm_cr(1, cr, val) + +TRACE_EVENT(kvm_pic_set_irq, + TP_PROTO(__u8 chip, __u8 pin, __u8 elcr, __u8 imr, bool coalesced), + TP_ARGS(chip, pin, elcr, imr, coalesced), + + TP_STRUCT__entry( + __field( __u8, chip ) + __field( __u8, pin ) + __field( __u8, elcr ) + __field( __u8, imr ) + __field( bool, coalesced ) + ), + + TP_fast_assign( + __entry->chip = chip; + __entry->pin = pin; + __entry->elcr = elcr; + __entry->imr = imr; + __entry->coalesced = coalesced; + ), + + TP_printk("chip %u pin %u (%s%s)%s", + __entry->chip, __entry->pin, + (__entry->elcr & (1 << __entry->pin)) ? "level":"edge", + (__entry->imr & (1 << __entry->pin)) ? "|masked":"", + __entry->coalesced ? " (coalesced)" : "") +); + +#define kvm_apic_dst_shorthand \ + {0x0, "dst"}, \ + {0x1, "self"}, \ + {0x2, "all"}, \ + {0x3, "all-but-self"} + +TRACE_EVENT(kvm_apic_ipi, + TP_PROTO(__u32 icr_low, __u32 dest_id), + TP_ARGS(icr_low, dest_id), + + TP_STRUCT__entry( + __field( __u32, icr_low ) + __field( __u32, dest_id ) + ), + + TP_fast_assign( + __entry->icr_low = icr_low; + __entry->dest_id = dest_id; + ), + + TP_printk("dst %x vec %u (%s|%s|%s|%s|%s)", + __entry->dest_id, (u8)__entry->icr_low, + __print_symbolic((__entry->icr_low >> 8 & 0x7), + kvm_deliver_mode), + (__entry->icr_low & (1<<11)) ? "logical" : "physical", + (__entry->icr_low & (1<<14)) ? "assert" : "de-assert", + (__entry->icr_low & (1<<15)) ? "level" : "edge", + __print_symbolic((__entry->icr_low >> 18 & 0x3), + kvm_apic_dst_shorthand)) +); + +TRACE_EVENT(kvm_apic_accept_irq, + TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec, bool coalesced), + TP_ARGS(apicid, dm, tm, vec, coalesced), + + TP_STRUCT__entry( + __field( __u32, apicid ) + __field( __u16, dm ) + __field( __u8, tm ) + __field( __u8, vec ) + __field( bool, coalesced ) + ), + + TP_fast_assign( + __entry->apicid = apicid; + __entry->dm = dm; + __entry->tm = tm; + __entry->vec = vec; + __entry->coalesced = coalesced; + ), + + TP_printk("apicid %x vec %u (%s|%s)%s", + __entry->apicid, __entry->vec, + __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode), + __entry->tm ? "level" : "edge", + __entry->coalesced ? " (coalesced)" : "") +); + +/* + * Tracepoint for nested VMRUN + */ +TRACE_EVENT(kvm_nested_vmrun, + TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl, + __u32 event_inj, bool npt), + TP_ARGS(rip, vmcb, nested_rip, int_ctl, event_inj, npt), + + TP_STRUCT__entry( + __field( __u64, rip ) + __field( __u64, vmcb ) + __field( __u64, nested_rip ) + __field( __u32, int_ctl ) + __field( __u32, event_inj ) + __field( bool, npt ) + ), + + TP_fast_assign( + __entry->rip = rip; + __entry->vmcb = vmcb; + __entry->nested_rip = nested_rip; + __entry->int_ctl = int_ctl; + __entry->event_inj = event_inj; + __entry->npt = npt; + ), + + TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x " + "event_inj: 0x%08x npt: %s", + __entry->rip, __entry->vmcb, __entry->nested_rip, + __entry->int_ctl, __entry->event_inj, + __entry->npt ? "on" : "off") +); + +TRACE_EVENT(kvm_nested_intercepts, + TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept), + TP_ARGS(cr_read, cr_write, exceptions, intercept), + + TP_STRUCT__entry( + __field( __u16, cr_read ) + __field( __u16, cr_write ) + __field( __u32, exceptions ) + __field( __u64, intercept ) + ), + + TP_fast_assign( + __entry->cr_read = cr_read; + __entry->cr_write = cr_write; + __entry->exceptions = exceptions; + __entry->intercept = intercept; + ), + + TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx", + __entry->cr_read, __entry->cr_write, __entry->exceptions, + __entry->intercept) +); +/* + * Tracepoint for #VMEXIT while nested + */ +TRACE_EVENT(kvm_nested_vmexit, + TP_PROTO(__u64 rip, __u32 exit_code, + __u64 exit_info1, __u64 exit_info2, + __u32 exit_int_info, __u32 exit_int_info_err), + TP_ARGS(rip, exit_code, exit_info1, exit_info2, + exit_int_info, exit_int_info_err), + + TP_STRUCT__entry( + __field( __u64, rip ) + __field( __u32, exit_code ) + __field( __u64, exit_info1 ) + __field( __u64, exit_info2 ) + __field( __u32, exit_int_info ) + __field( __u32, exit_int_info_err ) + ), + + TP_fast_assign( + __entry->rip = rip; + __entry->exit_code = exit_code; + __entry->exit_info1 = exit_info1; + __entry->exit_info2 = exit_info2; + __entry->exit_int_info = exit_int_info; + __entry->exit_int_info_err = exit_int_info_err; + ), + TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx " + "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", + __entry->rip, + ftrace_print_symbols_seq(p, __entry->exit_code, + kvm_x86_ops->exit_reasons_str), + __entry->exit_info1, __entry->exit_info2, + __entry->exit_int_info, __entry->exit_int_info_err) +); + +/* + * Tracepoint for #VMEXIT reinjected to the guest + */ +TRACE_EVENT(kvm_nested_vmexit_inject, + TP_PROTO(__u32 exit_code, + __u64 exit_info1, __u64 exit_info2, + __u32 exit_int_info, __u32 exit_int_info_err), + TP_ARGS(exit_code, exit_info1, exit_info2, + exit_int_info, exit_int_info_err), + + TP_STRUCT__entry( + __field( __u32, exit_code ) + __field( __u64, exit_info1 ) + __field( __u64, exit_info2 ) + __field( __u32, exit_int_info ) + __field( __u32, exit_int_info_err ) + ), + + TP_fast_assign( + __entry->exit_code = exit_code; + __entry->exit_info1 = exit_info1; + __entry->exit_info2 = exit_info2; + __entry->exit_int_info = exit_int_info; + __entry->exit_int_info_err = exit_int_info_err; + ), + + TP_printk("reason: %s ext_inf1: 0x%016llx " + "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", + ftrace_print_symbols_seq(p, __entry->exit_code, + kvm_x86_ops->exit_reasons_str), + __entry->exit_info1, __entry->exit_info2, + __entry->exit_int_info, __entry->exit_int_info_err) +); + +/* + * Tracepoint for nested #vmexit because of interrupt pending + */ +TRACE_EVENT(kvm_nested_intr_vmexit, + TP_PROTO(__u64 rip), + TP_ARGS(rip), + + TP_STRUCT__entry( + __field( __u64, rip ) + ), + + TP_fast_assign( + __entry->rip = rip + ), + + TP_printk("rip: 0x%016llx", __entry->rip) +); + +/* + * Tracepoint for nested #vmexit because of interrupt pending + */ +TRACE_EVENT(kvm_invlpga, + TP_PROTO(__u64 rip, int asid, u64 address), + TP_ARGS(rip, asid, address), + + TP_STRUCT__entry( + __field( __u64, rip ) + __field( int, asid ) + __field( __u64, address ) + ), + + TP_fast_assign( + __entry->rip = rip; + __entry->asid = asid; + __entry->address = address; + ), + + TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx", + __entry->rip, __entry->asid, __entry->address) +); + +/* + * Tracepoint for nested #vmexit because of interrupt pending + */ +TRACE_EVENT(kvm_skinit, + TP_PROTO(__u64 rip, __u32 slb), + TP_ARGS(rip, slb), + + TP_STRUCT__entry( + __field( __u64, rip ) + __field( __u32, slb ) + ), + + TP_fast_assign( + __entry->rip = rip; + __entry->slb = slb; + ), + + TP_printk("rip: 0x%016llx slb: 0x%08x", + __entry->rip, __entry->slb) +); + +#define __print_insn(insn, ilen) ({ \ + int i; \ + const char *ret = p->buffer + p->len; \ + \ + for (i = 0; i < ilen; ++i) \ + trace_seq_printf(p, " %02x", insn[i]); \ + trace_seq_printf(p, "%c", 0); \ + ret; \ + }) + +#define KVM_EMUL_INSN_F_CR0_PE (1 << 0) +#define KVM_EMUL_INSN_F_EFL_VM (1 << 1) +#define KVM_EMUL_INSN_F_CS_D (1 << 2) +#define KVM_EMUL_INSN_F_CS_L (1 << 3) + +#define kvm_trace_symbol_emul_flags \ + { 0, "real" }, \ + { KVM_EMUL_INSN_F_CR0_PE \ + | KVM_EMUL_INSN_F_EFL_VM, "vm16" }, \ + { KVM_EMUL_INSN_F_CR0_PE, "prot16" }, \ + { KVM_EMUL_INSN_F_CR0_PE \ + | KVM_EMUL_INSN_F_CS_D, "prot32" }, \ + { KVM_EMUL_INSN_F_CR0_PE \ + | KVM_EMUL_INSN_F_CS_L, "prot64" } + +#define kei_decode_mode(mode) ({ \ + u8 flags = 0xff; \ + switch (mode) { \ + case X86EMUL_MODE_REAL: \ + flags = 0; \ + break; \ + case X86EMUL_MODE_VM86: \ + flags = KVM_EMUL_INSN_F_EFL_VM; \ + break; \ + case X86EMUL_MODE_PROT16: \ + flags = KVM_EMUL_INSN_F_CR0_PE; \ + break; \ + case X86EMUL_MODE_PROT32: \ + flags = KVM_EMUL_INSN_F_CR0_PE \ + | KVM_EMUL_INSN_F_CS_D; \ + break; \ + case X86EMUL_MODE_PROT64: \ + flags = KVM_EMUL_INSN_F_CR0_PE \ + | KVM_EMUL_INSN_F_CS_L; \ + break; \ + } \ + flags; \ + }) + +TRACE_EVENT(kvm_emulate_insn, + TP_PROTO(struct kvm_vcpu *vcpu, __u8 failed), + TP_ARGS(vcpu, failed), + + TP_STRUCT__entry( + __field( __u64, rip ) + __field( __u32, csbase ) + __field( __u8, len ) + __array( __u8, insn, 15 ) + __field( __u8, flags ) + __field( __u8, failed ) + ), + + TP_fast_assign( + __entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start; + __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS); + __entry->len = vcpu->arch.emulate_ctxt.decode.eip + - vcpu->arch.emulate_ctxt.decode.fetch.start; + memcpy(__entry->insn, + vcpu->arch.emulate_ctxt.decode.fetch.data, + 15); + __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode); + __entry->failed = failed; + ), + + TP_printk("%x:%llx:%s (%s)%s", + __entry->csbase, __entry->rip, + __print_insn(__entry->insn, __entry->len), + __print_symbolic(__entry->flags, + kvm_trace_symbol_emul_flags), + __entry->failed ? " failed" : "" + ) + ); + +#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0) +#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1) + +#endif /* _TRACE_KVM_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH arch/x86/kvm +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/linux/x86/tss.h b/linux/x86/tss.h new file mode 100644 index 0000000..622aa10 --- /dev/null +++ b/linux/x86/tss.h @@ -0,0 +1,59 @@ +#ifndef __TSS_SEGMENT_H +#define __TSS_SEGMENT_H + +struct tss_segment_32 { + u32 prev_task_link; + u32 esp0; + u32 ss0; + u32 esp1; + u32 ss1; + u32 esp2; + u32 ss2; + u32 cr3; + u32 eip; + u32 eflags; + u32 eax; + u32 ecx; + u32 edx; + u32 ebx; + u32 esp; + u32 ebp; + u32 esi; + u32 edi; + u32 es; + u32 cs; + u32 ss; + u32 ds; + u32 fs; + u32 gs; + u32 ldt_selector; + u16 t; + u16 io_map; +}; + +struct tss_segment_16 { + u16 prev_task_link; + u16 sp0; + u16 ss0; + u16 sp1; + u16 ss1; + u16 sp2; + u16 ss2; + u16 ip; + u16 flag; + u16 ax; + u16 cx; + u16 dx; + u16 bx; + u16 sp; + u16 bp; + u16 si; + u16 di; + u16 es; + u16 cs; + u16 ss; + u16 ds; + u16 ldt; +}; + +#endif diff --git a/linux/x86/vmx.c b/linux/x86/vmx.c new file mode 100644 index 0000000..ed4cf97 --- /dev/null +++ b/linux/x86/vmx.c @@ -0,0 +1,4549 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "irq.h" +#include "mmu.h" + +#include <linux/kvm_host.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/sched.h> +#include <linux/moduleparam.h> +#include <linux/ftrace_event.h> +#include <linux/slab.h> +#include <linux/tboot.h> +#include "kvm_cache_regs.h" +#include "x86.h" + +#include <asm/io.h> +#include <asm/desc.h> +#include <asm/vmx.h> +#include <asm/virtext.h> +#include <asm/mce.h> +#include <asm/i387.h> +#include <asm/xcr.h> + +#include "trace.h" + +#define __ex(x) __kvm_handle_fault_on_reboot(x) + +MODULE_INFO(version, "kvm-kmod-2.6.38-rc7"); +MODULE_AUTHOR("Qumranet"); +MODULE_LICENSE("GPL"); + +static int __read_mostly bypass_guest_pf = 1; +module_param(bypass_guest_pf, bool, S_IRUGO); + +static int __read_mostly enable_vpid = 1; +module_param_named(vpid, enable_vpid, bool, 0444); + +static int __read_mostly flexpriority_enabled = 1; +module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); + +static int __read_mostly enable_ept = 1; +module_param_named(ept, enable_ept, bool, S_IRUGO); + +static int __read_mostly enable_unrestricted_guest = 1; +module_param_named(unrestricted_guest, + enable_unrestricted_guest, bool, S_IRUGO); + +static int __read_mostly emulate_invalid_guest_state = 0; +module_param(emulate_invalid_guest_state, bool, S_IRUGO); + +static int __read_mostly vmm_exclusive = 1; +module_param(vmm_exclusive, bool, S_IRUGO); + +static int __read_mostly yield_on_hlt = 1; +module_param(yield_on_hlt, bool, S_IRUGO); + +#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) +#define KVM_GUEST_CR0_MASK \ + (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE) +#define KVM_VM_CR0_ALWAYS_ON \ + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_CR4_GUEST_OWNED_BITS \ + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT) + +#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) +#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) + +#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) + +/* + * These 2 parameters are used to config the controls for Pause-Loop Exiting: + * ple_gap: upper bound on the amount of time between two successive + * executions of PAUSE in a loop. Also indicate if ple enabled. + * According to test, this time is usually small than 41 cycles. + * ple_window: upper bound on the amount of time a guest is allowed to execute + * in a PAUSE loop. Tests indicate that most spinlocks are held for + * less than 2^12 cycles + * Time is measured based on a counter that runs at the same rate as the TSC, + * refer SDM volume 3b section 21.6.13 & 22.1.3. + */ +#define KVM_VMX_DEFAULT_PLE_GAP 41 +#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 +static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; +module_param(ple_gap, int, S_IRUGO); + +static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; +module_param(ple_window, int, S_IRUGO); + +#define NR_AUTOLOAD_MSRS 1 + +struct vmcs { + u32 revision_id; + u32 abort; + char data[0]; +}; + +struct shared_msr_entry { + unsigned index; + u64 data; + u64 mask; +}; + +struct vcpu_vmx { + struct kvm_vcpu vcpu; + struct list_head local_vcpus_link; + unsigned long host_rsp; + int launched; + u8 fail; + u32 exit_intr_info; + u32 idt_vectoring_info; + struct shared_msr_entry *guest_msrs; + int nmsrs; + int save_nmsrs; +#ifdef CONFIG_X86_64 + u64 msr_host_kernel_gs_base; + u64 msr_guest_kernel_gs_base; +#endif + struct vmcs *vmcs; + struct msr_autoload { + unsigned nr; + struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; + struct vmx_msr_entry host[NR_AUTOLOAD_MSRS]; + } msr_autoload; + struct { + int loaded; + u16 fs_sel, gs_sel, ldt_sel; + int gs_ldt_reload_needed; + int fs_reload_needed; + } host_state; + struct { + int vm86_active; + ulong save_rflags; + struct kvm_save_segment { + u16 selector; + unsigned long base; + u32 limit; + u32 ar; + } tr, es, ds, fs, gs; + } rmode; + int vpid; + bool emulation_required; + + /* Support for vnmi-less CPUs */ + int soft_vnmi_blocked; + ktime_t entry_time; + s64 vnmi_blocked_time; + u32 exit_reason; + + bool rdtscp_enabled; +}; + +static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct vcpu_vmx, vcpu); +} + +static int init_rmode(struct kvm *kvm); +static u64 construct_eptp(unsigned long root_hpa); +static void kvm_cpu_vmxon(u64 addr); +static void kvm_cpu_vmxoff(void); +static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); + +static DEFINE_PER_CPU(struct vmcs *, vmxarea); +static DEFINE_PER_CPU(struct vmcs *, current_vmcs); +static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); +static DEFINE_PER_CPU(struct kvm_desc_ptr, host_gdt); + +static unsigned long *vmx_io_bitmap_a; +static unsigned long *vmx_io_bitmap_b; +static unsigned long *vmx_msr_bitmap_legacy; +static unsigned long *vmx_msr_bitmap_longmode; + +static bool cpu_has_load_ia32_efer; + +static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); +static DEFINE_SPINLOCK(vmx_vpid_lock); + +static struct vmcs_config { + int size; + int order; + u32 revision_id; + u32 pin_based_exec_ctrl; + u32 cpu_based_exec_ctrl; + u32 cpu_based_2nd_exec_ctrl; + u32 vmexit_ctrl; + u32 vmentry_ctrl; +} vmcs_config; + +static struct vmx_capability { + u32 ept; + u32 vpid; +} vmx_capability; + +#define VMX_SEGMENT_FIELD(seg) \ + [VCPU_SREG_##seg] = { \ + .selector = GUEST_##seg##_SELECTOR, \ + .base = GUEST_##seg##_BASE, \ + .limit = GUEST_##seg##_LIMIT, \ + .ar_bytes = GUEST_##seg##_AR_BYTES, \ + } + +static struct kvm_vmx_segment_field { + unsigned selector; + unsigned base; + unsigned limit; + unsigned ar_bytes; +} kvm_vmx_segment_fields[] = { + VMX_SEGMENT_FIELD(CS), + VMX_SEGMENT_FIELD(DS), + VMX_SEGMENT_FIELD(ES), + VMX_SEGMENT_FIELD(FS), + VMX_SEGMENT_FIELD(GS), + VMX_SEGMENT_FIELD(SS), + VMX_SEGMENT_FIELD(TR), + VMX_SEGMENT_FIELD(LDTR), +}; + +static u64 host_efer; + +static void ept_save_pdptrs(struct kvm_vcpu *vcpu); + +/* + * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it + * away by decrementing the array size. + */ +static const u32 vmx_msr_index[] = { +#ifdef CONFIG_X86_64 + MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, +#endif + MSR_EFER, MSR_TSC_AUX, MSR_STAR, +}; +#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) + +static inline bool is_page_fault(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); +} + +static inline bool is_no_device(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); +} + +static inline bool is_invalid_opcode(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); +} + +static inline bool is_external_interrupt(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) + == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); +} + +static inline bool is_machine_check(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); +} + +static inline bool cpu_has_vmx_msr_bitmap(void) +{ + return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; +} + +static inline bool cpu_has_vmx_tpr_shadow(void) +{ + return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; +} + +static inline bool vm_need_tpr_shadow(struct kvm *kvm) +{ + return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)); +} + +static inline bool cpu_has_secondary_exec_ctrls(void) +{ + return vmcs_config.cpu_based_exec_ctrl & + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; +} + +static inline bool cpu_has_vmx_virtualize_apic_accesses(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; +} + +static inline bool cpu_has_vmx_flexpriority(void) +{ + return cpu_has_vmx_tpr_shadow() && + cpu_has_vmx_virtualize_apic_accesses(); +} + +static inline bool cpu_has_vmx_ept_execute_only(void) +{ + return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; +} + +static inline bool cpu_has_vmx_eptp_uncacheable(void) +{ + return vmx_capability.ept & VMX_EPTP_UC_BIT; +} + +static inline bool cpu_has_vmx_eptp_writeback(void) +{ + return vmx_capability.ept & VMX_EPTP_WB_BIT; +} + +static inline bool cpu_has_vmx_ept_2m_page(void) +{ + return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; +} + +static inline bool cpu_has_vmx_ept_1g_page(void) +{ + return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; +} + +static inline bool cpu_has_vmx_ept_4levels(void) +{ + return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; +} + +static inline bool cpu_has_vmx_invept_individual_addr(void) +{ + return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; +} + +static inline bool cpu_has_vmx_invept_context(void) +{ + return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; +} + +static inline bool cpu_has_vmx_invept_global(void) +{ + return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; +} + +static inline bool cpu_has_vmx_invvpid_single(void) +{ + return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT; +} + +static inline bool cpu_has_vmx_invvpid_global(void) +{ + return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; +} + +static inline bool cpu_has_vmx_ept(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_EPT; +} + +static inline bool cpu_has_vmx_unrestricted_guest(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_UNRESTRICTED_GUEST; +} + +static inline bool cpu_has_vmx_ple(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_PAUSE_LOOP_EXITING; +} + +static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm) +{ + return flexpriority_enabled && irqchip_in_kernel(kvm); +} + +static inline bool cpu_has_vmx_vpid(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_VPID; +} + +static inline bool cpu_has_vmx_rdtscp(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_RDTSCP; +} + +static inline bool cpu_has_virtual_nmis(void) +{ + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; +} + +static inline bool cpu_has_vmx_wbinvd_exit(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_WBINVD_EXITING; +} + +static inline bool report_flexpriority(void) +{ + return flexpriority_enabled; +} + +static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) +{ + int i; + + for (i = 0; i < vmx->nmsrs; ++i) + if (vmx_msr_index[vmx->guest_msrs[i].index] == msr) + return i; + return -1; +} + +static inline void __invvpid(int ext, u16 vpid, gva_t gva) +{ + struct { + u64 vpid : 16; + u64 rsvd : 48; + u64 gva; + } operand = { vpid, 0, gva }; + + asm volatile (__ex(ASM_VMX_INVVPID) + /* CF==1 or ZF==1 --> rc = -1 */ + "; ja 1f ; ud2 ; 1:" + : : "a"(&operand), "c"(ext) : "cc", "memory"); +} + +static inline void __invept(int ext, u64 eptp, gpa_t gpa) +{ + struct { + u64 eptp, gpa; + } operand = {eptp, gpa}; + + asm volatile (__ex(ASM_VMX_INVEPT) + /* CF==1 or ZF==1 --> rc = -1 */ + "; ja 1f ; ud2 ; 1:\n" + : : "a" (&operand), "c" (ext) : "cc", "memory"); +} + +static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) +{ + int i; + + i = __find_msr_index(vmx, msr); + if (i >= 0) + return &vmx->guest_msrs[i]; + return NULL; +} + +static void vmcs_clear(struct vmcs *vmcs) +{ + u64 phys_addr = __pa(vmcs); + u8 error; + + asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" + : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) + : "cc", "memory"); + if (error) + printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", + vmcs, phys_addr); +} + +static void vmcs_load(struct vmcs *vmcs) +{ + u64 phys_addr = __pa(vmcs); + u8 error; + + asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" + : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) + : "cc", "memory"); + if (error) + printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", + vmcs, phys_addr); +} + +static void __vcpu_clear(void *arg) +{ + struct vcpu_vmx *vmx = arg; + int cpu = raw_smp_processor_id(); + + if (vmx->vcpu.cpu == cpu) + vmcs_clear(vmx->vmcs); + if (per_cpu(current_vmcs, cpu) == vmx->vmcs) + per_cpu(current_vmcs, cpu) = NULL; + list_del(&vmx->local_vcpus_link); + vmx->vcpu.cpu = -1; + vmx->launched = 0; +} + +static void vcpu_clear(struct vcpu_vmx *vmx) +{ + if (vmx->vcpu.cpu == -1) + return; + smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); +} + +static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) +{ + if (vmx->vpid == 0) + return; + + if (cpu_has_vmx_invvpid_single()) + __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); +} + +static inline void vpid_sync_vcpu_global(void) +{ + if (cpu_has_vmx_invvpid_global()) + __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); +} + +static inline void vpid_sync_context(struct vcpu_vmx *vmx) +{ + if (cpu_has_vmx_invvpid_single()) + vpid_sync_vcpu_single(vmx); + else + vpid_sync_vcpu_global(); +} + +static inline void ept_sync_global(void) +{ + if (cpu_has_vmx_invept_global()) + __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); +} + +static inline void ept_sync_context(u64 eptp) +{ + if (enable_ept) { + if (cpu_has_vmx_invept_context()) + __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); + else + ept_sync_global(); + } +} + +static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) +{ + if (enable_ept) { + if (cpu_has_vmx_invept_individual_addr()) + __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR, + eptp, gpa); + else + ept_sync_context(eptp); + } +} + +static unsigned long vmcs_readl(unsigned long field) +{ + unsigned long value = 0; + + asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) + : "+a"(value) : "d"(field) : "cc"); + return value; +} + +static u16 vmcs_read16(unsigned long field) +{ + return vmcs_readl(field); +} + +static u32 vmcs_read32(unsigned long field) +{ + return vmcs_readl(field); +} + +static u64 vmcs_read64(unsigned long field) +{ +#ifdef CONFIG_X86_64 + return vmcs_readl(field); +#else + return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32); +#endif +} + +static noinline void vmwrite_error(unsigned long field, unsigned long value) +{ + printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", + field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); + dump_stack(); +} + +static void vmcs_writel(unsigned long field, unsigned long value) +{ + u8 error; + + asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0" + : "=q"(error) : "a"(value), "d"(field) : "cc"); + if (unlikely(error)) + vmwrite_error(field, value); +} + +static void vmcs_write16(unsigned long field, u16 value) +{ + vmcs_writel(field, value); +} + +static void vmcs_write32(unsigned long field, u32 value) +{ + vmcs_writel(field, value); +} + +static void vmcs_write64(unsigned long field, u64 value) +{ + vmcs_writel(field, value); +#ifndef CONFIG_X86_64 + asm volatile (""); + vmcs_writel(field+1, value >> 32); +#endif +} + +static void vmcs_clear_bits(unsigned long field, u32 mask) +{ + vmcs_writel(field, vmcs_readl(field) & ~mask); +} + +static void vmcs_set_bits(unsigned long field, u32 mask) +{ + vmcs_writel(field, vmcs_readl(field) | mask); +} + +static void update_exception_bitmap(struct kvm_vcpu *vcpu) +{ + u32 eb; + + eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | + (1u << NM_VECTOR) | (1u << DB_VECTOR); + if ((vcpu->guest_debug & + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) + eb |= 1u << BP_VECTOR; + if (to_vmx(vcpu)->rmode.vm86_active) + eb = ~0; + if (enable_ept) + eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ + if (vcpu->fpu_active) + eb &= ~(1u << NM_VECTOR); + vmcs_write32(EXCEPTION_BITMAP, eb); +} + +static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) +{ + unsigned i; + struct msr_autoload *m = &vmx->msr_autoload; + + if (msr == MSR_EFER && cpu_has_load_ia32_efer) { + vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER); + vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER); + return; + } + + for (i = 0; i < m->nr; ++i) + if (m->guest[i].index == msr) + break; + + if (i == m->nr) + return; + --m->nr; + m->guest[i] = m->guest[m->nr]; + m->host[i] = m->host[m->nr]; + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); +} + +static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, + u64 guest_val, u64 host_val) +{ + unsigned i; + struct msr_autoload *m = &vmx->msr_autoload; + + if (msr == MSR_EFER && cpu_has_load_ia32_efer) { + vmcs_write64(GUEST_IA32_EFER, guest_val); + vmcs_write64(HOST_IA32_EFER, host_val); + vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER); + vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER); + return; + } + + for (i = 0; i < m->nr; ++i) + if (m->guest[i].index == msr) + break; + + if (i == m->nr) { + ++m->nr; + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); + } + + m->guest[i].index = msr; + m->guest[i].value = guest_val; + m->host[i].index = msr; + m->host[i].value = host_val; +} + +static void reload_tss(void) +{ + /* + * VT restores TR but not its size. Useless. + */ + struct kvm_desc_ptr *gdt = &__get_cpu_var(host_gdt); + struct kvm_desc_struct *descs; + + descs = (void *)gdt->address; + descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ + load_TR_desc(); +} + +static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) +{ + u64 guest_efer; + u64 ignore_bits; + + guest_efer = vmx->vcpu.arch.efer; + + /* + * NX is emulated; LMA and LME handled by hardware; SCE meaninless + * outside long mode + */ + ignore_bits = EFER_NX | EFER_SCE; +#ifdef CONFIG_X86_64 + ignore_bits |= EFER_LMA | EFER_LME; + /* SCE is meaningful only in long mode on Intel */ + if (guest_efer & EFER_LMA) + ignore_bits &= ~(u64)EFER_SCE; +#endif + guest_efer &= ~ignore_bits; + guest_efer |= host_efer & ignore_bits; + vmx->guest_msrs[efer_offset].data = guest_efer; + vmx->guest_msrs[efer_offset].mask = ~ignore_bits; + + clear_atomic_switch_msr(vmx, MSR_EFER); + /* On ept, can't emulate nx, and must switch nx atomically */ + if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) { + guest_efer = vmx->vcpu.arch.efer; + if (!(guest_efer & EFER_LMA)) + guest_efer &= ~EFER_LME; + add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer); + return false; + } + + return true; +} + +static unsigned long segment_base(u16 selector) +{ + struct kvm_desc_ptr *gdt = &__get_cpu_var(host_gdt); + struct kvm_desc_struct *d; + unsigned long table_base; + unsigned long v; + + if (!(selector & ~3)) + return 0; + + table_base = gdt->address; + + if (selector & 4) { /* from ldt */ + u16 ldt_selector = kvm_read_ldt(); + + if (!(ldt_selector & ~3)) + return 0; + + table_base = segment_base(ldt_selector); + } + d = (struct kvm_desc_struct *)(table_base + (selector & ~7)); + v = kvm_get_desc_base(d); +#ifdef CONFIG_X86_64 + if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) + v |= ((unsigned long)((struct kvm_ldttss_desc64 *)d)->base3) << 32; +#endif + return v; +} + +static inline unsigned long kvm_read_tr_base(void) +{ + u16 tr; + asm("str %0" : "=g"(tr)); + return segment_base(tr); +} + +static void vmx_save_host_state(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int i; + + if (vmx->host_state.loaded) + return; + + vmx->host_state.loaded = 1; + /* + * Set host fs and gs selectors. Unfortunately, 22.2.3 does not + * allow segment selectors with cpl > 0 or ti == 1. + */ + vmx->host_state.ldt_sel = kvm_read_ldt(); + vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; + savesegment(fs, vmx->host_state.fs_sel); + if (!(vmx->host_state.fs_sel & 7)) { + vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); + vmx->host_state.fs_reload_needed = 0; + } else { + vmcs_write16(HOST_FS_SELECTOR, 0); + vmx->host_state.fs_reload_needed = 1; + } + savesegment(gs, vmx->host_state.gs_sel); + if (!(vmx->host_state.gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); + else { + vmcs_write16(HOST_GS_SELECTOR, 0); + vmx->host_state.gs_ldt_reload_needed = 1; + } + +#ifdef CONFIG_X86_64 + vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); + vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); +#else + vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); + vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); +#endif + +#ifdef CONFIG_X86_64 + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + if (is_long_mode(&vmx->vcpu)) + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); +#endif + for (i = 0; i < vmx->save_nmsrs; ++i) + kvm_set_shared_msr(vmx->guest_msrs[i].index, + vmx->guest_msrs[i].data, + vmx->guest_msrs[i].mask); +} + +static void __vmx_load_host_state(struct vcpu_vmx *vmx) +{ + if (!vmx->host_state.loaded) + return; + + ++vmx->vcpu.stat.host_state_reload; + vmx->host_state.loaded = 0; +#ifdef CONFIG_X86_64 + if (is_long_mode(&vmx->vcpu)) + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); +#endif + if (vmx->host_state.gs_ldt_reload_needed) { + kvm_load_ldt(vmx->host_state.ldt_sel); +#ifdef CONFIG_X86_64 + load_gs_index(vmx->host_state.gs_sel); +#else + loadsegment(gs, vmx->host_state.gs_sel); +#endif + } + if (vmx->host_state.fs_reload_needed) + loadsegment(fs, vmx->host_state.fs_sel); + reload_tss(); +#ifdef CONFIG_X86_64 + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); +#endif + if (current_thread_info()->status & TS_USEDFPU) + clts(); + kvm_load_gdt(&__get_cpu_var(host_gdt)); +} + +static void vmx_load_host_state(struct vcpu_vmx *vmx) +{ + preempt_disable(); + __vmx_load_host_state(vmx); + preempt_enable(); +} + +/* + * Switches to specified vcpu, until a matching vcpu_put(), but assumes + * vcpu mutex is already taken. + */ +static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); + + if (!vmm_exclusive) + kvm_cpu_vmxon(phys_addr); + else if (vcpu->cpu != cpu) + vcpu_clear(vmx); + + if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { + per_cpu(current_vmcs, cpu) = vmx->vmcs; + vmcs_load(vmx->vmcs); + } + + if (vcpu->cpu != cpu) { + struct kvm_desc_ptr *gdt = &__get_cpu_var(host_gdt); + unsigned long sysenter_esp; + + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + local_irq_disable(); + list_add(&vmx->local_vcpus_link, + &per_cpu(vcpus_on_cpu, cpu)); + local_irq_enable(); + + /* + * Linux uses per-cpu TSS and GDT, so set these when switching + * processors. + */ + vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ + vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */ + + rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); + vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ + } +} + +static void vmx_vcpu_put(struct kvm_vcpu *vcpu) +{ + __vmx_load_host_state(to_vmx(vcpu)); + if (!vmm_exclusive) { + __vcpu_clear(to_vmx(vcpu)); + kvm_cpu_vmxoff(); + } +} + +static void vmx_fpu_activate(struct kvm_vcpu *vcpu) +{ + ulong cr0; + + if (vcpu->fpu_active) + return; + vcpu->fpu_active = 1; + cr0 = vmcs_readl(GUEST_CR0); + cr0 &= ~(X86_CR0_TS | X86_CR0_MP); + cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP); + vmcs_writel(GUEST_CR0, cr0); + update_exception_bitmap(vcpu); + vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); +} + +static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); + +static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) +{ + vmx_decache_cr0_guest_bits(vcpu); + vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); + update_exception_bitmap(vcpu); + vcpu->arch.cr0_guest_owned_bits = 0; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); + vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); +} + +static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) +{ + unsigned long rflags, save_rflags; + + rflags = vmcs_readl(GUEST_RFLAGS); + if (to_vmx(vcpu)->rmode.vm86_active) { + rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + save_rflags = to_vmx(vcpu)->rmode.save_rflags; + rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; + } + return rflags; +} + +static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + if (to_vmx(vcpu)->rmode.vm86_active) { + to_vmx(vcpu)->rmode.save_rflags = rflags; + rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; + } + vmcs_writel(GUEST_RFLAGS, rflags); +} + +static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +{ + u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + int ret = 0; + + if (interruptibility & GUEST_INTR_STATE_STI) + ret |= KVM_X86_SHADOW_INT_STI; + if (interruptibility & GUEST_INTR_STATE_MOV_SS) + ret |= KVM_X86_SHADOW_INT_MOV_SS; + + return ret & mask; +} + +static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +{ + u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + u32 interruptibility = interruptibility_old; + + interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); + + if (mask & KVM_X86_SHADOW_INT_MOV_SS) + interruptibility |= GUEST_INTR_STATE_MOV_SS; + else if (mask & KVM_X86_SHADOW_INT_STI) + interruptibility |= GUEST_INTR_STATE_STI; + + if ((interruptibility != interruptibility_old)) + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); +} + +static void skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + unsigned long rip; + + rip = kvm_rip_read(vcpu); + rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + kvm_rip_write(vcpu, rip); + + /* skipping an emulated instruction also counts */ + vmx_set_interrupt_shadow(vcpu, 0); +} + +static void vmx_clear_hlt(struct kvm_vcpu *vcpu) +{ + /* Ensure that we clear the HLT state in the VMCS. We don't need to + * explicitly skip the instruction because if the HLT state is set, then + * the instruction is already executing and RIP has already been + * advanced. */ + if (!yield_on_hlt && + vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) + vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); +} + +static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, + bool has_error_code, u32 error_code, + bool reinject) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 intr_info = nr | INTR_INFO_VALID_MASK; + + if (has_error_code) { + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); + intr_info |= INTR_INFO_DELIVER_CODE_MASK; + } + + if (vmx->rmode.vm86_active) { + if (kvm_inject_realmode_interrupt(vcpu, nr) != EMULATE_DONE) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + return; + } + + if (kvm_exception_is_soft(nr)) { + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmx->vcpu.arch.event_exit_inst_len); + intr_info |= INTR_TYPE_SOFT_EXCEPTION; + } else + intr_info |= INTR_TYPE_HARD_EXCEPTION; + + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); + vmx_clear_hlt(vcpu); +} + +static bool vmx_rdtscp_supported(void) +{ + return cpu_has_vmx_rdtscp(); +} + +/* + * Swap MSR entry in host/guest MSR entry array. + */ +static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) +{ + struct shared_msr_entry tmp; + + tmp = vmx->guest_msrs[to]; + vmx->guest_msrs[to] = vmx->guest_msrs[from]; + vmx->guest_msrs[from] = tmp; +} + +/* + * Set up the vmcs to automatically save and restore system + * msrs. Don't touch the 64-bit msrs if the guest is in legacy + * mode, as fiddling with msrs is very expensive. + */ +static void setup_msrs(struct vcpu_vmx *vmx) +{ + int save_nmsrs, index; + unsigned long *msr_bitmap; + + vmx_load_host_state(vmx); + save_nmsrs = 0; +#ifdef CONFIG_X86_64 + if (is_long_mode(&vmx->vcpu)) { + index = __find_msr_index(vmx, MSR_SYSCALL_MASK); + if (index >= 0) + move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_LSTAR); + if (index >= 0) + move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_CSTAR); + if (index >= 0) + move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_TSC_AUX); + if (index >= 0 && vmx->rdtscp_enabled) + move_msr_up(vmx, index, save_nmsrs++); + /* + * MSR_STAR is only needed on long mode guests, and only + * if efer.sce is enabled. + */ + index = __find_msr_index(vmx, MSR_STAR); + if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) + move_msr_up(vmx, index, save_nmsrs++); + } +#endif + index = __find_msr_index(vmx, MSR_EFER); + if (index >= 0 && update_transition_efer(vmx, index)) + move_msr_up(vmx, index, save_nmsrs++); + + vmx->save_nmsrs = save_nmsrs; + + if (cpu_has_vmx_msr_bitmap()) { + if (is_long_mode(&vmx->vcpu)) + msr_bitmap = vmx_msr_bitmap_longmode; + else + msr_bitmap = vmx_msr_bitmap_legacy; + + vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); + } +} + +/* + * reads and returns guest's timestamp counter "register" + * guest_tsc = host_tsc + tsc_offset -- 21.3 + */ +static u64 guest_read_tsc(void) +{ + u64 host_tsc, tsc_offset; + + rdtscll(host_tsc); + tsc_offset = vmcs_read64(TSC_OFFSET); + return host_tsc + tsc_offset; +} + +/* + * writes 'offset' into guest's timestamp counter offset register + */ +static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +{ + vmcs_write64(TSC_OFFSET, offset); +} + +static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) +{ + u64 offset = vmcs_read64(TSC_OFFSET); + vmcs_write64(TSC_OFFSET, offset + adjustment); +} + +/* + * Reads an msr value (of 'msr_index') into 'pdata'. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +{ + u64 data; + struct shared_msr_entry *msr; + + if (!pdata) { + printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); + return -EINVAL; + } + + switch (msr_index) { +#ifdef CONFIG_X86_64 + case MSR_FS_BASE: + data = vmcs_readl(GUEST_FS_BASE); + break; + case MSR_GS_BASE: + data = vmcs_readl(GUEST_GS_BASE); + break; + case MSR_KERNEL_GS_BASE: + vmx_load_host_state(to_vmx(vcpu)); + data = to_vmx(vcpu)->msr_guest_kernel_gs_base; + break; +#endif + case MSR_EFER: + return kvm_get_msr_common(vcpu, msr_index, pdata); + case MSR_IA32_TSC: + data = guest_read_tsc(); + break; + case MSR_IA32_SYSENTER_CS: + data = vmcs_read32(GUEST_SYSENTER_CS); + break; + case MSR_IA32_SYSENTER_EIP: + data = vmcs_readl(GUEST_SYSENTER_EIP); + break; + case MSR_IA32_SYSENTER_ESP: + data = vmcs_readl(GUEST_SYSENTER_ESP); + break; + case MSR_TSC_AUX: + if (!to_vmx(vcpu)->rdtscp_enabled) + return 1; + /* Otherwise falls through */ + default: + vmx_load_host_state(to_vmx(vcpu)); + msr = find_msr_entry(to_vmx(vcpu), msr_index); + if (msr) { + vmx_load_host_state(to_vmx(vcpu)); + data = msr->data; + break; + } + return kvm_get_msr_common(vcpu, msr_index, pdata); + } + + *pdata = data; + return 0; +} + +/* + * Writes msr value into into the appropriate "register". + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct shared_msr_entry *msr; + int ret = 0; + + switch (msr_index) { + case MSR_EFER: + vmx_load_host_state(vmx); + ret = kvm_set_msr_common(vcpu, msr_index, data); + break; +#ifdef CONFIG_X86_64 + case MSR_FS_BASE: + vmcs_writel(GUEST_FS_BASE, data); + break; + case MSR_GS_BASE: + vmcs_writel(GUEST_GS_BASE, data); + break; + case MSR_KERNEL_GS_BASE: + vmx_load_host_state(vmx); + vmx->msr_guest_kernel_gs_base = data; + break; +#endif + case MSR_IA32_SYSENTER_CS: + vmcs_write32(GUEST_SYSENTER_CS, data); + break; + case MSR_IA32_SYSENTER_EIP: + vmcs_writel(GUEST_SYSENTER_EIP, data); + break; + case MSR_IA32_SYSENTER_ESP: + vmcs_writel(GUEST_SYSENTER_ESP, data); + break; + case MSR_IA32_TSC: + kvm_write_tsc(vcpu, data); + break; + case MSR_IA32_CR_PAT: + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + vmcs_write64(GUEST_IA32_PAT, data); + vcpu->arch.pat = data; + break; + } + ret = kvm_set_msr_common(vcpu, msr_index, data); + break; + case MSR_TSC_AUX: + if (!vmx->rdtscp_enabled) + return 1; + /* Check reserved bit, higher 32 bits should be zero */ + if ((data >> 32) != 0) + return 1; + /* Otherwise falls through */ + default: + msr = find_msr_entry(vmx, msr_index); + if (msr) { + vmx_load_host_state(vmx); + msr->data = data; + break; + } + ret = kvm_set_msr_common(vcpu, msr_index, data); + } + + return ret; +} + +static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) +{ + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + switch (reg) { + case VCPU_REGS_RSP: + vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); + break; + case VCPU_REGS_RIP: + vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); + break; + case VCPU_EXREG_PDPTR: + if (enable_ept) + ept_save_pdptrs(vcpu); + break; + default: + break; + } +} + +static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) +{ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]); + else + vmcs_writel(GUEST_DR7, vcpu->arch.dr7); + + update_exception_bitmap(vcpu); +} + +static __init int cpu_has_kvm_support(void) +{ + return cpu_has_vmx(); +} + +static __init int vmx_disabled_by_bios(void) +{ + u64 msr; + + rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); + if (msr & FEATURE_CONTROL_LOCKED) { + if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) + && kvm_tboot_enabled()) + return 1; + if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) + && !kvm_tboot_enabled()) { + printk(KERN_WARNING "kvm: disable TXT in the BIOS or " + " activate TXT before enabling KVM\n"); + return 1; + } + } + + return 0; + /* locked but not enabled */ +} + +static void kvm_cpu_vmxon(u64 addr) +{ + asm volatile (ASM_VMX_VMXON_RAX + : : "a"(&addr), "m"(addr) + : "memory", "cc"); +} + +static int hardware_enable(void *garbage) +{ + int cpu = raw_smp_processor_id(); + u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); + u64 old, test_bits; + + if (read_cr4() & X86_CR4_VMXE) + return -EBUSY; + + INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); + rdmsrl(MSR_IA32_FEATURE_CONTROL, old); + + test_bits = FEATURE_CONTROL_LOCKED; + test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + if (kvm_tboot_enabled()) + test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX; + + if ((old & test_bits) != test_bits) { + /* enable and lock */ + wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); + } + write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ + + if (vmm_exclusive) { + kvm_cpu_vmxon(phys_addr); + ept_sync_global(); + } + + kvm_store_gdt(&__get_cpu_var(host_gdt)); + + return 0; +} + +static void vmclear_local_vcpus(void) +{ + int cpu = raw_smp_processor_id(); + struct vcpu_vmx *vmx, *n; + + list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu), + local_vcpus_link) + __vcpu_clear(vmx); +} + + +/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() + * tricks. + */ +static void kvm_cpu_vmxoff(void) +{ + asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); +} + +static void hardware_disable(void *garbage) +{ + if (vmm_exclusive) { + vmclear_local_vcpus(); + kvm_cpu_vmxoff(); + } + write_cr4(read_cr4() & ~X86_CR4_VMXE); +} + +static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, + u32 msr, u32 *result) +{ + u32 vmx_msr_low, vmx_msr_high; + u32 ctl = ctl_min | ctl_opt; + + rdmsr(msr, vmx_msr_low, vmx_msr_high); + + ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ + ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ + + /* Ensure minimum (required) set of control bits are supported. */ + if (ctl_min & ~ctl) + return -EIO; + + *result = ctl; + return 0; +} + +static __init bool allow_1_setting(u32 msr, u32 ctl) +{ + u32 vmx_msr_low, vmx_msr_high; + + rdmsr(msr, vmx_msr_low, vmx_msr_high); + return vmx_msr_high & ctl; +} + +static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) +{ + u32 vmx_msr_low, vmx_msr_high; + u32 min, opt, min2, opt2; + u32 _pin_based_exec_control = 0; + u32 _cpu_based_exec_control = 0; + u32 _cpu_based_2nd_exec_control = 0; + u32 _vmexit_control = 0; + u32 _vmentry_control = 0; + + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; + opt = PIN_BASED_VIRTUAL_NMIS; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, + &_pin_based_exec_control) < 0) + return -EIO; + + min = +#ifdef CONFIG_X86_64 + CPU_BASED_CR8_LOAD_EXITING | + CPU_BASED_CR8_STORE_EXITING | +#endif + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_USE_IO_BITMAPS | + CPU_BASED_MOV_DR_EXITING | + CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_MWAIT_EXITING | + CPU_BASED_MONITOR_EXITING | + CPU_BASED_INVLPG_EXITING; + + if (yield_on_hlt) + min |= CPU_BASED_HLT_EXITING; + + opt = CPU_BASED_TPR_SHADOW | + CPU_BASED_USE_MSR_BITMAPS | + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, + &_cpu_based_exec_control) < 0) + return -EIO; +#ifdef CONFIG_X86_64 + if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) + _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & + ~CPU_BASED_CR8_STORE_EXITING; +#endif + if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { + min2 = 0; + opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_WBINVD_EXITING | + SECONDARY_EXEC_ENABLE_VPID | + SECONDARY_EXEC_ENABLE_EPT | + SECONDARY_EXEC_UNRESTRICTED_GUEST | + SECONDARY_EXEC_PAUSE_LOOP_EXITING | + SECONDARY_EXEC_RDTSCP; + if (adjust_vmx_controls(min2, opt2, + MSR_IA32_VMX_PROCBASED_CTLS2, + &_cpu_based_2nd_exec_control) < 0) + return -EIO; + } +#ifndef CONFIG_X86_64 + if (!(_cpu_based_2nd_exec_control & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; +#endif + if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { + /* CR3 accesses and invlpg don't need to cause VM Exits when EPT + enabled */ + _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_INVLPG_EXITING); + rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, + vmx_capability.ept, vmx_capability.vpid); + } + + min = 0; +#ifdef CONFIG_X86_64 + min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; +#endif + opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, + &_vmexit_control) < 0) + return -EIO; + + min = 0; + opt = VM_ENTRY_LOAD_IA32_PAT; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, + &_vmentry_control) < 0) + return -EIO; + + rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); + + /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ + if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) + return -EIO; + +#ifdef CONFIG_X86_64 + /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ + if (vmx_msr_high & (1u<<16)) + return -EIO; +#endif + + /* Require Write-Back (WB) memory type for VMCS accesses. */ + if (((vmx_msr_high >> 18) & 15) != 6) + return -EIO; + + vmcs_conf->size = vmx_msr_high & 0x1fff; + vmcs_conf->order = get_order(vmcs_config.size); + vmcs_conf->revision_id = vmx_msr_low; + + vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; + vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; + vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; + vmcs_conf->vmexit_ctrl = _vmexit_control; + vmcs_conf->vmentry_ctrl = _vmentry_control; + + cpu_has_load_ia32_efer = + allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, + VM_ENTRY_LOAD_IA32_EFER) + && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, + VM_EXIT_LOAD_IA32_EFER); + + return 0; +} + +static struct vmcs *alloc_vmcs_cpu(int cpu) +{ + int node = cpu_to_node(cpu); + struct page *pages; + struct vmcs *vmcs; + + pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order); + if (!pages) + return NULL; + vmcs = page_address(pages); + memset(vmcs, 0, vmcs_config.size); + vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */ + return vmcs; +} + +static struct vmcs *alloc_vmcs(void) +{ + return alloc_vmcs_cpu(raw_smp_processor_id()); +} + +static void free_vmcs(struct vmcs *vmcs) +{ + free_pages((unsigned long)vmcs, vmcs_config.order); +} + +static void free_kvm_area(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + free_vmcs(per_cpu(vmxarea, cpu)); + per_cpu(vmxarea, cpu) = NULL; + } +} + +static __init int alloc_kvm_area(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct vmcs *vmcs; + + vmcs = alloc_vmcs_cpu(cpu); + if (!vmcs) { + free_kvm_area(); + return -ENOMEM; + } + + per_cpu(vmxarea, cpu) = vmcs; + } + return 0; +} + +static __init int hardware_setup(void) +{ + if (setup_vmcs_config(&vmcs_config) < 0) + return -EIO; + + if (boot_cpu_has(X86_FEATURE_NX)) + kvm_enable_efer_bits(EFER_NX); + + if (!cpu_has_vmx_vpid()) + enable_vpid = 0; + + if (!cpu_has_vmx_ept() || + !cpu_has_vmx_ept_4levels()) { + enable_ept = 0; + enable_unrestricted_guest = 0; + } + + if (!cpu_has_vmx_unrestricted_guest()) + enable_unrestricted_guest = 0; + + if (!cpu_has_vmx_flexpriority()) + flexpriority_enabled = 0; + + if (!cpu_has_vmx_tpr_shadow()) + kvm_x86_ops->update_cr8_intercept = NULL; + + if (enable_ept && !cpu_has_vmx_ept_2m_page()) + kvm_disable_largepages(); + + if (!cpu_has_vmx_ple()) + ple_gap = 0; + + return alloc_kvm_area(); +} + +static __exit void hardware_unsetup(void) +{ + free_kvm_area(); +} + +static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + + if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) { + vmcs_write16(sf->selector, save->selector); + vmcs_writel(sf->base, save->base); + vmcs_write32(sf->limit, save->limit); + vmcs_write32(sf->ar_bytes, save->ar); + } else { + u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK) + << AR_DPL_SHIFT; + vmcs_write32(sf->ar_bytes, 0x93 | dpl); + } +} + +static void enter_pmode(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + struct vcpu_vmx *vmx = to_vmx(vcpu); + + vmx->emulation_required = 1; + vmx->rmode.vm86_active = 0; + + vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); + vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); + vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); + + flags = vmcs_readl(GUEST_RFLAGS); + flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; + vmcs_writel(GUEST_RFLAGS, flags); + + vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | + (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); + + update_exception_bitmap(vcpu); + + if (emulate_invalid_guest_state) + return; + + fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es); + fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds); + fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); + fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); + + vmcs_write16(GUEST_SS_SELECTOR, 0); + vmcs_write32(GUEST_SS_AR_BYTES, 0x93); + + vmcs_write16(GUEST_CS_SELECTOR, + vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK); + vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); +} + +static gva_t rmode_tss_base(struct kvm *kvm) +{ + if (!kvm->arch.tss_addr) { + struct kvm_memslots *slots; + gfn_t base_gfn; + + slots = kvm_memslots(kvm); + base_gfn = slots->memslots[0].base_gfn + + kvm->memslots->memslots[0].npages - 3; + return base_gfn << PAGE_SHIFT; + } + return kvm->arch.tss_addr; +} + +static void fix_rmode_seg(int seg, struct kvm_save_segment *save) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + + save->selector = vmcs_read16(sf->selector); + save->base = vmcs_readl(sf->base); + save->limit = vmcs_read32(sf->limit); + save->ar = vmcs_read32(sf->ar_bytes); + vmcs_write16(sf->selector, save->base >> 4); + vmcs_write32(sf->base, save->base & 0xffff0); + vmcs_write32(sf->limit, 0xffff); + vmcs_write32(sf->ar_bytes, 0xf3); + if (save->base & 0xf) + printk_once(KERN_WARNING "kvm: segment base is not paragraph" + " aligned when entering protected mode (seg=%d)", + seg); +} + +static void enter_rmode(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (enable_unrestricted_guest) + return; + + vmx->emulation_required = 1; + vmx->rmode.vm86_active = 1; + + vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); + vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); + + vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); + vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); + + vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); + vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); + + flags = vmcs_readl(GUEST_RFLAGS); + vmx->rmode.save_rflags = flags; + + flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; + + vmcs_writel(GUEST_RFLAGS, flags); + vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); + update_exception_bitmap(vcpu); + + if (emulate_invalid_guest_state) + goto continue_rmode; + + vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); + vmcs_write32(GUEST_SS_LIMIT, 0xffff); + vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); + + vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); + vmcs_write32(GUEST_CS_LIMIT, 0xffff); + if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) + vmcs_writel(GUEST_CS_BASE, 0xf0000); + vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); + + fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); + fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); + fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); + fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); + +continue_rmode: + kvm_mmu_reset_context(vcpu); + init_rmode(vcpu->kvm); +} + +static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); + + if (!msr) + return; + + /* + * Force kernel_gs_base reloading before EFER changes, as control + * of this msr depends on is_long_mode(). + */ + vmx_load_host_state(to_vmx(vcpu)); + vcpu->arch.efer = efer; + if (efer & EFER_LMA) { + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) | + VM_ENTRY_IA32E_MODE); + msr->data = efer; + } else { + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) & + ~VM_ENTRY_IA32E_MODE); + + msr->data = efer & ~EFER_LME; + } + setup_msrs(vmx); +} + +#ifdef CONFIG_X86_64 + +static void enter_lmode(struct kvm_vcpu *vcpu) +{ + u32 guest_tr_ar; + + guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); + if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { + printk(KERN_DEBUG "%s: tss fixup for long mode. \n", + __func__); + vmcs_write32(GUEST_TR_AR_BYTES, + (guest_tr_ar & ~AR_TYPE_MASK) + | AR_TYPE_BUSY_64_TSS); + } + vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); +} + +static void exit_lmode(struct kvm_vcpu *vcpu) +{ + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) + & ~VM_ENTRY_IA32E_MODE); + vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); +} + +#endif + +static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +{ + vpid_sync_context(to_vmx(vcpu)); + if (enable_ept) { + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); + } +} + +static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) +{ + ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; + + vcpu->arch.cr0 &= ~cr0_guest_owned_bits; + vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; +} + +static void vmx_decache_cr3(struct kvm_vcpu *vcpu) +{ + if (enable_ept && is_paging(vcpu)) + vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); +} + +static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) +{ + ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; + + vcpu->arch.cr4 &= ~cr4_guest_owned_bits; + vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; +} + +static void ept_load_pdptrs(struct kvm_vcpu *vcpu) +{ + if (!test_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_dirty)) + return; + + if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { + vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]); + vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]); + vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]); + vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]); + } +} + +static void ept_save_pdptrs(struct kvm_vcpu *vcpu) +{ + if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { + vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); + vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); + vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); + vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); + } + + __set_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail); + __set_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_dirty); +} + +static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); + +static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, + unsigned long cr0, + struct kvm_vcpu *vcpu) +{ + vmx_decache_cr3(vcpu); + if (!(cr0 & X86_CR0_PG)) { + /* From paging/starting to nonpaging */ + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, + vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) | + (CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING)); + vcpu->arch.cr0 = cr0; + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); + } else if (!is_paging(vcpu)) { + /* From nonpaging to paging */ + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, + vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & + ~(CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING)); + vcpu->arch.cr0 = cr0; + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); + } + + if (!(cr0 & X86_CR0_WP)) + *hw_cr0 &= ~X86_CR0_WP; +} + +static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long hw_cr0; + + if (enable_unrestricted_guest) + hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST) + | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; + else + hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; + + if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) + enter_pmode(vcpu); + + if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) + enter_rmode(vcpu); + +#ifdef CONFIG_X86_64 + if (vcpu->arch.efer & EFER_LME) { + if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) + enter_lmode(vcpu); + if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) + exit_lmode(vcpu); + } +#endif + + if (enable_ept) + ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); + + if (!vcpu->fpu_active) + hw_cr0 |= X86_CR0_TS | X86_CR0_MP; + + vmcs_writel(CR0_READ_SHADOW, cr0); + vmcs_writel(GUEST_CR0, hw_cr0); + vcpu->arch.cr0 = cr0; +} + +static u64 construct_eptp(unsigned long root_hpa) +{ + u64 eptp; + + /* TODO write the value reading from MSR */ + eptp = VMX_EPT_DEFAULT_MT | + VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; + eptp |= (root_hpa & PAGE_MASK); + + return eptp; +} + +static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + unsigned long guest_cr3; + u64 eptp; + + guest_cr3 = cr3; + if (enable_ept) { + eptp = construct_eptp(cr3); + vmcs_write64(EPT_POINTER, eptp); + guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) : + vcpu->kvm->arch.ept_identity_map_addr; + ept_load_pdptrs(vcpu); + } + + vmx_flush_tlb(vcpu); + vmcs_writel(GUEST_CR3, guest_cr3); +} + +static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? + KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); + + vcpu->arch.cr4 = cr4; + if (enable_ept) { + if (!is_paging(vcpu)) { + hw_cr4 &= ~X86_CR4_PAE; + hw_cr4 |= X86_CR4_PSE; + } else if (!(cr4 & X86_CR4_PAE)) { + hw_cr4 &= ~X86_CR4_PAE; + } + } + + vmcs_writel(CR4_READ_SHADOW, cr4); + vmcs_writel(GUEST_CR4, hw_cr4); +} + +static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + + return vmcs_readl(sf->base); +} + +static void vmx_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + u32 ar; + + var->base = vmcs_readl(sf->base); + var->limit = vmcs_read32(sf->limit); + var->selector = vmcs_read16(sf->selector); + ar = vmcs_read32(sf->ar_bytes); + if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) + ar = 0; + var->type = ar & 15; + var->s = (ar >> 4) & 1; + var->dpl = (ar >> 5) & 3; + var->present = (ar >> 7) & 1; + var->avl = (ar >> 12) & 1; + var->l = (ar >> 13) & 1; + var->db = (ar >> 14) & 1; + var->g = (ar >> 15) & 1; + var->unusable = (ar >> 16) & 1; +} + +static int vmx_get_cpl(struct kvm_vcpu *vcpu) +{ + if (!is_protmode(vcpu)) + return 0; + + if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ + return 3; + + return vmcs_read16(GUEST_CS_SELECTOR) & 3; +} + +static u32 vmx_segment_access_rights(struct kvm_segment *var) +{ + u32 ar; + + if (var->unusable) + ar = 1 << 16; + else { + ar = var->type & 15; + ar |= (var->s & 1) << 4; + ar |= (var->dpl & 3) << 5; + ar |= (var->present & 1) << 7; + ar |= (var->avl & 1) << 12; + ar |= (var->l & 1) << 13; + ar |= (var->db & 1) << 14; + ar |= (var->g & 1) << 15; + } + if (ar == 0) /* a 0 value means unusable */ + ar = AR_UNUSABLE_MASK; + + return ar; +} + +static void vmx_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + u32 ar; + + if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { + vmx->rmode.tr.selector = var->selector; + vmx->rmode.tr.base = var->base; + vmx->rmode.tr.limit = var->limit; + vmx->rmode.tr.ar = vmx_segment_access_rights(var); + return; + } + vmcs_writel(sf->base, var->base); + vmcs_write32(sf->limit, var->limit); + vmcs_write16(sf->selector, var->selector); + if (vmx->rmode.vm86_active && var->s) { + /* + * Hack real-mode segments into vm86 compatibility. + */ + if (var->base == 0xffff0000 && var->selector == 0xf000) + vmcs_writel(sf->base, 0xf0000); + ar = 0xf3; + } else + ar = vmx_segment_access_rights(var); + + /* + * Fix the "Accessed" bit in AR field of segment registers for older + * qemu binaries. + * IA32 arch specifies that at the time of processor reset the + * "Accessed" bit in the AR field of segment registers is 1. And qemu + * is setting it to 0 in the usedland code. This causes invalid guest + * state vmexit when "unrestricted guest" mode is turned on. + * Fix for this setup issue in cpu_reset is being pushed in the qemu + * tree. Newer qemu binaries with that qemu fix would not need this + * kvm hack. + */ + if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) + ar |= 0x1; /* Accessed */ + + vmcs_write32(sf->ar_bytes, ar); +} + +static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +{ + u32 ar = vmcs_read32(GUEST_CS_AR_BYTES); + + *db = (ar >> 14) & 1; + *l = (ar >> 13) & 1; +} + +static void vmx_get_idt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt) +{ + dt->size = vmcs_read32(GUEST_IDTR_LIMIT); + dt->address = vmcs_readl(GUEST_IDTR_BASE); +} + +static void vmx_set_idt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt) +{ + vmcs_write32(GUEST_IDTR_LIMIT, dt->size); + vmcs_writel(GUEST_IDTR_BASE, dt->address); +} + +static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt) +{ + dt->size = vmcs_read32(GUEST_GDTR_LIMIT); + dt->address = vmcs_readl(GUEST_GDTR_BASE); +} + +static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt) +{ + vmcs_write32(GUEST_GDTR_LIMIT, dt->size); + vmcs_writel(GUEST_GDTR_BASE, dt->address); +} + +static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_segment var; + u32 ar; + + vmx_get_segment(vcpu, &var, seg); + ar = vmx_segment_access_rights(&var); + + if (var.base != (var.selector << 4)) + return false; + if (var.limit != 0xffff) + return false; + if (ar != 0xf3) + return false; + + return true; +} + +static bool code_segment_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs; + unsigned int cs_rpl; + + vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); + cs_rpl = cs.selector & SELECTOR_RPL_MASK; + + if (cs.unusable) + return false; + if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK)) + return false; + if (!cs.s) + return false; + if (cs.type & AR_TYPE_WRITEABLE_MASK) { + if (cs.dpl > cs_rpl) + return false; + } else { + if (cs.dpl != cs_rpl) + return false; + } + if (!cs.present) + return false; + + /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ + return true; +} + +static bool stack_segment_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment ss; + unsigned int ss_rpl; + + vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); + ss_rpl = ss.selector & SELECTOR_RPL_MASK; + + if (ss.unusable) + return true; + if (ss.type != 3 && ss.type != 7) + return false; + if (!ss.s) + return false; + if (ss.dpl != ss_rpl) /* DPL != RPL */ + return false; + if (!ss.present) + return false; + + return true; +} + +static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_segment var; + unsigned int rpl; + + vmx_get_segment(vcpu, &var, seg); + rpl = var.selector & SELECTOR_RPL_MASK; + + if (var.unusable) + return true; + if (!var.s) + return false; + if (!var.present) + return false; + if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) { + if (var.dpl < rpl) /* DPL < RPL */ + return false; + } + + /* TODO: Add other members to kvm_segment_field to allow checking for other access + * rights flags + */ + return true; +} + +static bool tr_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment tr; + + vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); + + if (tr.unusable) + return false; + if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */ + return false; + if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ + return false; + if (!tr.present) + return false; + + return true; +} + +static bool ldtr_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment ldtr; + + vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); + + if (ldtr.unusable) + return true; + if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */ + return false; + if (ldtr.type != 2) + return false; + if (!ldtr.present) + return false; + + return true; +} + +static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs, ss; + + vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); + vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); + + return ((cs.selector & SELECTOR_RPL_MASK) == + (ss.selector & SELECTOR_RPL_MASK)); +} + +/* + * Check if guest state is valid. Returns true if valid, false if + * not. + * We assume that registers are always usable + */ +static bool guest_state_valid(struct kvm_vcpu *vcpu) +{ + /* real mode guest state checks */ + if (!is_protmode(vcpu)) { + if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) + return false; + } else { + /* protected mode guest state checks */ + if (!cs_ss_rpl_check(vcpu)) + return false; + if (!code_segment_valid(vcpu)) + return false; + if (!stack_segment_valid(vcpu)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_DS)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_ES)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_FS)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_GS)) + return false; + if (!tr_valid(vcpu)) + return false; + if (!ldtr_valid(vcpu)) + return false; + } + /* TODO: + * - Add checks on RIP + * - Add checks on RFLAGS + */ + + return true; +} + +static int init_rmode_tss(struct kvm *kvm) +{ + gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; + u16 data = 0; + int ret = 0; + int r; + + r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); + if (r < 0) + goto out; + data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; + r = kvm_write_guest_page(kvm, fn++, &data, + TSS_IOPB_BASE_OFFSET, sizeof(u16)); + if (r < 0) + goto out; + r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); + if (r < 0) + goto out; + r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); + if (r < 0) + goto out; + data = ~0; + r = kvm_write_guest_page(kvm, fn, &data, + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, + sizeof(u8)); + if (r < 0) + goto out; + + ret = 1; +out: + return ret; +} + +static int init_rmode_identity_map(struct kvm *kvm) +{ + int i, r, ret; + pfn_t identity_map_pfn; + u32 tmp; + + if (!enable_ept) + return 1; + if (unlikely(!kvm->arch.ept_identity_pagetable)) { + printk(KERN_ERR "EPT: identity-mapping pagetable " + "haven't been allocated!\n"); + return 0; + } + if (likely(kvm->arch.ept_identity_pagetable_done)) + return 1; + ret = 0; + identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; + r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); + if (r < 0) + goto out; + /* Set up identity-mapping pagetable for EPT in real mode */ + for (i = 0; i < PT32_ENT_PER_PAGE; i++) { + tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | + _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); + r = kvm_write_guest_page(kvm, identity_map_pfn, + &tmp, i * sizeof(tmp), sizeof(tmp)); + if (r < 0) + goto out; + } + kvm->arch.ept_identity_pagetable_done = true; + ret = 1; +out: + return ret; +} + +static void seg_setup(int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + unsigned int ar; + + vmcs_write16(sf->selector, 0); + vmcs_writel(sf->base, 0); + vmcs_write32(sf->limit, 0xffff); + if (enable_unrestricted_guest) { + ar = 0x93; + if (seg == VCPU_SREG_CS) + ar |= 0x08; /* code segment */ + } else + ar = 0xf3; + + vmcs_write32(sf->ar_bytes, ar); +} + +static int alloc_apic_access_page(struct kvm *kvm) +{ + struct kvm_userspace_memory_region kvm_userspace_mem; + int r = 0; + + mutex_lock(&kvm->slots_lock); + if (kvm->arch.apic_access_page) + goto out; + kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; + kvm_userspace_mem.flags = 0; + kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; + kvm_userspace_mem.memory_size = PAGE_SIZE; + r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); + if (r) + goto out; + + kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); +out: + mutex_unlock(&kvm->slots_lock); + return r; +} + +static int alloc_identity_pagetable(struct kvm *kvm) +{ + struct kvm_userspace_memory_region kvm_userspace_mem; + int r = 0; + + mutex_lock(&kvm->slots_lock); + if (kvm->arch.ept_identity_pagetable) + goto out; + kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; + kvm_userspace_mem.flags = 0; + kvm_userspace_mem.guest_phys_addr = + kvm->arch.ept_identity_map_addr; + kvm_userspace_mem.memory_size = PAGE_SIZE; + r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); + if (r) + goto out; + + kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, + kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); +out: + mutex_unlock(&kvm->slots_lock); + return r; +} + +static void allocate_vpid(struct vcpu_vmx *vmx) +{ + int vpid; + + vmx->vpid = 0; + if (!enable_vpid) + return; + spin_lock(&vmx_vpid_lock); + vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); + if (vpid < VMX_NR_VPIDS) { + vmx->vpid = vpid; + __set_bit(vpid, vmx_vpid_bitmap); + } + spin_unlock(&vmx_vpid_lock); +} + +static void free_vpid(struct vcpu_vmx *vmx) +{ + if (!enable_vpid) + return; + spin_lock(&vmx_vpid_lock); + if (vmx->vpid != 0) + __clear_bit(vmx->vpid, vmx_vpid_bitmap); + spin_unlock(&vmx_vpid_lock); +} + +static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) +{ + int f = sizeof(unsigned long); + + if (!cpu_has_vmx_msr_bitmap()) + return; + + /* + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals + * have the write-low and read-high bitmap offsets the wrong way round. + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. + */ + if (msr <= 0x1fff) { + __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */ + __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */ + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */ + __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */ + } +} + +static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) +{ + if (!longmode_only) + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr); +} + +/* + * Sets up the vmcs for emulated real mode. + */ +static int vmx_vcpu_setup(struct vcpu_vmx *vmx) +{ + u32 host_sysenter_cs, msr_low, msr_high; + u32 junk; + u64 host_pat; + unsigned long a; + struct kvm_desc_ptr dt; + int i; + unsigned long kvm_vmx_return; + u32 exec_control; + + /* I/O */ + vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); + vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b)); + + if (cpu_has_vmx_msr_bitmap()) + vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); + + vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ + + /* Control */ + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, + vmcs_config.pin_based_exec_ctrl); + + exec_control = vmcs_config.cpu_based_exec_ctrl; + if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { + exec_control &= ~CPU_BASED_TPR_SHADOW; +#ifdef CONFIG_X86_64 + exec_control |= CPU_BASED_CR8_STORE_EXITING | + CPU_BASED_CR8_LOAD_EXITING; +#endif + } + if (!enable_ept) + exec_control |= CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_INVLPG_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); + + if (cpu_has_secondary_exec_ctrls()) { + exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; + if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) + exec_control &= + ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + if (vmx->vpid == 0) + exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; + if (!enable_ept) { + exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; + enable_unrestricted_guest = 0; + } + if (!enable_unrestricted_guest) + exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; + if (!ple_gap) + exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + } + + if (ple_gap) { + vmcs_write32(PLE_GAP, ple_gap); + vmcs_write32(PLE_WINDOW, ple_window); + } + + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); + vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ + + vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */ + vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ + vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ + + vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ + vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ + vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ + vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ + vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ + vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ +#ifdef CONFIG_X86_64 + rdmsrl(MSR_FS_BASE, a); + vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ + rdmsrl(MSR_GS_BASE, a); + vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ +#else + vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ + vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ +#endif + + vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ + + kvm_native_store_idt(&dt); + vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ + + asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); + vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); + + rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); + vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); + rdmsrl(MSR_IA32_SYSENTER_ESP, a); + vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */ + rdmsrl(MSR_IA32_SYSENTER_EIP, a); + vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ + + if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { + rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); + host_pat = msr_low | ((u64) msr_high << 32); + vmcs_write64(HOST_IA32_PAT, host_pat); + } + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); + host_pat = msr_low | ((u64) msr_high << 32); + /* Write the default value follow host pat */ + vmcs_write64(GUEST_IA32_PAT, host_pat); + /* Keep arch.pat sync with GUEST_IA32_PAT */ + vmx->vcpu.arch.pat = host_pat; + } + + for (i = 0; i < NR_VMX_MSR; ++i) { + u32 index = vmx_msr_index[i]; + u32 data_low, data_high; + int j = vmx->nmsrs; + + if (rdmsr_safe(index, &data_low, &data_high) < 0) + continue; + if (wrmsr_safe(index, data_low, data_high) < 0) + continue; + vmx->guest_msrs[j].index = i; + vmx->guest_msrs[j].data = 0; + vmx->guest_msrs[j].mask = -1ull; + ++vmx->nmsrs; + } + + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); + + /* 22.2.1, 20.8.1 */ + vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); + + vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); + vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; + if (enable_ept) + vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; + vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); + + kvm_write_tsc(&vmx->vcpu, 0); + + return 0; +} + +static int init_rmode(struct kvm *kvm) +{ + int idx, ret = 0; + + idx = srcu_read_lock(&kvm->srcu); + if (!init_rmode_tss(kvm)) + goto exit; + if (!init_rmode_identity_map(kvm)) + goto exit; + + ret = 1; +exit: + srcu_read_unlock(&kvm->srcu, idx); + return ret; +} + +static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 msr; + int ret; + + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); + if (!init_rmode(vmx->vcpu.kvm)) { + ret = -ENOMEM; + goto out; + } + + vmx->rmode.vm86_active = 0; + + vmx->soft_vnmi_blocked = 0; + + vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); + kvm_set_cr8(&vmx->vcpu, 0); + msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + if (kvm_vcpu_is_bsp(&vmx->vcpu)) + msr |= MSR_IA32_APICBASE_BSP; + kvm_set_apic_base(&vmx->vcpu, msr); + + ret = fx_init(&vmx->vcpu); + if (ret != 0) + goto out; + + seg_setup(VCPU_SREG_CS); + /* + * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode + * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. + */ + if (kvm_vcpu_is_bsp(&vmx->vcpu)) { + vmcs_write16(GUEST_CS_SELECTOR, 0xf000); + vmcs_writel(GUEST_CS_BASE, 0x000f0000); + } else { + vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); + vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); + } + + seg_setup(VCPU_SREG_DS); + seg_setup(VCPU_SREG_ES); + seg_setup(VCPU_SREG_FS); + seg_setup(VCPU_SREG_GS); + seg_setup(VCPU_SREG_SS); + + vmcs_write16(GUEST_TR_SELECTOR, 0); + vmcs_writel(GUEST_TR_BASE, 0); + vmcs_write32(GUEST_TR_LIMIT, 0xffff); + vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); + + vmcs_write16(GUEST_LDTR_SELECTOR, 0); + vmcs_writel(GUEST_LDTR_BASE, 0); + vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); + vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); + + vmcs_write32(GUEST_SYSENTER_CS, 0); + vmcs_writel(GUEST_SYSENTER_ESP, 0); + vmcs_writel(GUEST_SYSENTER_EIP, 0); + + vmcs_writel(GUEST_RFLAGS, 0x02); + if (kvm_vcpu_is_bsp(&vmx->vcpu)) + kvm_rip_write(vcpu, 0xfff0); + else + kvm_rip_write(vcpu, 0); + kvm_register_write(vcpu, VCPU_REGS_RSP, 0); + + vmcs_writel(GUEST_DR7, 0x400); + + vmcs_writel(GUEST_GDTR_BASE, 0); + vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); + + vmcs_writel(GUEST_IDTR_BASE, 0); + vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); + + vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); + vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); + + /* Special registers */ + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + setup_msrs(vmx); + + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ + + if (cpu_has_vmx_tpr_shadow()) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); + if (vm_need_tpr_shadow(vmx->vcpu.kvm)) + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, + page_to_phys(vmx->vcpu.arch.apic->regs_page)); + vmcs_write32(TPR_THRESHOLD, 0); + } + + if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) + vmcs_write64(APIC_ACCESS_ADDR, + page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); + + if (vmx->vpid != 0) + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + + vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; + vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */ + vmx_set_cr4(&vmx->vcpu, 0); + vmx_set_efer(&vmx->vcpu, 0); + vmx_fpu_activate(&vmx->vcpu); + update_exception_bitmap(&vmx->vcpu); + + vpid_sync_context(vmx); + + ret = 0; + + /* HACK: Don't enable emulation on guest boot/reset */ + vmx->emulation_required = 0; + +out: + return ret; +} + +static void enable_irq_window(struct kvm_vcpu *vcpu) +{ + u32 cpu_based_vm_exec_control; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +} + +static void enable_nmi_window(struct kvm_vcpu *vcpu) +{ + u32 cpu_based_vm_exec_control; + + if (!cpu_has_virtual_nmis()) { + enable_irq_window(vcpu); + return; + } + + if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { + enable_irq_window(vcpu); + return; + } + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +} + +static void vmx_inject_irq(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + uint32_t intr; + int irq = vcpu->arch.interrupt.nr; + + trace_kvm_inj_virq(irq); + + ++vcpu->stat.irq_injections; + if (vmx->rmode.vm86_active) { + if (kvm_inject_realmode_interrupt(vcpu, irq) != EMULATE_DONE) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + return; + } + intr = irq | INTR_INFO_VALID_MASK; + if (vcpu->arch.interrupt.soft) { + intr |= INTR_TYPE_SOFT_INTR; + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmx->vcpu.arch.event_exit_inst_len); + } else + intr |= INTR_TYPE_EXT_INTR; + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); + vmx_clear_hlt(vcpu); +} + +static void vmx_inject_nmi(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!cpu_has_virtual_nmis()) { + /* + * Tracking the NMI-blocked state in software is built upon + * finding the next open IRQ window. This, in turn, depends on + * well-behaving guests: They have to keep IRQs disabled at + * least as long as the NMI handler runs. Otherwise we may + * cause NMI nesting, maybe breaking the guest. But as this is + * highly unlikely, we can live with the residual risk. + */ + vmx->soft_vnmi_blocked = 1; + vmx->vnmi_blocked_time = 0; + } + + ++vcpu->stat.nmi_injections; + if (vmx->rmode.vm86_active) { + if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + return; + } + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); + vmx_clear_hlt(vcpu); +} + +static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) +{ + if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) + return 0; + + return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI + | GUEST_INTR_STATE_NMI)); +} + +static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) +{ + if (!cpu_has_virtual_nmis()) + return to_vmx(vcpu)->soft_vnmi_blocked; + return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; +} + +static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!cpu_has_virtual_nmis()) { + if (vmx->soft_vnmi_blocked != masked) { + vmx->soft_vnmi_blocked = masked; + vmx->vnmi_blocked_time = 0; + } + } else { + if (masked) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + } +} + +static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); +} + +static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) +{ + int ret; + struct kvm_userspace_memory_region tss_mem = { + .slot = TSS_PRIVATE_MEMSLOT, + .guest_phys_addr = addr, + .memory_size = PAGE_SIZE * 3, + .flags = 0, + }; + + ret = kvm_set_memory_region(kvm, &tss_mem, 0); + if (ret) + return ret; + kvm->arch.tss_addr = addr; + return 0; +} + +static int handle_rmode_exception(struct kvm_vcpu *vcpu, + int vec, u32 err_code) +{ + /* + * Instruction with address size override prefix opcode 0x67 + * Cause the #SS fault with 0 error code in VM86 mode. + */ + if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) + if (emulate_instruction(vcpu, 0) == EMULATE_DONE) + return 1; + /* + * Forward all other exceptions that are valid in real mode. + * FIXME: Breaks guest debugging in real mode, needs to be fixed with + * the required debugging infrastructure rework. + */ + switch (vec) { + case DB_VECTOR: + if (vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + return 0; + kvm_queue_exception(vcpu, vec); + return 1; + case BP_VECTOR: + /* + * Update instruction length as we may reinject the exception + * from user space while in guest debugging mode. + */ + to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + return 0; + /* fall through */ + case DE_VECTOR: + case OF_VECTOR: + case BR_VECTOR: + case UD_VECTOR: + case DF_VECTOR: + case SS_VECTOR: + case GP_VECTOR: + case MF_VECTOR: + kvm_queue_exception(vcpu, vec); + return 1; + } + return 0; +} + +/* + * Trigger machine check on the host. We assume all the MSRs are already set up + * by the CPU and that we still run on the same CPU as the MCE occurred on. + * We pass a fake environment to the machine check handler because we want + * the guest to be always treated like user space, no matter what context + * it used internally. + */ +static void kvm_machine_check(void) +{ +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) + struct pt_regs regs = { + .kvm_pt_regs_cs = 3, /* Fake ring 3 no matter what the guest ran on */ + .kvm_pt_regs_flags = X86_EFLAGS_IF, + }; + + kvm_do_machine_check(®s, 0); +#endif +} + +static int handle_machine_check(struct kvm_vcpu *vcpu) +{ + /* already handled by vcpu_run */ + return 1; +} + +static int handle_exception(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_run *kvm_run = vcpu->run; + u32 intr_info, ex_no, error_code; + unsigned long cr2, rip, dr6; + u32 vect_info; + enum emulation_result er; + + vect_info = vmx->idt_vectoring_info; + intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + if (is_machine_check(intr_info)) + return handle_machine_check(vcpu); + + if ((vect_info & VECTORING_INFO_VALID_MASK) && + !is_page_fault(intr_info)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = vect_info; + vcpu->run->internal.data[1] = intr_info; + return 0; + } + + if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) + return 1; /* already handled by vmx_vcpu_run() */ + + if (is_no_device(intr_info)) { + vmx_fpu_activate(vcpu); + return 1; + } + + if (is_invalid_opcode(intr_info)) { + er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); + if (er != EMULATE_DONE) + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + error_code = 0; + rip = kvm_rip_read(vcpu); + if (intr_info & INTR_INFO_DELIVER_CODE_MASK) + error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + if (is_page_fault(intr_info)) { + /* EPT won't cause page fault directly */ + if (enable_ept) + BUG(); + cr2 = vmcs_readl(EXIT_QUALIFICATION); + trace_kvm_page_fault(cr2, error_code); + + if (kvm_event_needs_reinjection(vcpu)) + kvm_mmu_unprotect_page_virt(vcpu, cr2); + return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); + } + + if (vmx->rmode.vm86_active && + handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, + error_code)) { + if (vcpu->arch.halt_request) { + vcpu->arch.halt_request = 0; + return kvm_emulate_halt(vcpu); + } + return 1; + } + + ex_no = intr_info & INTR_INFO_VECTOR_MASK; + switch (ex_no) { + case DB_VECTOR: + dr6 = vmcs_readl(EXIT_QUALIFICATION); + if (!(vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { + vcpu->arch.dr6 = dr6 | DR6_FIXED_1; + kvm_queue_exception(vcpu, DB_VECTOR); + return 1; + } + kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; + kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); + /* fall through */ + case BP_VECTOR: + /* + * Update instruction length as we may reinject #BP from + * user space while in guest debugging mode. Reading it for + * #DB as well causes no harm, it is not used in that case. + */ + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; + kvm_run->debug.arch.exception = ex_no; + break; + default: + kvm_run->exit_reason = KVM_EXIT_EXCEPTION; + kvm_run->ex.exception = ex_no; + kvm_run->ex.error_code = error_code; + break; + } + return 0; +} + +static int handle_external_interrupt(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.irq_exits; + return 1; +} + +static int handle_triple_fault(struct kvm_vcpu *vcpu) +{ + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + return 0; +} + +static int handle_io(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + int size, in, string; + unsigned port; + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + string = (exit_qualification & 16) != 0; + in = (exit_qualification & 8) != 0; + + ++vcpu->stat.io_exits; + + if (string || in) + return emulate_instruction(vcpu, 0) == EMULATE_DONE; + + port = exit_qualification >> 16; + size = (exit_qualification & 7) + 1; + skip_emulated_instruction(vcpu); + + return kvm_fast_pio_out(vcpu, size, port); +} + +static void +vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) +{ + /* + * Patch in the VMCALL instruction: + */ + hypercall[0] = 0x0f; + hypercall[1] = 0x01; + hypercall[2] = 0xc1; +} + +static int handle_cr(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification, val; + int cr; + int reg; + int err; + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + cr = exit_qualification & 15; + reg = (exit_qualification >> 8) & 15; + switch ((exit_qualification >> 4) & 3) { + case 0: /* mov to cr */ + val = kvm_register_read(vcpu, reg); + trace_kvm_cr_write(cr, val); + switch (cr) { + case 0: + err = kvm_set_cr0(vcpu, val); + kvm_complete_insn_gp(vcpu, err); + return 1; + case 3: + err = kvm_set_cr3(vcpu, val); + kvm_complete_insn_gp(vcpu, err); + return 1; + case 4: + err = kvm_set_cr4(vcpu, val); + kvm_complete_insn_gp(vcpu, err); + return 1; + case 8: { + u8 cr8_prev = kvm_get_cr8(vcpu); + u8 cr8 = kvm_register_read(vcpu, reg); + err = kvm_set_cr8(vcpu, cr8); + kvm_complete_insn_gp(vcpu, err); + if (irqchip_in_kernel(vcpu->kvm)) + return 1; + if (cr8_prev <= cr8) + return 1; + vcpu->run->exit_reason = KVM_EXIT_SET_TPR; + return 0; + } + }; + break; + case 2: /* clts */ + vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); + trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); + skip_emulated_instruction(vcpu); + vmx_fpu_activate(vcpu); + return 1; + case 1: /*mov from cr*/ + switch (cr) { + case 3: + val = kvm_read_cr3(vcpu); + kvm_register_write(vcpu, reg, val); + trace_kvm_cr_read(cr, val); + skip_emulated_instruction(vcpu); + return 1; + case 8: + val = kvm_get_cr8(vcpu); + kvm_register_write(vcpu, reg, val); + trace_kvm_cr_read(cr, val); + skip_emulated_instruction(vcpu); + return 1; + } + break; + case 3: /* lmsw */ + val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; + trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); + kvm_lmsw(vcpu, val); + + skip_emulated_instruction(vcpu); + return 1; + default: + break; + } + vcpu->run->exit_reason = 0; + pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", + (int)(exit_qualification >> 4) & 3, cr); + return 0; +} + +static int handle_dr(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + int dr, reg; + + /* Do not handle if the CPL > 0, will trigger GP on re-entry */ + if (!kvm_require_cpl(vcpu, 0)) + return 1; + dr = vmcs_readl(GUEST_DR7); + if (dr & DR7_GD) { + /* + * As the vm-exit takes precedence over the debug trap, we + * need to emulate the latter, either for the host or the + * guest debugging itself. + */ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { + vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; + vcpu->run->debug.arch.dr7 = dr; + vcpu->run->debug.arch.pc = + vmcs_readl(GUEST_CS_BASE) + + vmcs_readl(GUEST_RIP); + vcpu->run->debug.arch.exception = DB_VECTOR; + vcpu->run->exit_reason = KVM_EXIT_DEBUG; + return 0; + } else { + vcpu->arch.dr7 &= ~DR7_GD; + vcpu->arch.dr6 |= DR6_BD; + vmcs_writel(GUEST_DR7, vcpu->arch.dr7); + kvm_queue_exception(vcpu, DB_VECTOR); + return 1; + } + } + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + dr = exit_qualification & DEBUG_REG_ACCESS_NUM; + reg = DEBUG_REG_ACCESS_REG(exit_qualification); + if (exit_qualification & TYPE_MOV_FROM_DR) { + unsigned long val; + if (!kvm_get_dr(vcpu, dr, &val)) + kvm_register_write(vcpu, reg, val); + } else + kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]); + skip_emulated_instruction(vcpu); + return 1; +} + +static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) +{ + vmcs_writel(GUEST_DR7, val); +} + +static int handle_cpuid(struct kvm_vcpu *vcpu) +{ + kvm_emulate_cpuid(vcpu); + return 1; +} + +static int handle_rdmsr(struct kvm_vcpu *vcpu) +{ + u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; + u64 data; + + if (vmx_get_msr(vcpu, ecx, &data)) { + trace_kvm_msr_read_ex(ecx); + kvm_inject_gp(vcpu, 0); + return 1; + } + + trace_kvm_msr_read(ecx, data); + + /* FIXME: handling of bits 32:63 of rax, rdx */ + vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; + vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; + skip_emulated_instruction(vcpu); + return 1; +} + +static int handle_wrmsr(struct kvm_vcpu *vcpu) +{ + u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; + u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) + | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); + + if (vmx_set_msr(vcpu, ecx, data) != 0) { + trace_kvm_msr_write_ex(ecx, data); + kvm_inject_gp(vcpu, 0); + return 1; + } + + trace_kvm_msr_write(ecx, data); + skip_emulated_instruction(vcpu); + return 1; +} + +static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) +{ + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 1; +} + +static int handle_interrupt_window(struct kvm_vcpu *vcpu) +{ + u32 cpu_based_vm_exec_control; + + /* clear pending irq */ + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + + kvm_make_request(KVM_REQ_EVENT, vcpu); + + ++vcpu->stat.irq_window_exits; + + /* + * If the user space waits to inject interrupts, exit as soon as + * possible + */ + if (!irqchip_in_kernel(vcpu->kvm) && + vcpu->run->request_interrupt_window && + !kvm_cpu_has_interrupt(vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; + return 0; + } + return 1; +} + +static int handle_halt(struct kvm_vcpu *vcpu) +{ + skip_emulated_instruction(vcpu); + return kvm_emulate_halt(vcpu); +} + +static int handle_vmcall(struct kvm_vcpu *vcpu) +{ + skip_emulated_instruction(vcpu); + kvm_emulate_hypercall(vcpu); + return 1; +} + +static int handle_vmx_insn(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + +static int handle_invd(struct kvm_vcpu *vcpu) +{ + return emulate_instruction(vcpu, 0) == EMULATE_DONE; +} + +static int handle_invlpg(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + kvm_mmu_invlpg(vcpu, exit_qualification); + skip_emulated_instruction(vcpu); + return 1; +} + +static int handle_wbinvd(struct kvm_vcpu *vcpu) +{ + skip_emulated_instruction(vcpu); + kvm_emulate_wbinvd(vcpu); + return 1; +} + +static int handle_xsetbv(struct kvm_vcpu *vcpu) +{ + u64 new_bv = kvm_read_edx_eax(vcpu); + u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); + + if (kvm_set_xcr(vcpu, index, new_bv) == 0) + skip_emulated_instruction(vcpu); + return 1; +} + +static int handle_apic_access(struct kvm_vcpu *vcpu) +{ + return emulate_instruction(vcpu, 0) == EMULATE_DONE; +} + +static int handle_task_switch(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long exit_qualification; + bool has_error_code = false; + u32 error_code = 0; + u16 tss_selector; + int reason, type, idt_v; + + idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); + type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + reason = (u32)exit_qualification >> 30; + if (reason == TASK_SWITCH_GATE && idt_v) { + switch (type) { + case INTR_TYPE_NMI_INTR: + vcpu->arch.nmi_injected = false; + if (cpu_has_virtual_nmis()) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + break; + case INTR_TYPE_EXT_INTR: + case INTR_TYPE_SOFT_INTR: + kvm_clear_interrupt_queue(vcpu); + break; + case INTR_TYPE_HARD_EXCEPTION: + if (vmx->idt_vectoring_info & + VECTORING_INFO_DELIVER_CODE_MASK) { + has_error_code = true; + error_code = + vmcs_read32(IDT_VECTORING_ERROR_CODE); + } + /* fall through */ + case INTR_TYPE_SOFT_EXCEPTION: + kvm_clear_exception_queue(vcpu); + break; + default: + break; + } + } + tss_selector = exit_qualification; + + if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && + type != INTR_TYPE_EXT_INTR && + type != INTR_TYPE_NMI_INTR)) + skip_emulated_instruction(vcpu); + + if (kvm_task_switch(vcpu, tss_selector, reason, + has_error_code, error_code) == EMULATE_FAIL) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; + } + + /* clear all local breakpoint enable flags */ + vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55); + + /* + * TODO: What about debug traps on tss switch? + * Are we supposed to inject them and update dr6? + */ + + return 1; +} + +static int handle_ept_violation(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + gpa_t gpa; + int gla_validity; + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + if (exit_qualification & (1 << 6)) { + printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); + return -EINVAL; + } + + gla_validity = (exit_qualification >> 7) & 0x3; + if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { + printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); + printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", + (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), + vmcs_readl(GUEST_LINEAR_ADDRESS)); + printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", + (long unsigned int)exit_qualification); + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; + return 0; + } + + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + trace_kvm_page_fault(gpa, exit_qualification); + return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0); +} + +static u64 ept_rsvd_mask(u64 spte, int level) +{ + int i; + u64 mask = 0; + + for (i = 51; i > kvm_x86_phys_bits; i--) + mask |= (1ULL << i); + + if (level > 2) + /* bits 7:3 reserved */ + mask |= 0xf8; + else if (level == 2) { + if (spte & (1ULL << 7)) + /* 2MB ref, bits 20:12 reserved */ + mask |= 0x1ff000; + else + /* bits 6:3 reserved */ + mask |= 0x78; + } + + return mask; +} + +static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, + int level) +{ + printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level); + + /* 010b (write-only) */ + WARN_ON((spte & 0x7) == 0x2); + + /* 110b (write/execute) */ + WARN_ON((spte & 0x7) == 0x6); + + /* 100b (execute-only) and value not supported by logical processor */ + if (!cpu_has_vmx_ept_execute_only()) + WARN_ON((spte & 0x7) == 0x4); + + /* not 000b */ + if ((spte & 0x7)) { + u64 rsvd_bits = spte & ept_rsvd_mask(spte, level); + + if (rsvd_bits != 0) { + printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n", + __func__, rsvd_bits); + WARN_ON(1); + } + + if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) { + u64 ept_mem_type = (spte & 0x38) >> 3; + + if (ept_mem_type == 2 || ept_mem_type == 3 || + ept_mem_type == 7) { + printk(KERN_ERR "%s: ept_mem_type=0x%llx\n", + __func__, ept_mem_type); + WARN_ON(1); + } + } + } +} + +static int handle_ept_misconfig(struct kvm_vcpu *vcpu) +{ + u64 sptes[4]; + int nr_sptes, i; + gpa_t gpa; + + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + + printk(KERN_ERR "EPT: Misconfiguration.\n"); + printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); + + nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes); + + for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) + ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); + + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; + + return 0; +} + +static int handle_nmi_window(struct kvm_vcpu *vcpu) +{ + u32 cpu_based_vm_exec_control; + + /* clear pending NMI */ + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + ++vcpu->stat.nmi_window_exits; + kvm_make_request(KVM_REQ_EVENT, vcpu); + + return 1; +} + +static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + enum emulation_result err = EMULATE_DONE; + int ret = 1; + u32 cpu_exec_ctrl; + bool intr_window_requested; + + cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; + + while (!guest_state_valid(vcpu)) { + if (intr_window_requested + && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) + return handle_interrupt_window(&vmx->vcpu); + + err = emulate_instruction(vcpu, 0); + + if (err == EMULATE_DO_MMIO) { + ret = 0; + goto out; + } + + if (err != EMULATE_DONE) + return 0; + + if (signal_pending(current)) + goto out; + if (need_resched()) + schedule(); + } + + vmx->emulation_required = 0; +out: + return ret; +} + +/* + * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE + * exiting, so only get here on cpu with PAUSE-Loop-Exiting. + */ +static int handle_pause(struct kvm_vcpu *vcpu) +{ + skip_emulated_instruction(vcpu); + kvm_vcpu_on_spin(vcpu); + + return 1; +} + +static int handle_invalid_op(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + +/* + * The exit handlers return 1 if the exit was handled fully and guest execution + * may resume. Otherwise they set the kvm_run parameter to indicate what needs + * to be done to userspace and return 0. + */ +static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { + [EXIT_REASON_EXCEPTION_NMI] = handle_exception, + [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, + [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, + [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, + [EXIT_REASON_IO_INSTRUCTION] = handle_io, + [EXIT_REASON_CR_ACCESS] = handle_cr, + [EXIT_REASON_DR_ACCESS] = handle_dr, + [EXIT_REASON_CPUID] = handle_cpuid, + [EXIT_REASON_MSR_READ] = handle_rdmsr, + [EXIT_REASON_MSR_WRITE] = handle_wrmsr, + [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, + [EXIT_REASON_HLT] = handle_halt, + [EXIT_REASON_INVD] = handle_invd, + [EXIT_REASON_INVLPG] = handle_invlpg, + [EXIT_REASON_VMCALL] = handle_vmcall, + [EXIT_REASON_VMCLEAR] = handle_vmx_insn, + [EXIT_REASON_VMLAUNCH] = handle_vmx_insn, + [EXIT_REASON_VMPTRLD] = handle_vmx_insn, + [EXIT_REASON_VMPTRST] = handle_vmx_insn, + [EXIT_REASON_VMREAD] = handle_vmx_insn, + [EXIT_REASON_VMRESUME] = handle_vmx_insn, + [EXIT_REASON_VMWRITE] = handle_vmx_insn, + [EXIT_REASON_VMOFF] = handle_vmx_insn, + [EXIT_REASON_VMON] = handle_vmx_insn, + [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, + [EXIT_REASON_APIC_ACCESS] = handle_apic_access, + [EXIT_REASON_WBINVD] = handle_wbinvd, + [EXIT_REASON_XSETBV] = handle_xsetbv, + [EXIT_REASON_TASK_SWITCH] = handle_task_switch, + [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, + [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, + [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, + [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, + [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, + [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, +}; + +static const int kvm_vmx_max_exit_handlers = + ARRAY_SIZE(kvm_vmx_exit_handlers); + +static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) +{ + *info1 = vmcs_readl(EXIT_QUALIFICATION); + *info2 = vmcs_read32(VM_EXIT_INTR_INFO); +} + +/* + * The guest has exited. See if we can fix it or if we need userspace + * assistance. + */ +static int vmx_handle_exit(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 exit_reason = vmx->exit_reason; + u32 vectoring_info = vmx->idt_vectoring_info; + + trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); + + /* If guest state is invalid, start emulating */ + if (vmx->emulation_required && emulate_invalid_guest_state) + return handle_invalid_guest_state(vcpu); + + if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason + = exit_reason; + return 0; + } + + if (unlikely(vmx->fail)) { + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason + = vmcs_read32(VM_INSTRUCTION_ERROR); + return 0; + } + + if ((vectoring_info & VECTORING_INFO_VALID_MASK) && + (exit_reason != EXIT_REASON_EXCEPTION_NMI && + exit_reason != EXIT_REASON_EPT_VIOLATION && + exit_reason != EXIT_REASON_TASK_SWITCH)) + printk(KERN_WARNING "%s: unexpected, valid vectoring info " + "(0x%x) and exit reason is 0x%x\n", + __func__, vectoring_info, exit_reason); + + if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) { + if (vmx_interrupt_allowed(vcpu)) { + vmx->soft_vnmi_blocked = 0; + } else if (vmx->vnmi_blocked_time > 1000000000LL && + vcpu->arch.nmi_pending) { + /* + * This CPU don't support us in finding the end of an + * NMI-blocked window if the guest runs with IRQs + * disabled. So we pull the trigger after 1 s of + * futile waiting, but inform the user about this. + */ + printk(KERN_WARNING "%s: Breaking out of NMI-blocked " + "state on VCPU %d after 1 s timeout\n", + __func__, vcpu->vcpu_id); + vmx->soft_vnmi_blocked = 0; + } + } + + if (exit_reason < kvm_vmx_max_exit_handlers + && kvm_vmx_exit_handlers[exit_reason]) + return kvm_vmx_exit_handlers[exit_reason](vcpu); + else { + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = exit_reason; + } + return 0; +} + +static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) +{ + if (irr == -1 || tpr < irr) { + vmcs_write32(TPR_THRESHOLD, 0); + return; + } + + vmcs_write32(TPR_THRESHOLD, irr); +} + +static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) +{ + u32 exit_intr_info = vmx->exit_intr_info; + + /* Handle machine checks before interrupts are enabled */ + if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) + || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI + && is_machine_check(exit_intr_info))) + kvm_machine_check(); + + /* We need to handle NMIs before interrupts are enabled */ + if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && + (exit_intr_info & INTR_INFO_VALID_MASK)) { + kvm_before_handle_nmi(&vmx->vcpu); + asm("int $2"); + kvm_after_handle_nmi(&vmx->vcpu); + } +} + +static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) +{ + u32 exit_intr_info = vmx->exit_intr_info; + bool unblock_nmi; + u8 vector; + bool idtv_info_valid; + + idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; + + if (cpu_has_virtual_nmis()) { + unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; + /* + * SDM 3: 27.7.1.2 (September 2008) + * Re-set bit "block by NMI" before VM entry if vmexit caused by + * a guest IRET fault. + * SDM 3: 23.2.2 (September 2008) + * Bit 12 is undefined in any of the following cases: + * If the VM exit sets the valid bit in the IDT-vectoring + * information field. + * If the VM exit is due to a double fault. + */ + if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && + vector != DF_VECTOR && !idtv_info_valid) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + } else if (unlikely(vmx->soft_vnmi_blocked)) + vmx->vnmi_blocked_time += + ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); +} + +static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, + u32 idt_vectoring_info, + int instr_len_field, + int error_code_field) +{ + u8 vector; + int type; + bool idtv_info_valid; + + idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; + + vmx->vcpu.arch.nmi_injected = false; + kvm_clear_exception_queue(&vmx->vcpu); + kvm_clear_interrupt_queue(&vmx->vcpu); + + if (!idtv_info_valid) + return; + + kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); + + vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; + type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; + + switch (type) { + case INTR_TYPE_NMI_INTR: + vmx->vcpu.arch.nmi_injected = true; + /* + * SDM 3: 27.7.1.2 (September 2008) + * Clear bit "block by NMI" before VM entry if a NMI + * delivery faulted. + */ + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + break; + case INTR_TYPE_SOFT_EXCEPTION: + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(instr_len_field); + /* fall through */ + case INTR_TYPE_HARD_EXCEPTION: + if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { + u32 err = vmcs_read32(error_code_field); + kvm_queue_exception_e(&vmx->vcpu, vector, err); + } else + kvm_queue_exception(&vmx->vcpu, vector); + break; + case INTR_TYPE_SOFT_INTR: + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(instr_len_field); + /* fall through */ + case INTR_TYPE_EXT_INTR: + kvm_queue_interrupt(&vmx->vcpu, vector, + type == INTR_TYPE_SOFT_INTR); + break; + default: + break; + } +} + +static void vmx_complete_interrupts(struct vcpu_vmx *vmx) +{ + __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info, + VM_EXIT_INSTRUCTION_LEN, + IDT_VECTORING_ERROR_CODE); +} + +static void vmx_cancel_injection(struct kvm_vcpu *vcpu) +{ + __vmx_complete_interrupts(to_vmx(vcpu), + vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), + VM_ENTRY_INSTRUCTION_LEN, + VM_ENTRY_EXCEPTION_ERROR_CODE); + + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); +} + +#ifdef CONFIG_X86_64 +#define R "r" +#define Q "q" +#else +#define R "e" +#define Q "l" +#endif + +static void vmx_vcpu_run(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* Record the guest's net vcpu time for enforced NMI injections. */ + if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) + vmx->entry_time = ktime_get(); + + /* Don't enter VMX if guest state is invalid, let the exit handler + start emulation until we arrive back to a valid state */ + if (vmx->emulation_required && emulate_invalid_guest_state) + return; + + if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) + vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); + if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) + vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + + /* When single-stepping over STI and MOV SS, we must clear the + * corresponding interruptibility bits in the guest state. Otherwise + * vmentry fails as it then expects bit 14 (BS) in pending debug + * exceptions being set, but that's not correct for the guest debugging + * case. */ + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + vmx_set_interrupt_shadow(vcpu, 0); + + asm( + /* Store host registers */ + "push %%"R"dx; push %%"R"bp;" + "push %%"R"cx \n\t" + "cmp %%"R"sp, %c[host_rsp](%0) \n\t" + "je 1f \n\t" + "mov %%"R"sp, %c[host_rsp](%0) \n\t" + __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" + "1: \n\t" + /* Reload cr2 if changed */ + "mov %c[cr2](%0), %%"R"ax \n\t" + "mov %%cr2, %%"R"dx \n\t" + "cmp %%"R"ax, %%"R"dx \n\t" + "je 2f \n\t" + "mov %%"R"ax, %%cr2 \n\t" + "2: \n\t" + /* Check if vmlaunch of vmresume is needed */ + "cmpl $0, %c[launched](%0) \n\t" + /* Load guest registers. Don't clobber flags. */ + "mov %c[rax](%0), %%"R"ax \n\t" + "mov %c[rbx](%0), %%"R"bx \n\t" + "mov %c[rdx](%0), %%"R"dx \n\t" + "mov %c[rsi](%0), %%"R"si \n\t" + "mov %c[rdi](%0), %%"R"di \n\t" + "mov %c[rbp](%0), %%"R"bp \n\t" +#ifdef CONFIG_X86_64 + "mov %c[r8](%0), %%r8 \n\t" + "mov %c[r9](%0), %%r9 \n\t" + "mov %c[r10](%0), %%r10 \n\t" + "mov %c[r11](%0), %%r11 \n\t" + "mov %c[r12](%0), %%r12 \n\t" + "mov %c[r13](%0), %%r13 \n\t" + "mov %c[r14](%0), %%r14 \n\t" + "mov %c[r15](%0), %%r15 \n\t" +#endif + "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */ + + /* Enter guest mode */ + "jne .Llaunched \n\t" + __ex(ASM_VMX_VMLAUNCH) "\n\t" + "jmp .Lkvm_vmx_return \n\t" + ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" + ".Lkvm_vmx_return: " + /* Save guest registers, load host registers, keep flags */ + "xchg %0, (%%"R"sp) \n\t" + "mov %%"R"ax, %c[rax](%0) \n\t" + "mov %%"R"bx, %c[rbx](%0) \n\t" + "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t" + "mov %%"R"dx, %c[rdx](%0) \n\t" + "mov %%"R"si, %c[rsi](%0) \n\t" + "mov %%"R"di, %c[rdi](%0) \n\t" + "mov %%"R"bp, %c[rbp](%0) \n\t" +#ifdef CONFIG_X86_64 + "mov %%r8, %c[r8](%0) \n\t" + "mov %%r9, %c[r9](%0) \n\t" + "mov %%r10, %c[r10](%0) \n\t" + "mov %%r11, %c[r11](%0) \n\t" + "mov %%r12, %c[r12](%0) \n\t" + "mov %%r13, %c[r13](%0) \n\t" + "mov %%r14, %c[r14](%0) \n\t" + "mov %%r15, %c[r15](%0) \n\t" +#endif + "mov %%cr2, %%"R"ax \n\t" + "mov %%"R"ax, %c[cr2](%0) \n\t" + + "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t" + "setbe %c[fail](%0) \n\t" + : : "c"(vmx), "d"((unsigned long)HOST_RSP), + [launched]"i"(offsetof(struct vcpu_vmx, launched)), + [fail]"i"(offsetof(struct vcpu_vmx, fail)), + [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), + [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), + [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), + [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), + [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), + [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), + [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), + [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), +#ifdef CONFIG_X86_64 + [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), + [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), + [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), + [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), + [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), + [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), + [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), + [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), +#endif + [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) + : "cc", "memory" + , R"ax", R"bx", R"di", R"si" +#ifdef CONFIG_X86_64 + , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" +#endif + ); + + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) + | (1 << VCPU_EXREG_PDPTR) + | (1 << VCPU_EXREG_CR3)); + vcpu->arch.regs_dirty = 0; + + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); + + asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); + vmx->launched = 1; + + vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); + vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + vmx_complete_atomic_exit(vmx); + vmx_recover_nmi_blocking(vmx); + vmx_complete_interrupts(vmx); +} + +#undef R +#undef Q + +static void vmx_free_vmcs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vmx->vmcs) { + vcpu_clear(vmx); + free_vmcs(vmx->vmcs); + vmx->vmcs = NULL; + } +} + +static void vmx_free_vcpu(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + free_vpid(vmx); + vmx_free_vmcs(vcpu); + kfree(vmx->guest_msrs); + kvm_vcpu_uninit(vcpu); + kmem_cache_free(kvm_vcpu_cache, vmx); +} + +static inline void vmcs_init(struct vmcs *vmcs) +{ + u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id())); + + if (!vmm_exclusive) + kvm_cpu_vmxon(phys_addr); + + vmcs_clear(vmcs); + + if (!vmm_exclusive) + kvm_cpu_vmxoff(); +} + +static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) +{ + int err; + struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + int cpu; + + if (!vmx) + return ERR_PTR(-ENOMEM); + + allocate_vpid(vmx); + + err = kvm_vcpu_init(&vmx->vcpu, kvm, id); + if (err) + goto free_vcpu; + + vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!vmx->guest_msrs) { + err = -ENOMEM; + goto uninit_vcpu; + } + + vmx->vmcs = alloc_vmcs(); + if (!vmx->vmcs) + goto free_msrs; + + vmcs_init(vmx->vmcs); + + cpu = get_cpu(); + vmx_vcpu_load(&vmx->vcpu, cpu); + vmx->vcpu.cpu = cpu; + err = vmx_vcpu_setup(vmx); + vmx_vcpu_put(&vmx->vcpu); + put_cpu(); + if (err) + goto free_vmcs; + if (vm_need_virtualize_apic_accesses(kvm)) + if (alloc_apic_access_page(kvm) != 0) + goto free_vmcs; + + if (enable_ept) { + if (!kvm->arch.ept_identity_map_addr) + kvm->arch.ept_identity_map_addr = + VMX_EPT_IDENTITY_PAGETABLE_ADDR; + if (alloc_identity_pagetable(kvm) != 0) + goto free_vmcs; + } + + return &vmx->vcpu; + +free_vmcs: + free_vmcs(vmx->vmcs); +free_msrs: + kfree(vmx->guest_msrs); +uninit_vcpu: + kvm_vcpu_uninit(&vmx->vcpu); +free_vcpu: + free_vpid(vmx); + kmem_cache_free(kvm_vcpu_cache, vmx); + return ERR_PTR(err); +} + +static void __init vmx_check_processor_compat(void *rtn) +{ + struct vmcs_config vmcs_conf; + + *(int *)rtn = 0; + if (setup_vmcs_config(&vmcs_conf) < 0) + *(int *)rtn = -EIO; + if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { + printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", + smp_processor_id()); + *(int *)rtn = -EIO; + } +} + +static int get_ept_level(void) +{ + return VMX_EPT_DEFAULT_GAW + 1; +} + +static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) +{ + u64 ret; + + /* For VT-d and EPT combination + * 1. MMIO: always map as UC + * 2. EPT with VT-d: + * a. VT-d without snooping control feature: can't guarantee the + * result, try to trust guest. + * b. VT-d with snooping control feature: snooping control feature of + * VT-d engine can guarantee the cache correctness. Just set it + * to WB to keep consistent with host. So the same as item 3. + * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep + * consistent with host MTRR + */ + if (is_mmio) + ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; + else if (vcpu->kvm->arch.iommu_domain && + !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)) + ret = kvm_get_guest_memory_type(vcpu, gfn) << + VMX_EPT_MT_EPTE_SHIFT; + else + ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) + | VMX_EPT_IPAT_BIT; + + return ret; +} + +#define _ER(x) { EXIT_REASON_##x, #x } + +static const struct trace_print_flags vmx_exit_reasons_str[] = { + _ER(EXCEPTION_NMI), + _ER(EXTERNAL_INTERRUPT), + _ER(TRIPLE_FAULT), + _ER(PENDING_INTERRUPT), + _ER(NMI_WINDOW), + _ER(TASK_SWITCH), + _ER(CPUID), + _ER(HLT), + _ER(INVLPG), + _ER(RDPMC), + _ER(RDTSC), + _ER(VMCALL), + _ER(VMCLEAR), + _ER(VMLAUNCH), + _ER(VMPTRLD), + _ER(VMPTRST), + _ER(VMREAD), + _ER(VMRESUME), + _ER(VMWRITE), + _ER(VMOFF), + _ER(VMON), + _ER(CR_ACCESS), + _ER(DR_ACCESS), + _ER(IO_INSTRUCTION), + _ER(MSR_READ), + _ER(MSR_WRITE), + _ER(MWAIT_INSTRUCTION), + _ER(MONITOR_INSTRUCTION), + _ER(PAUSE_INSTRUCTION), + _ER(MCE_DURING_VMENTRY), + _ER(TPR_BELOW_THRESHOLD), + _ER(APIC_ACCESS), + _ER(EPT_VIOLATION), + _ER(EPT_MISCONFIG), + _ER(WBINVD), + { -1, NULL } +}; + +#undef _ER + +static int vmx_get_lpage_level(void) +{ + if (enable_ept && !cpu_has_vmx_ept_1g_page()) + return PT_DIRECTORY_LEVEL; + else + /* For shadow and EPT supported 1GB page */ + return PT_PDPE_LEVEL; +} + +static void vmx_cpuid_update(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 exec_control; + + vmx->rdtscp_enabled = false; + if (vmx_rdtscp_supported()) { + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + if (exec_control & SECONDARY_EXEC_RDTSCP) { + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) + vmx->rdtscp_enabled = true; + else { + exec_control &= ~SECONDARY_EXEC_RDTSCP; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); + } + } + } +} + +static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) +{ +} + +static struct kvm_x86_ops vmx_x86_ops = { + .cpu_has_kvm_support = cpu_has_kvm_support, + .disabled_by_bios = vmx_disabled_by_bios, + .hardware_setup = hardware_setup, + .hardware_unsetup = hardware_unsetup, + .check_processor_compatibility = vmx_check_processor_compat, + .hardware_enable = hardware_enable, + .hardware_disable = hardware_disable, + .cpu_has_accelerated_tpr = report_flexpriority, + + .vcpu_create = vmx_create_vcpu, + .vcpu_free = vmx_free_vcpu, + .vcpu_reset = vmx_vcpu_reset, + + .prepare_guest_switch = vmx_save_host_state, + .vcpu_load = vmx_vcpu_load, + .vcpu_put = vmx_vcpu_put, + + .set_guest_debug = set_guest_debug, + .get_msr = vmx_get_msr, + .set_msr = vmx_set_msr, + .get_segment_base = vmx_get_segment_base, + .get_segment = vmx_get_segment, + .set_segment = vmx_set_segment, + .get_cpl = vmx_get_cpl, + .get_cs_db_l_bits = vmx_get_cs_db_l_bits, + .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, + .decache_cr3 = vmx_decache_cr3, + .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, + .set_cr0 = vmx_set_cr0, + .set_cr3 = vmx_set_cr3, + .set_cr4 = vmx_set_cr4, + .set_efer = vmx_set_efer, + .get_idt = vmx_get_idt, + .set_idt = vmx_set_idt, + .get_gdt = vmx_get_gdt, + .set_gdt = vmx_set_gdt, + .set_dr7 = vmx_set_dr7, + .cache_reg = vmx_cache_reg, + .get_rflags = vmx_get_rflags, + .set_rflags = vmx_set_rflags, + .fpu_activate = vmx_fpu_activate, + .fpu_deactivate = vmx_fpu_deactivate, + + .tlb_flush = vmx_flush_tlb, + + .run = vmx_vcpu_run, + .handle_exit = vmx_handle_exit, + .skip_emulated_instruction = skip_emulated_instruction, + .set_interrupt_shadow = vmx_set_interrupt_shadow, + .get_interrupt_shadow = vmx_get_interrupt_shadow, + .patch_hypercall = vmx_patch_hypercall, + .set_irq = vmx_inject_irq, + .set_nmi = vmx_inject_nmi, + .queue_exception = vmx_queue_exception, + .cancel_injection = vmx_cancel_injection, + .interrupt_allowed = vmx_interrupt_allowed, + .nmi_allowed = vmx_nmi_allowed, + .get_nmi_mask = vmx_get_nmi_mask, + .set_nmi_mask = vmx_set_nmi_mask, + .enable_nmi_window = enable_nmi_window, + .enable_irq_window = enable_irq_window, + .update_cr8_intercept = update_cr8_intercept, + + .set_tss_addr = vmx_set_tss_addr, + .get_tdp_level = get_ept_level, + .get_mt_mask = vmx_get_mt_mask, + + .get_exit_info = vmx_get_exit_info, + .exit_reasons_str = vmx_exit_reasons_str, + + .get_lpage_level = vmx_get_lpage_level, + + .cpuid_update = vmx_cpuid_update, + + .rdtscp_supported = vmx_rdtscp_supported, + + .set_supported_cpuid = vmx_set_supported_cpuid, + + .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, + + .write_tsc_offset = vmx_write_tsc_offset, + .adjust_tsc_offset = vmx_adjust_tsc_offset, + + .set_tdp_cr3 = vmx_set_cr3, +}; + +static int __init vmx_init(void) +{ + int r, i; + + rdmsrl_safe(MSR_EFER, &host_efer); + + for (i = 0; i < NR_VMX_MSR; ++i) + kvm_define_shared_msr(i, vmx_msr_index[i]); + + vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_io_bitmap_a) + return -ENOMEM; + + vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_io_bitmap_b) { + r = -ENOMEM; + goto out; + } + + vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_legacy) { + r = -ENOMEM; + goto out1; + } + + vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_longmode) { + r = -ENOMEM; + goto out2; + } + + /* + * Allow direct access to the PC debug port (it is often used for I/O + * delays, but the vmexits simply slow things down). + */ + memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); + clear_bit(0x80, vmx_io_bitmap_a); + + memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); + + memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); + memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); + + set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ + + r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), + __alignof__(struct vcpu_vmx), THIS_MODULE); + if (r) + goto out3; + + vmx_disable_intercept_for_msr(MSR_FS_BASE, false); + vmx_disable_intercept_for_msr(MSR_GS_BASE, false); + vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); + + if (enable_ept) { + bypass_guest_pf = 0; + kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, + VMX_EPT_EXECUTABLE_MASK); + kvm_enable_tdp(); + } else + kvm_disable_tdp(); + + if (bypass_guest_pf) + kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); + + return 0; + +out3: + free_page((unsigned long)vmx_msr_bitmap_longmode); +out2: + free_page((unsigned long)vmx_msr_bitmap_legacy); +out1: + free_page((unsigned long)vmx_io_bitmap_b); +out: + free_page((unsigned long)vmx_io_bitmap_a); + return r; +} + +static void __exit vmx_exit(void) +{ + free_page((unsigned long)vmx_msr_bitmap_legacy); + free_page((unsigned long)vmx_msr_bitmap_longmode); + free_page((unsigned long)vmx_io_bitmap_b); + free_page((unsigned long)vmx_io_bitmap_a); + + kvm_exit(); +} + +module_init(vmx_init) +module_exit(vmx_exit) diff --git a/linux/x86/x86.c b/linux/x86/x86.c new file mode 100644 index 0000000..a6233d2 --- /dev/null +++ b/linux/x86/x86.c @@ -0,0 +1,6404 @@ +#ifndef KVM_UNIFDEF_H +#define KVM_UNIFDEF_H + +#ifdef __i386__ +#ifndef CONFIG_X86_32 +#define CONFIG_X86_32 1 +#endif +#endif + +#ifdef __x86_64__ +#ifndef CONFIG_X86_64 +#define CONFIG_X86_64 1 +#endif +#endif + +#if defined(__i386__) || defined (__x86_64__) +#ifndef CONFIG_X86 +#define CONFIG_X86 1 +#endif +#endif + +#ifdef __ia64__ +#ifndef CONFIG_IA64 +#define CONFIG_IA64 1 +#endif +#endif + +#ifdef __PPC__ +#ifndef CONFIG_PPC +#define CONFIG_PPC 1 +#endif +#endif + +#ifdef __s390__ +#ifndef CONFIG_S390 +#define CONFIG_S390 1 +#endif +#endif + +#endif +/* + * Kernel-based Virtual Machine driver for Linux + * + * derived from drivers/kvm/kvm_main.c + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright (C) 2008 Qumranet, Inc. + * Copyright IBM Corporation, 2008 + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * Amit Shah <amit.shah@qumranet.com> + * Ben-Ami Yassour <benami@il.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include <linux/kvm_host.h> +#include "irq.h" +#include "mmu.h" +#include "i8254.h" +#include "tss.h" +#include "kvm_cache_regs.h" +#include "x86.h" + + +#include <linux/interrupt.h> +#include <linux/kvm.h> +#include <linux/fs.h> +#include <linux/vmalloc.h> +#include <linux/module.h> +#include <linux/mman.h> +#include <linux/highmem.h> +#include <linux/iommu.h> +#include <linux/intel-iommu.h> +#include <linux/cpufreq.h> +#include <linux/user-return-notifier.h> +#include <linux/srcu.h> +#include <linux/slab.h> +#include <linux/perf_event.h> +#include <linux/uaccess.h> +#include <linux/hash.h> +#include <trace/events/kvm.h> + +#define CREATE_TRACE_POINTS +#include "trace.h" + +#include <asm/debugreg.h> +#include <asm/msr.h> +#include <asm/desc.h> +#include <asm/mtrr.h> +#include <asm/mce.h> +#include <asm/i387.h> +#include <asm/xcr.h> +#include <asm/pvclock.h> +#include <asm/div64.h> + +#define MAX_IO_MSRS 256 +#define CR0_RESERVED_BITS \ + (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ + | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ + | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) +#define CR4_RESERVED_BITS \ + (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ + | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ + | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXSAVE \ + | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) + +#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) + +#define KVM_MAX_MCE_BANKS 32 +#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) + +/* EFER defaults: + * - enable syscall per default because its emulated by KVM + * - enable LME and LMA per default on 64 bit KVM + */ +#ifdef CONFIG_X86_64 +static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL; +#else +static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; +#endif + +#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM +#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU + +static void update_cr8_intercept(struct kvm_vcpu *vcpu); +static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 *entries); + +struct kvm_x86_ops *kvm_x86_ops; +EXPORT_SYMBOL_GPL(kvm_x86_ops); + +int ignore_msrs = 0; +module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); + +#define KVM_NR_SHARED_MSRS 16 + +struct kvm_shared_msrs_global { + int nr; + u32 msrs[KVM_NR_SHARED_MSRS]; +}; + +struct kvm_shared_msrs { + struct kvm_user_return_notifier urn; + bool registered; + struct kvm_shared_msr_values { + u64 host; + u64 curr; + } values[KVM_NR_SHARED_MSRS]; +}; + +static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; +static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs); + +struct kvm_stats_debugfs_item debugfs_entries[] = { + { "pf_fixed", VCPU_STAT(pf_fixed) }, + { "pf_guest", VCPU_STAT(pf_guest) }, + { "tlb_flush", VCPU_STAT(tlb_flush) }, + { "invlpg", VCPU_STAT(invlpg) }, + { "exits", VCPU_STAT(exits) }, + { "io_exits", VCPU_STAT(io_exits) }, + { "mmio_exits", VCPU_STAT(mmio_exits) }, + { "signal_exits", VCPU_STAT(signal_exits) }, + { "irq_window", VCPU_STAT(irq_window_exits) }, + { "nmi_window", VCPU_STAT(nmi_window_exits) }, + { "halt_exits", VCPU_STAT(halt_exits) }, + { "halt_wakeup", VCPU_STAT(halt_wakeup) }, + { "hypercalls", VCPU_STAT(hypercalls) }, + { "request_irq", VCPU_STAT(request_irq_exits) }, + { "irq_exits", VCPU_STAT(irq_exits) }, + { "host_state_reload", VCPU_STAT(host_state_reload) }, + { "efer_reload", VCPU_STAT(efer_reload) }, + { "fpu_reload", VCPU_STAT(fpu_reload) }, + { "insn_emulation", VCPU_STAT(insn_emulation) }, + { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, + { "irq_injections", VCPU_STAT(irq_injections) }, + { "nmi_injections", VCPU_STAT(nmi_injections) }, + { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, + { "mmu_pte_write", VM_STAT(mmu_pte_write) }, + { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, + { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, + { "mmu_flooded", VM_STAT(mmu_flooded) }, + { "mmu_recycled", VM_STAT(mmu_recycled) }, + { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, + { "mmu_unsync", VM_STAT(mmu_unsync) }, + { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, + { "largepages", VM_STAT(lpages) }, + { NULL } +}; + +u64 __read_mostly host_xcr0; + +static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) +{ + int i; + for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++) + vcpu->arch.apf.gfns[i] = ~0; +} + +static void kvm_on_user_return(struct kvm_user_return_notifier *urn) +{ + unsigned slot; + struct kvm_shared_msrs *locals + = container_of(urn, struct kvm_shared_msrs, urn); + struct kvm_shared_msr_values *values; + + for (slot = 0; slot < shared_msrs_global.nr; ++slot) { + values = &locals->values[slot]; + if (values->host != values->curr) { + wrmsrl(shared_msrs_global.msrs[slot], values->host); + values->curr = values->host; + } + } + locals->registered = false; + kvm_user_return_notifier_unregister(urn); +} + +static void shared_msr_update(unsigned slot, u32 msr) +{ + struct kvm_shared_msrs *smsr; + u64 value; + + smsr = &__get_cpu_var(shared_msrs); + /* only read, and nobody should modify it at this time, + * so don't need lock */ + if (slot >= shared_msrs_global.nr) { + printk(KERN_ERR "kvm: invalid MSR slot!"); + return; + } + rdmsrl_safe(msr, &value); + smsr->values[slot].host = value; + smsr->values[slot].curr = value; +} + +void kvm_define_shared_msr(unsigned slot, u32 msr) +{ + if (slot >= shared_msrs_global.nr) + shared_msrs_global.nr = slot + 1; + shared_msrs_global.msrs[slot] = msr; + /* we need ensured the shared_msr_global have been updated */ + smp_wmb(); +} +EXPORT_SYMBOL_GPL(kvm_define_shared_msr); + +static void kvm_shared_msr_cpu_online(void) +{ + unsigned i; + + for (i = 0; i < shared_msrs_global.nr; ++i) + shared_msr_update(i, shared_msrs_global.msrs[i]); +} + +void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) +{ + struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); + + if (((value ^ smsr->values[slot].curr) & mask) == 0) + return; + smsr->values[slot].curr = value; + wrmsrl(shared_msrs_global.msrs[slot], value); + if (!smsr->registered) { + smsr->urn.on_user_return = kvm_on_user_return; + kvm_user_return_notifier_register(&smsr->urn); + smsr->registered = true; + } +} +EXPORT_SYMBOL_GPL(kvm_set_shared_msr); + +static void drop_user_return_notifiers(void *ignore) +{ + struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); + + if (smsr->registered) + kvm_on_user_return(&smsr->urn); +} + +u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) +{ + if (irqchip_in_kernel(vcpu->kvm)) + return vcpu->arch.apic_base; + else + return vcpu->arch.apic_base; +} +EXPORT_SYMBOL_GPL(kvm_get_apic_base); + +void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) +{ + /* TODO: reserve bits check */ + if (irqchip_in_kernel(vcpu->kvm)) + kvm_lapic_set_base(vcpu, data); + else + vcpu->arch.apic_base = data; +} +EXPORT_SYMBOL_GPL(kvm_set_apic_base); + +#define EXCPT_BENIGN 0 +#define EXCPT_CONTRIBUTORY 1 +#define EXCPT_PF 2 + +static int exception_class(int vector) +{ + switch (vector) { + case PF_VECTOR: + return EXCPT_PF; + case DE_VECTOR: + case TS_VECTOR: + case NP_VECTOR: + case SS_VECTOR: + case GP_VECTOR: + return EXCPT_CONTRIBUTORY; + default: + break; + } + return EXCPT_BENIGN; +} + +static void kvm_multiple_exception(struct kvm_vcpu *vcpu, + unsigned nr, bool has_error, u32 error_code, + bool reinject) +{ + u32 prev_nr; + int class1, class2; + + kvm_make_request(KVM_REQ_EVENT, vcpu); + + if (!vcpu->arch.exception.pending) { + queue: + vcpu->arch.exception.pending = true; + vcpu->arch.exception.has_error_code = has_error; + vcpu->arch.exception.nr = nr; + vcpu->arch.exception.error_code = error_code; + vcpu->arch.exception.reinject = reinject; + return; + } + + /* to check exception */ + prev_nr = vcpu->arch.exception.nr; + if (prev_nr == DF_VECTOR) { + /* triple fault -> shutdown */ + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + return; + } + class1 = exception_class(prev_nr); + class2 = exception_class(nr); + if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) + || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { + /* generate double fault per SDM Table 5-5 */ + vcpu->arch.exception.pending = true; + vcpu->arch.exception.has_error_code = true; + vcpu->arch.exception.nr = DF_VECTOR; + vcpu->arch.exception.error_code = 0; + } else + /* replace previous exception with a new one in a hope + that instruction re-execution will regenerate lost + exception */ + goto queue; +} + +void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) +{ + kvm_multiple_exception(vcpu, nr, false, 0, false); +} +EXPORT_SYMBOL_GPL(kvm_queue_exception); + +void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) +{ + kvm_multiple_exception(vcpu, nr, false, 0, true); +} +EXPORT_SYMBOL_GPL(kvm_requeue_exception); + +void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) +{ + if (err) + kvm_inject_gp(vcpu, 0); + else + kvm_x86_ops->skip_emulated_instruction(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); + +void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) +{ + ++vcpu->stat.pf_guest; + vcpu->arch.cr2 = fault->address; + kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); +} + +void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) +{ + if (mmu_is_nested(vcpu) && !fault->nested_page_fault) + vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault); + else + vcpu->arch.mmu.inject_page_fault(vcpu, fault); +} + +void kvm_inject_nmi(struct kvm_vcpu *vcpu) +{ + kvm_make_request(KVM_REQ_EVENT, vcpu); + vcpu->arch.nmi_pending = 1; +} +EXPORT_SYMBOL_GPL(kvm_inject_nmi); + +void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) +{ + kvm_multiple_exception(vcpu, nr, true, error_code, false); +} +EXPORT_SYMBOL_GPL(kvm_queue_exception_e); + +void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) +{ + kvm_multiple_exception(vcpu, nr, true, error_code, true); +} +EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); + +/* + * Checks if cpl <= required_cpl; if true, return true. Otherwise queue + * a #GP and return false. + */ +bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) +{ + if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl) + return true; + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return false; +} +EXPORT_SYMBOL_GPL(kvm_require_cpl); + +/* + * This function will be used to read from the physical memory of the currently + * running guest. The difference to kvm_read_guest_page is that this function + * can read from guest physical or from the guest's guest physical memory. + */ +int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gfn_t ngfn, void *data, int offset, int len, + u32 access) +{ + gfn_t real_gfn; + gpa_t ngpa; + + ngpa = gfn_to_gpa(ngfn); + real_gfn = mmu->translate_gpa(vcpu, ngpa, access); + if (real_gfn == UNMAPPED_GVA) + return -EFAULT; + + real_gfn = gpa_to_gfn(real_gfn); + + return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len); +} +EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); + +int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, + void *data, int offset, int len, u32 access) +{ + return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, + data, offset, len, access); +} + +/* + * Load the pae pdptrs. Return true is they are all valid. + */ +int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) +{ + gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; + unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; + int i; + int ret; + u64 pdpte[ARRAY_SIZE(mmu->pdptrs)]; + + ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte, + offset * sizeof(u64), sizeof(pdpte), + PFERR_USER_MASK|PFERR_WRITE_MASK); + if (ret < 0) { + ret = 0; + goto out; + } + for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { + if (is_present_gpte(pdpte[i]) && + (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { + ret = 0; + goto out; + } + } + ret = 1; + + memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); + __set_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail); + __set_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_dirty); +out: + + return ret; +} +EXPORT_SYMBOL_GPL(load_pdptrs); + +static bool pdptrs_changed(struct kvm_vcpu *vcpu) +{ + u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; + bool changed = true; + int offset; + gfn_t gfn; + int r; + + if (is_long_mode(vcpu) || !is_pae(vcpu)) + return false; + + if (!test_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail)) + return true; + + gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT; + offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1); + r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), + PFERR_USER_MASK | PFERR_WRITE_MASK); + if (r < 0) + goto out; + changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; +out: + + return changed; +} + +int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + unsigned long old_cr0 = kvm_read_cr0(vcpu); + unsigned long update_bits = X86_CR0_PG | X86_CR0_WP | + X86_CR0_CD | X86_CR0_NW; + + cr0 |= X86_CR0_ET; + +#ifdef CONFIG_X86_64 + if (cr0 & 0xffffffff00000000UL) + return 1; +#endif + + cr0 &= ~CR0_RESERVED_BITS; + + if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) + return 1; + + if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) + return 1; + + if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { +#ifdef CONFIG_X86_64 + if ((vcpu->arch.efer & EFER_LME)) { + int cs_db, cs_l; + + if (!is_pae(vcpu)) + return 1; + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + if (cs_l) + return 1; + } else +#endif + if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, + kvm_read_cr3(vcpu))) + return 1; + } + + kvm_x86_ops->set_cr0(vcpu, cr0); + + if ((cr0 ^ old_cr0) & X86_CR0_PG) + kvm_clear_async_pf_completion_queue(vcpu); + + if ((cr0 ^ old_cr0) & update_bits) + kvm_mmu_reset_context(vcpu); + return 0; +} +EXPORT_SYMBOL_GPL(kvm_set_cr0); + +void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) +{ + (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); +} +EXPORT_SYMBOL_GPL(kvm_lmsw); + +int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +{ + u64 xcr0; + + /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ + if (index != XCR_XFEATURE_ENABLED_MASK) + return 1; + xcr0 = xcr; + if (kvm_x86_ops->get_cpl(vcpu) != 0) + return 1; + if (!(xcr0 & XSTATE_FP)) + return 1; + if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) + return 1; + if (xcr0 & ~host_xcr0) + return 1; + vcpu->arch.xcr0 = xcr0; + vcpu->guest_xcr0_loaded = 0; + return 0; +} + +int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +{ + if (__kvm_set_xcr(vcpu, index, xcr)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_set_xcr); + +static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + return best && (best->ecx & bit(X86_FEATURE_XSAVE)); +} + +static void update_cpuid(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + if (!best) + return; + + /* Update OSXSAVE bit */ + if (kvm_cpu_has_xsave && best->function == 0x1) { + best->ecx &= ~(bit(X86_FEATURE_OSXSAVE)); + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) + best->ecx |= bit(X86_FEATURE_OSXSAVE); + } +} + +int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + unsigned long old_cr4 = kvm_read_cr4(vcpu); + unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; + + if (cr4 & CR4_RESERVED_BITS) + return 1; + + if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) + return 1; + + if (is_long_mode(vcpu)) { + if (!(cr4 & X86_CR4_PAE)) + return 1; + } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) + && ((cr4 ^ old_cr4) & pdptr_bits) + && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, + kvm_read_cr3(vcpu))) + return 1; + + if (cr4 & X86_CR4_VMXE) + return 1; + + kvm_x86_ops->set_cr4(vcpu, cr4); + + if ((cr4 ^ old_cr4) & pdptr_bits) + kvm_mmu_reset_context(vcpu); + + if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) + update_cpuid(vcpu); + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_set_cr4); + +int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { + kvm_mmu_sync_roots(vcpu); + kvm_mmu_flush_tlb(vcpu); + return 0; + } + + if (is_long_mode(vcpu)) { + if (cr3 & CR3_L_MODE_RESERVED_BITS) + return 1; + } else { + if (is_pae(vcpu)) { + if (cr3 & CR3_PAE_RESERVED_BITS) + return 1; + if (is_paging(vcpu) && + !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) + return 1; + } + /* + * We don't check reserved bits in nonpae mode, because + * this isn't enforced, and VMware depends on this. + */ + } + + /* + * Does the new cr3 value map to physical memory? (Note, we + * catch an invalid cr3 even in real-mode, because it would + * cause trouble later on when we turn on paging anyway.) + * + * A real CPU would silently accept an invalid cr3 and would + * attempt to use it - with largely undefined (and often hard + * to debug) behavior on the guest side. + */ + if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) + return 1; + vcpu->arch.cr3 = cr3; + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + vcpu->arch.mmu.new_cr3(vcpu); + return 0; +} +EXPORT_SYMBOL_GPL(kvm_set_cr3); + +int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +{ + if (cr8 & CR8_RESERVED_BITS) + return 1; + if (irqchip_in_kernel(vcpu->kvm)) + kvm_lapic_set_tpr(vcpu, cr8); + else + vcpu->arch.cr8 = cr8; + return 0; +} +EXPORT_SYMBOL_GPL(kvm_set_cr8); + +unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) +{ + if (irqchip_in_kernel(vcpu->kvm)) + return kvm_lapic_get_cr8(vcpu); + else + return vcpu->arch.cr8; +} +EXPORT_SYMBOL_GPL(kvm_get_cr8); + +static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +{ + switch (dr) { + case 0 ... 3: + vcpu->arch.db[dr] = val; + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + vcpu->arch.eff_db[dr] = val; + break; + case 4: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return 1; /* #UD */ + /* fall through */ + case 6: + if (val & 0xffffffff00000000ULL) + return -1; /* #GP */ + vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; + break; + case 5: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return 1; /* #UD */ + /* fall through */ + default: /* 7 */ + if (val & 0xffffffff00000000ULL) + return -1; /* #GP */ + vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { + kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7); + vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK); + } + break; + } + + return 0; +} + +int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +{ + int res; + + res = __kvm_set_dr(vcpu, dr, val); + if (res > 0) + kvm_queue_exception(vcpu, UD_VECTOR); + else if (res < 0) + kvm_inject_gp(vcpu, 0); + + return res; +} +EXPORT_SYMBOL_GPL(kvm_set_dr); + +static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) +{ + switch (dr) { + case 0 ... 3: + *val = vcpu->arch.db[dr]; + break; + case 4: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return 1; + /* fall through */ + case 6: + *val = vcpu->arch.dr6; + break; + case 5: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return 1; + /* fall through */ + default: /* 7 */ + *val = vcpu->arch.dr7; + break; + } + + return 0; +} + +int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) +{ + if (_kvm_get_dr(vcpu, dr, val)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_get_dr); + +/* + * List of msr numbers which we expose to userspace through KVM_GET_MSRS + * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. + * + * This list is modified at module load time to reflect the + * capabilities of the host cpu. This capabilities test skips MSRs that are + * kvm-specific. Those are put in the beginning of the list. + */ + +#define KVM_SAVE_MSRS_BEGIN 8 +static u32 msrs_to_save[] = { + MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, + HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, + HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, + MSR_STAR, +#ifdef CONFIG_X86_64 + MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, +#endif + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA +}; + +static unsigned num_msrs_to_save; + +static u32 emulated_msrs[] = { + MSR_IA32_MISC_ENABLE, + MSR_IA32_MCG_STATUS, + MSR_IA32_MCG_CTL, +}; + +static int set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + u64 old_efer = vcpu->arch.efer; + + if (efer & efer_reserved_bits) + return 1; + + if (is_paging(vcpu) + && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) + return 1; + + if (efer & EFER_FFXSR) { + struct kvm_cpuid_entry2 *feat; + + feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) + return 1; + } + + if (efer & EFER_SVME) { + struct kvm_cpuid_entry2 *feat; + + feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) + return 1; + } + + efer &= ~EFER_LMA; + efer |= vcpu->arch.efer & EFER_LMA; + + kvm_x86_ops->set_efer(vcpu, efer); + + vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; + + /* Update reserved bits */ + if ((efer ^ old_efer) & EFER_NX) + kvm_mmu_reset_context(vcpu); + + return 0; +} + +void kvm_enable_efer_bits(u64 mask) +{ + efer_reserved_bits &= ~mask; +} +EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); + + +/* + * Writes msr value into into the appropriate "register". + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +{ + return kvm_x86_ops->set_msr(vcpu, msr_index, data); +} + +/* + * Adapt set_msr() to msr_io()'s calling convention + */ +static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +{ + return kvm_set_msr(vcpu, index, *data); +} + +static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) +{ + int version; + int r; + struct pvclock_wall_clock wc; + struct timespec boot; + + if (!wall_clock) + return; + + r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version)); + if (r) + return; + + if (version & 1) + ++version; /* first time write, random junk */ + + ++version; + + kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); + + /* + * The guest calculates current wall clock time by adding + * system time (updated by kvm_guest_time_update below) to the + * wall clock specified here. guest system time equals host + * system time for us, thus we must fill in host boot time here. + */ + kvm_getboottime(&boot); + + wc.sec = boot.tv_sec; + wc.nsec = boot.tv_nsec; + wc.version = version; + + kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); + + version++; + kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); +} + +static uint32_t div_frac(uint32_t dividend, uint32_t divisor) +{ + uint32_t quotient, remainder; + + /* Don't try to replace with do_div(), this one calculates + * "(dividend << 32) / divisor" */ + __asm__ ( "divl %4" + : "=a" (quotient), "=d" (remainder) + : "0" (0), "1" (dividend), "r" (divisor) ); + return quotient; +} + +static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, + s8 *pshift, u32 *pmultiplier) +{ + uint64_t scaled64; + int32_t shift = 0; + uint64_t tps64; + uint32_t tps32; + + tps64 = base_khz * 1000LL; + scaled64 = scaled_khz * 1000LL; + while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { + tps64 >>= 1; + shift--; + } + + tps32 = (uint32_t)tps64; + while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) { + if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000) + scaled64 >>= 1; + else + tps32 <<= 1; + shift++; + } + + *pshift = shift; + *pmultiplier = div_frac(scaled64, tps32); + + pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n", + __func__, base_khz, scaled_khz, shift, *pmultiplier); +} + +static inline u64 get_kernel_ns(void) +{ + struct timespec ts; + + WARN_ON(preemptible()); + ktime_get_ts(&ts); + kvm_monotonic_to_bootbased(&ts); + return timespec_to_ns(&ts); +} + +static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); +unsigned long max_tsc_khz; + +static inline int kvm_tsc_changes_freq(void) +{ + int cpu = get_cpu(); + int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && + cpufreq_quick_get(cpu) != 0; + put_cpu(); + return ret; +} + +static inline u64 nsec_to_cycles(u64 nsec) +{ + u64 ret; + + WARN_ON(preemptible()); + if (kvm_tsc_changes_freq()) + printk_once(KERN_WARNING + "kvm: unreliable cycle conversion on adjustable rate TSC\n"); + ret = nsec * kvm___this_cpu_read(cpu_tsc_khz); + do_div(ret, USEC_PER_SEC); + return ret; +} + +static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz) +{ + /* Compute a scale to convert nanoseconds in TSC cycles */ + kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, + &kvm->arch.virtual_tsc_shift, + &kvm->arch.virtual_tsc_mult); + kvm->arch.virtual_tsc_khz = this_tsc_khz; +} + +static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) +{ + u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, + vcpu->kvm->arch.virtual_tsc_mult, + vcpu->kvm->arch.virtual_tsc_shift); + tsc += vcpu->arch.last_tsc_write; + return tsc; +} + +void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) +{ + struct kvm *kvm = vcpu->kvm; + u64 offset, ns, elapsed; + unsigned long flags; + s64 sdiff; + + spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); + offset = data - kvm_native_read_tsc(); + ns = get_kernel_ns(); + elapsed = ns - kvm->arch.last_tsc_nsec; + sdiff = data - kvm->arch.last_tsc_write; + if (sdiff < 0) + sdiff = -sdiff; + + /* + * Special case: close write to TSC within 5 seconds of + * another CPU is interpreted as an attempt to synchronize + * The 5 seconds is to accomodate host load / swapping as + * well as any reset of TSC during the boot process. + * + * In that case, for a reliable TSC, we can match TSC offsets, + * or make a best guest using elapsed value. + */ + if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) && + elapsed < 5ULL * NSEC_PER_SEC) { + if (!kvm_check_tsc_unstable()) { + offset = kvm->arch.last_tsc_offset; + pr_debug("kvm: matched tsc offset for %llu\n", data); + } else { + u64 delta = nsec_to_cycles(elapsed); + offset += delta; + pr_debug("kvm: adjusted tsc offset by %llu\n", delta); + } + ns = kvm->arch.last_tsc_nsec; + } + kvm->arch.last_tsc_nsec = ns; + kvm->arch.last_tsc_write = data; + kvm->arch.last_tsc_offset = offset; + kvm_x86_ops->write_tsc_offset(vcpu, offset); + spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + + /* Reset of TSC must disable overshoot protection below */ + vcpu->arch.hv_clock.tsc_timestamp = 0; + vcpu->arch.last_tsc_write = data; + vcpu->arch.last_tsc_nsec = ns; +} +EXPORT_SYMBOL_GPL(kvm_write_tsc); + +static int kvm_guest_time_update(struct kvm_vcpu *v) +{ + unsigned long flags; + struct kvm_vcpu_arch *vcpu = &v->arch; + void *shared_kaddr; + unsigned long this_tsc_khz; + s64 kernel_ns, max_kernel_ns; + u64 tsc_timestamp; + + /* Keep irq disabled to prevent changes to the clock */ + local_irq_save(flags); + kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); + kernel_ns = get_kernel_ns(); + this_tsc_khz = kvm___this_cpu_read(cpu_tsc_khz); + + if (unlikely(this_tsc_khz == 0)) { + local_irq_restore(flags); + kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); + return 1; + } + + /* + * We may have to catch up the TSC to match elapsed wall clock + * time for two reasons, even if kvmclock is used. + * 1) CPU could have been running below the maximum TSC rate + * 2) Broken TSC compensation resets the base at each VCPU + * entry to avoid unknown leaps of TSC even when running + * again on the same CPU. This may cause apparent elapsed + * time to disappear, and the guest to stand still or run + * very slowly. + */ + if (vcpu->tsc_catchup) { + u64 tsc = compute_guest_tsc(v, kernel_ns); + if (tsc > tsc_timestamp) { + kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp); + tsc_timestamp = tsc; + } + } + + local_irq_restore(flags); + + if (!vcpu->time_page) + return 0; + + /* + * Time as measured by the TSC may go backwards when resetting the base + * tsc_timestamp. The reason for this is that the TSC resolution is + * higher than the resolution of the other clock scales. Thus, many + * possible measurments of the TSC correspond to one measurement of any + * other clock, and so a spread of values is possible. This is not a + * problem for the computation of the nanosecond clock; with TSC rates + * around 1GHZ, there can only be a few cycles which correspond to one + * nanosecond value, and any path through this code will inevitably + * take longer than that. However, with the kernel_ns value itself, + * the precision may be much lower, down to HZ granularity. If the + * first sampling of TSC against kernel_ns ends in the low part of the + * range, and the second in the high end of the range, we can get: + * + * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new + * + * As the sampling errors potentially range in the thousands of cycles, + * it is possible such a time value has already been observed by the + * guest. To protect against this, we must compute the system time as + * observed by the guest and ensure the new system time is greater. + */ + max_kernel_ns = 0; + if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) { + max_kernel_ns = vcpu->last_guest_tsc - + vcpu->hv_clock.tsc_timestamp; + max_kernel_ns = pvclock_scale_delta(max_kernel_ns, + vcpu->hv_clock.tsc_to_system_mul, + vcpu->hv_clock.tsc_shift); + max_kernel_ns += vcpu->last_kernel_ns; + } + + if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { + kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, + &vcpu->hv_clock.tsc_shift, + &vcpu->hv_clock.tsc_to_system_mul); + vcpu->hw_tsc_khz = this_tsc_khz; + } + + if (max_kernel_ns > kernel_ns) + kernel_ns = max_kernel_ns; + + /* With all the info we got, fill in the values */ + vcpu->hv_clock.tsc_timestamp = tsc_timestamp; + vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; + vcpu->last_kernel_ns = kernel_ns; + vcpu->last_guest_tsc = tsc_timestamp; + vcpu->hv_clock.flags = 0; + + /* + * The interface expects us to write an even number signaling that the + * update is finished. Since the guest won't see the intermediate + * state, we just increase by 2 at the end. + */ + vcpu->hv_clock.version += 2; + + shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); + + memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, + sizeof(vcpu->hv_clock)); + + kunmap_atomic(shared_kaddr, KM_USER0); + + mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); + return 0; +} + +static bool msr_mtrr_valid(unsigned msr) +{ + switch (msr) { + case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1: + case MSR_MTRRfix64K_00000: + case MSR_MTRRfix16K_80000: + case MSR_MTRRfix16K_A0000: + case MSR_MTRRfix4K_C0000: + case MSR_MTRRfix4K_C8000: + case MSR_MTRRfix4K_D0000: + case MSR_MTRRfix4K_D8000: + case MSR_MTRRfix4K_E0000: + case MSR_MTRRfix4K_E8000: + case MSR_MTRRfix4K_F0000: + case MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: + case MSR_IA32_CR_PAT: + return true; + case 0x2f8: + return true; + } + return false; +} + +static bool valid_pat_type(unsigned t) +{ + return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ +} + +static bool valid_mtrr_type(unsigned t) +{ + return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ +} + +static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + int i; + + if (!msr_mtrr_valid(msr)) + return false; + + if (msr == MSR_IA32_CR_PAT) { + for (i = 0; i < 8; i++) + if (!valid_pat_type((data >> (i * 8)) & 0xff)) + return false; + return true; + } else if (msr == MSR_MTRRdefType) { + if (data & ~0xcff) + return false; + return valid_mtrr_type(data & 0xff); + } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { + for (i = 0; i < 8 ; i++) + if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) + return false; + return true; + } + + /* variable MTRRs */ + return valid_mtrr_type(data & 0xff); +} + +static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; + + if (!mtrr_valid(vcpu, msr, data)) + return 1; + + if (msr == MSR_MTRRdefType) { + vcpu->arch.mtrr_state.def_type = data; + vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; + } else if (msr == MSR_MTRRfix64K_00000) + p[0] = data; + else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) + p[1 + msr - MSR_MTRRfix16K_80000] = data; + else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) + p[3 + msr - MSR_MTRRfix4K_C0000] = data; + else if (msr == MSR_IA32_CR_PAT) + vcpu->arch.pat = data; + else { /* Variable MTRRs */ + int idx, is_mtrr_mask; + u64 *pt; + + idx = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * idx; + if (!is_mtrr_mask) + pt = + (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; + else + pt = + (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; + *pt = data; + } + + kvm_mmu_reset_context(vcpu); + return 0; +} + +static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + u64 mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; + + switch (msr) { + case MSR_IA32_MCG_STATUS: + vcpu->arch.mcg_status = data; + break; + case MSR_IA32_MCG_CTL: + if (!(mcg_cap & MCG_CTL_P)) + return 1; + if (data != 0 && data != ~(u64)0) + return -1; + vcpu->arch.mcg_ctl = data; + break; + default: + if (msr >= MSR_IA32_MC0_CTL && + msr < MSR_IA32_MC0_CTL + 4 * bank_num) { + u32 offset = msr - MSR_IA32_MC0_CTL; + /* only 0 or all 1s can be written to IA32_MCi_CTL + * some Linux kernels though clear bit 10 in bank 4 to + * workaround a BIOS/GART TBL issue on AMD K8s, ignore + * this to avoid an uncatched #GP in the guest + */ + if ((offset & 0x3) == 0 && + data != 0 && (data | (1 << 10)) != ~(u64)0) + return -1; + vcpu->arch.mce_banks[offset] = data; + break; + } + return 1; + } + return 0; +} + +static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) +{ + struct kvm *kvm = vcpu->kvm; + int lm = is_long_mode(vcpu); + u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64 + : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32; + u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 + : kvm->arch.xen_hvm_config.blob_size_32; + u32 page_num = data & ~PAGE_MASK; + u64 page_addr = data & PAGE_MASK; + u8 *page; + int r; + + r = -E2BIG; + if (page_num >= blob_size) + goto out; + r = -ENOMEM; + page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!page) + goto out; + r = -EFAULT; + if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE)) + goto out_free; + if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE)) + goto out_free; + r = 0; +out_free: + kfree(page); +out: + return r; +} + +static bool kvm_hv_hypercall_enabled(struct kvm *kvm) +{ + return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; +} + +static bool kvm_hv_msr_partition_wide(u32 msr) +{ + bool r = false; + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + case HV_X64_MSR_HYPERCALL: + r = true; + break; + } + + return r; +} + +static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + kvm->arch.hv_guest_os_id = data; + /* setting guest os id to zero disables hypercall page */ + if (!kvm->arch.hv_guest_os_id) + kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; + break; + case HV_X64_MSR_HYPERCALL: { + u64 gfn; + unsigned long addr; + u8 instructions[4]; + + /* if guest os id is not set hypercall should remain disabled */ + if (!kvm->arch.hv_guest_os_id) + break; + if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { + kvm->arch.hv_hypercall = data; + break; + } + gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return 1; + kvm_x86_ops->patch_hypercall(vcpu, instructions); + ((unsigned char *)instructions)[3] = 0xc3; /* ret */ + if (copy_to_user((void *)addr, instructions, 4)) + return 1; + kvm->arch.hv_hypercall = data; + break; + } + default: + pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); + return 1; + } + return 0; +} + +static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + switch (msr) { + case HV_X64_MSR_APIC_ASSIST_PAGE: { + unsigned long addr; + + if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { + vcpu->arch.hv_vapic = data; + break; + } + addr = gfn_to_hva(vcpu->kvm, data >> + HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); + if (kvm_is_error_hva(addr)) + return 1; + if (clear_user((void *)addr, PAGE_SIZE)) + return 1; + vcpu->arch.hv_vapic = data; + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); + default: + pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); + return 1; + } + + return 0; +} + +static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) +{ + gpa_t gpa = data & ~0x3f; + + /* Bits 2:5 are resrved, Should be zero */ + if (data & 0x3c) + return 1; + + vcpu->arch.apf.msr_val = data; + + if (!(data & KVM_ASYNC_PF_ENABLED)) { + kvm_clear_async_pf_completion_queue(vcpu); + kvm_async_pf_hash_reset(vcpu); + return 0; + } + + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa)) + return 1; + + vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); + kvm_async_pf_wakeup_all(vcpu); + return 0; +} + +int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + switch (msr) { + case MSR_EFER: + return set_efer(vcpu, data); + case MSR_K7_HWCR: + data &= ~(u64)0x40; /* ignore flush filter disable */ + data &= ~(u64)0x100; /* ignore ignne emulation enable */ + if (data != 0) { + pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", + data); + return 1; + } + break; + case MSR_FAM10H_MMIO_CONF_BASE: + if (data != 0) { + pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " + "0x%llx\n", data); + return 1; + } + break; + case MSR_AMD64_NB_CFG: + break; + case MSR_IA32_DEBUGCTLMSR: + if (!data) { + /* We support the non-activated case already */ + break; + } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { + /* Values other than LBR and BTF are vendor-specific, + thus reserved and should throw a #GP */ + return 1; + } + pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", + __func__, data); + break; + case MSR_IA32_UCODE_REV: + case MSR_IA32_UCODE_WRITE: + case MSR_VM_HSAVE_PA: + case MSR_AMD64_PATCH_LOADER: + break; + case 0x200 ... 0x2ff: + return set_msr_mtrr(vcpu, msr, data); + case MSR_IA32_APICBASE: + kvm_set_apic_base(vcpu, data); + break; + case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + return kvm_x2apic_msr_write(vcpu, msr, data); + case MSR_IA32_MISC_ENABLE: + vcpu->arch.ia32_misc_enable_msr = data; + break; + case MSR_KVM_WALL_CLOCK_NEW: + case MSR_KVM_WALL_CLOCK: + vcpu->kvm->arch.wall_clock = data; + kvm_write_wall_clock(vcpu->kvm, data); + break; + case MSR_KVM_SYSTEM_TIME_NEW: + case MSR_KVM_SYSTEM_TIME: { + if (vcpu->arch.time_page) { + kvm_release_page_dirty(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } + + vcpu->arch.time = data; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + + /* we verify if the enable bit is set... */ + if (!(data & 1)) + break; + + /* ...but clean it before doing the actual write */ + vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); + + vcpu->arch.time_page = + gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); + + if (is_error_page(vcpu->arch.time_page)) { + kvm_release_page_clean(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } + break; + } + case MSR_KVM_ASYNC_PF_EN: + if (kvm_pv_enable_async_pf(vcpu, data)) + return 1; + break; + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: + return set_msr_mce(vcpu, msr, data); + + /* Performance counters are not protected by a CPUID bit, + * so we should check all of them in the generic path for the sake of + * cross vendor migration. + * Writing a zero into the event select MSRs disables them, + * which we perfectly emulate ;-). Any other value should be at least + * reported, some guests depend on them. + */ + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + case MSR_K7_EVNTSEL0: + case MSR_K7_EVNTSEL1: + case MSR_K7_EVNTSEL2: + case MSR_K7_EVNTSEL3: + if (data != 0) + pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); + break; + /* at least RHEL 4 unconditionally writes to the perfctr registers, + * so we ignore writes to make it happy. + */ + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: + pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); + break; + case MSR_K7_CLK_CTL: + /* + * Ignore all writes to this no longer documented MSR. + * Writes are only relevant for old K7 processors, + * all pre-dating SVM, but a recommended workaround from + * AMD for these chips. It is possible to speicify the + * affected processor models on the command line, hence + * the need to ignore the workaround. + */ + break; + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_lock(&vcpu->kvm->lock); + r = set_msr_hyperv_pw(vcpu, msr, data); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return set_msr_hyperv(vcpu, msr, data); + break; + default: + if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) + return xen_hvm_config(vcpu, data); + if (!ignore_msrs) { + pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", + msr, data); + return 1; + } else { + pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", + msr, data); + break; + } + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_set_msr_common); + + +/* + * Reads an msr value (of 'msr_index') into 'pdata'. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +{ + return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); +} + +static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; + + if (!msr_mtrr_valid(msr)) + return 1; + + if (msr == MSR_MTRRdefType) + *pdata = vcpu->arch.mtrr_state.def_type + + (vcpu->arch.mtrr_state.enabled << 10); + else if (msr == MSR_MTRRfix64K_00000) + *pdata = p[0]; + else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) + *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; + else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) + *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; + else if (msr == MSR_IA32_CR_PAT) + *pdata = vcpu->arch.pat; + else { /* Variable MTRRs */ + int idx, is_mtrr_mask; + u64 *pt; + + idx = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * idx; + if (!is_mtrr_mask) + pt = + (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; + else + pt = + (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; + *pdata = *pt; + } + + return 0; +} + +static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data; + u64 mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; + + switch (msr) { + case MSR_IA32_P5_MC_ADDR: + case MSR_IA32_P5_MC_TYPE: + data = 0; + break; + case MSR_IA32_MCG_CAP: + data = vcpu->arch.mcg_cap; + break; + case MSR_IA32_MCG_CTL: + if (!(mcg_cap & MCG_CTL_P)) + return 1; + data = vcpu->arch.mcg_ctl; + break; + case MSR_IA32_MCG_STATUS: + data = vcpu->arch.mcg_status; + break; + default: + if (msr >= MSR_IA32_MC0_CTL && + msr < MSR_IA32_MC0_CTL + 4 * bank_num) { + u32 offset = msr - MSR_IA32_MC0_CTL; + data = vcpu->arch.mce_banks[offset]; + break; + } + return 1; + } + *pdata = data; + return 0; +} + +static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + data = kvm->arch.hv_guest_os_id; + break; + case HV_X64_MSR_HYPERCALL: + data = kvm->arch.hv_hypercall; + break; + default: + pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + + *pdata = data; + return 0; +} + +static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + + switch (msr) { + case HV_X64_MSR_VP_INDEX: { + int r; + struct kvm_vcpu *v; + kvm_for_each_vcpu(r, v, vcpu->kvm) + if (v == vcpu) + data = r; + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); + default: + pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + *pdata = data; + return 0; +} + +int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data; + + switch (msr) { + case MSR_IA32_PLATFORM_ID: + case MSR_IA32_UCODE_REV: + case MSR_IA32_EBL_CR_POWERON: + case MSR_IA32_DEBUGCTLMSR: + case MSR_IA32_LASTBRANCHFROMIP: + case MSR_IA32_LASTBRANCHTOIP: + case MSR_IA32_LASTINTFROMIP: + case MSR_IA32_LASTINTTOIP: + case MSR_K8_SYSCFG: + case MSR_K7_HWCR: + case MSR_VM_HSAVE_PA: + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + case MSR_K7_EVNTSEL0: + case MSR_K7_PERFCTR0: + case MSR_K8_INT_PENDING_MSG: + case MSR_AMD64_NB_CFG: + case MSR_FAM10H_MMIO_CONF_BASE: + data = 0; + break; + case MSR_MTRRcap: + data = 0x500 | KVM_NR_VAR_MTRR; + break; + case 0x200 ... 0x2ff: + return get_msr_mtrr(vcpu, msr, pdata); + case 0xcd: /* fsb frequency */ + data = 3; + break; + /* + * MSR_EBC_FREQUENCY_ID + * Conservative value valid for even the basic CPU models. + * Models 0,1: 000 in bits 23:21 indicating a bus speed of + * 100MHz, model 2 000 in bits 18:16 indicating 100MHz, + * and 266MHz for model 3, or 4. Set Core Clock + * Frequency to System Bus Frequency Ratio to 1 (bits + * 31:24) even though these are only valid for CPU + * models > 2, however guests may end up dividing or + * multiplying by zero otherwise. + */ + case MSR_EBC_FREQUENCY_ID: + data = 1 << 24; + break; + case MSR_IA32_APICBASE: + data = kvm_get_apic_base(vcpu); + break; + case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + return kvm_x2apic_msr_read(vcpu, msr, pdata); + break; + case MSR_IA32_MISC_ENABLE: + data = vcpu->arch.ia32_misc_enable_msr; + break; + case MSR_IA32_PERF_STATUS: + /* TSC increment by tick */ + data = 1000ULL; + /* CPU multiplier */ + data |= (((uint64_t)4ULL) << 40); + break; + case MSR_EFER: + data = vcpu->arch.efer; + break; + case MSR_KVM_WALL_CLOCK: + case MSR_KVM_WALL_CLOCK_NEW: + data = vcpu->kvm->arch.wall_clock; + break; + case MSR_KVM_SYSTEM_TIME: + case MSR_KVM_SYSTEM_TIME_NEW: + data = vcpu->arch.time; + break; + case MSR_KVM_ASYNC_PF_EN: + data = vcpu->arch.apf.msr_val; + break; + case MSR_IA32_P5_MC_ADDR: + case MSR_IA32_P5_MC_TYPE: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: + return get_msr_mce(vcpu, msr, pdata); + case MSR_K7_CLK_CTL: + /* + * Provide expected ramp-up count for K7. All other + * are set to zero, indicating minimum divisors for + * every field. + * + * This prevents guest kernels on AMD host with CPU + * type 6, model 8 and higher from exploding due to + * the rdmsr failing. + */ + data = 0x20000000; + break; + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_lock(&vcpu->kvm->lock); + r = get_msr_hyperv_pw(vcpu, msr, pdata); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return get_msr_hyperv(vcpu, msr, pdata); + break; + default: + if (!ignore_msrs) { + pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); + return 1; + } else { + pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); + data = 0; + } + break; + } + *pdata = data; + return 0; +} +EXPORT_SYMBOL_GPL(kvm_get_msr_common); + +/* + * Read or write a bunch of msrs. All parameters are kernel addresses. + * + * @return number of msrs set successfully. + */ +static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, + struct kvm_msr_entry *entries, + int (*do_msr)(struct kvm_vcpu *vcpu, + unsigned index, u64 *data)) +{ + int i, idx; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + for (i = 0; i < msrs->nmsrs; ++i) + if (do_msr(vcpu, entries[i].index, &entries[i].data)) + break; + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + return i; +} + +/* + * Read or write a bunch of msrs. Parameters are user addresses. + * + * @return number of msrs set successfully. + */ +static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *user_msrs, + int (*do_msr)(struct kvm_vcpu *vcpu, + unsigned index, u64 *data), + int writeback) +{ + struct kvm_msrs msrs; + struct kvm_msr_entry *entries; + int r, n; + unsigned size; + + r = -EFAULT; + if (copy_from_user(&msrs, user_msrs, sizeof msrs)) + goto out; + + r = -E2BIG; + if (msrs.nmsrs >= MAX_IO_MSRS) + goto out; + + r = -ENOMEM; + size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; + entries = kmalloc(size, GFP_KERNEL); + if (!entries) + goto out; + + r = -EFAULT; + if (copy_from_user(entries, user_msrs->entries, size)) + goto out_free; + + r = n = __msr_io(vcpu, &msrs, entries, do_msr); + if (r < 0) + goto out_free; + + r = -EFAULT; + if (writeback && copy_to_user(user_msrs->entries, entries, size)) + goto out_free; + + r = n; + +out_free: + kfree(entries); +out: + return r; +} + +int kvm_dev_ioctl_check_extension(long ext) +{ + int r; + + switch (ext) { + case KVM_CAP_IRQCHIP: + case KVM_CAP_HLT: + case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: + case KVM_CAP_SET_TSS_ADDR: + case KVM_CAP_EXT_CPUID: + case KVM_CAP_CLOCKSOURCE: + case KVM_CAP_PIT: + case KVM_CAP_NOP_IO_DELAY: + case KVM_CAP_MP_STATE: +#ifdef CONFIG_MMU_NOTIFIER + case KVM_CAP_SYNC_MMU: +#endif + case KVM_CAP_USER_NMI: + case KVM_CAP_REINJECT_CONTROL: + case KVM_CAP_IRQ_INJECT_STATUS: + case KVM_CAP_ASSIGN_DEV_IRQ: +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,33) + case KVM_CAP_IRQFD: + case KVM_CAP_IOEVENTFD: +#endif + case KVM_CAP_PIT2: + case KVM_CAP_PIT_STATE2: + case KVM_CAP_SET_IDENTITY_MAP_ADDR: + case KVM_CAP_XEN_HVM: + case KVM_CAP_ADJUST_CLOCK: + case KVM_CAP_VCPU_EVENTS: + case KVM_CAP_HYPERV: + case KVM_CAP_HYPERV_VAPIC: + case KVM_CAP_HYPERV_SPIN: + case KVM_CAP_PCI_SEGMENT: + case KVM_CAP_DEBUGREGS: + case KVM_CAP_X86_ROBUST_SINGLESTEP: + case KVM_CAP_XSAVE: +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,38) + case KVM_CAP_ASYNC_PF: +#endif + r = 1; + break; + case KVM_CAP_COALESCED_MMIO: + r = KVM_COALESCED_MMIO_PAGE_OFFSET; + break; + case KVM_CAP_VAPIC: + r = !kvm_x86_ops->cpu_has_accelerated_tpr(); + break; + case KVM_CAP_NR_VCPUS: + r = KVM_MAX_VCPUS; + break; + case KVM_CAP_NR_MEMSLOTS: + r = KVM_MEMORY_SLOTS; + break; + case KVM_CAP_PV_MMU: /* obsolete */ + r = 0; + break; + case KVM_CAP_IOMMU: + r = iommu_found(); + break; + case KVM_CAP_MCE: + r = KVM_MAX_MCE_BANKS; + break; + case KVM_CAP_XCRS: + r = kvm_cpu_has_xsave; + break; + default: + r = 0; + break; + } + return r; + +} + +long kvm_arch_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + void *argp = (void *)arg; + long r; + + switch (ioctl) { + case KVM_GET_MSR_INDEX_LIST: { + struct kvm_msr_list *user_msr_list = argp; + struct kvm_msr_list msr_list; + unsigned n; + + r = -EFAULT; + if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) + goto out; + n = msr_list.nmsrs; + msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); + if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) + goto out; + r = -E2BIG; + if (n < msr_list.nmsrs) + goto out; + r = -EFAULT; + if (copy_to_user(user_msr_list->indices, &msrs_to_save, + num_msrs_to_save * sizeof(u32))) + goto out; + if (copy_to_user(user_msr_list->indices + num_msrs_to_save, + &emulated_msrs, + ARRAY_SIZE(emulated_msrs) * sizeof(u32))) + goto out; + r = 0; + break; + } + case KVM_GET_SUPPORTED_CPUID: { + struct kvm_cpuid2 *cpuid_arg = argp; + struct kvm_cpuid2 cpuid; + + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + goto out; + r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, + cpuid_arg->entries); + if (r) + goto out; + + r = -EFAULT; + if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) + goto out; + r = 0; + break; + } + case KVM_X86_GET_MCE_CAP_SUPPORTED: { + u64 mce_cap; + + mce_cap = KVM_MCE_CAP_SUPPORTED; + r = -EFAULT; + if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) + goto out; + r = 0; + break; + } + default: + r = -EINVAL; + } +out: + return r; +} + +static void wbinvd_ipi(void *garbage) +{ + wbinvd(); +} + +static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->arch.iommu_domain && + !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY); +} + +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + /* Address WBINVD may be executed by guest */ + if (need_emulate_wbinvd(vcpu)) { + if (kvm_x86_ops->has_wbinvd_exit()) + cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); + else if (vcpu->cpu != -1 && vcpu->cpu != cpu) + smp_call_function_single(vcpu->cpu, + wbinvd_ipi, NULL, 1); + } + + kvm_x86_ops->vcpu_load(vcpu, cpu); + if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) { + /* Make sure TSC doesn't go backwards */ + s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : + kvm_native_read_tsc() - vcpu->arch.last_host_tsc; + if (tsc_delta < 0) + mark_tsc_unstable("KVM discovered backwards TSC"); + if (kvm_check_tsc_unstable()) { + kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); + vcpu->arch.tsc_catchup = 1; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + } + if (vcpu->cpu != cpu) + kvm_migrate_timers(vcpu); + vcpu->cpu = cpu; + } +} + +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->vcpu_put(vcpu); + kvm_put_guest_fpu(vcpu); + vcpu->arch.last_host_tsc = kvm_native_read_tsc(); +} + +static int is_efer_nx(void) +{ + unsigned long long efer = 0; + + rdmsrl_safe(MSR_EFER, &efer); + return efer & EFER_NX; +} + +static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_cpuid_entry2 *e, *entry; + + entry = NULL; + for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { + e = &vcpu->arch.cpuid_entries[i]; + if (e->function == 0x80000001) { + entry = e; + break; + } + } + if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { + entry->edx &= ~(1 << 20); + printk(KERN_INFO "kvm: guest NX capability removed\n"); + } +} + +/* when an old userspace process fills a new kernel module */ +static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, + struct kvm_cpuid *cpuid, + struct kvm_cpuid_entry *entries) +{ + int r, i; + struct kvm_cpuid_entry *cpuid_entries; + + r = -E2BIG; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + goto out; + r = -ENOMEM; + cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); + if (!cpuid_entries) + goto out; + r = -EFAULT; + if (copy_from_user(cpuid_entries, entries, + cpuid->nent * sizeof(struct kvm_cpuid_entry))) + goto out_free; + for (i = 0; i < cpuid->nent; i++) { + vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; + vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; + vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; + vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; + vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; + vcpu->arch.cpuid_entries[i].index = 0; + vcpu->arch.cpuid_entries[i].flags = 0; + vcpu->arch.cpuid_entries[i].padding[0] = 0; + vcpu->arch.cpuid_entries[i].padding[1] = 0; + vcpu->arch.cpuid_entries[i].padding[2] = 0; + } + vcpu->arch.cpuid_nent = cpuid->nent; + cpuid_fix_nx_cap(vcpu); + r = 0; + kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); + update_cpuid(vcpu); + +out_free: + vfree(cpuid_entries); +out: + return r; +} + +static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 *entries) +{ + int r; + + r = -E2BIG; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + goto out; + r = -EFAULT; + if (copy_from_user(&vcpu->arch.cpuid_entries, entries, + cpuid->nent * sizeof(struct kvm_cpuid_entry2))) + goto out; + vcpu->arch.cpuid_nent = cpuid->nent; + kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); + update_cpuid(vcpu); + return 0; + +out: + return r; +} + +static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 *entries) +{ + int r; + + r = -E2BIG; + if (cpuid->nent < vcpu->arch.cpuid_nent) + goto out; + r = -EFAULT; + if (copy_to_user(entries, &vcpu->arch.cpuid_entries, + vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) + goto out; + return 0; + +out: + cpuid->nent = vcpu->arch.cpuid_nent; + return r; +} + +static void cpuid_mask(u32 *word, int wordnum) +{ + *word &= boot_cpu_data.x86_capability[wordnum]; +} + +static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, + u32 index) +{ + entry->function = function; + entry->index = index; + cpuid_count(entry->function, entry->index, + &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); + entry->flags = 0; +} + +#define F(x) bit(X86_FEATURE_##x) + +static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + u32 index, int *nent, int maxnent) +{ + unsigned f_nx = is_efer_nx() ? F(NX) : 0; +#ifdef CONFIG_X86_64 + unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) + ? F(GBPAGES) : 0; + unsigned f_lm = F(LM); +#else + unsigned f_gbpages = 0; + unsigned f_lm = 0; +#endif + unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; + + /* cpuid 1.edx */ + const u32 kvm_supported_word0_x86_features = + F(FPU) | F(VME) | F(DE) | F(PSE) | + F(TSC) | F(MSR) | F(PAE) | F(MCE) | + F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | + F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | + F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | + 0 /* Reserved, DS, ACPI */ | F(MMX) | + F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | + 0 /* HTT, TM, Reserved, PBE */; + /* cpuid 0x80000001.edx */ + const u32 kvm_supported_word1_x86_features = + F(FPU) | F(VME) | F(DE) | F(PSE) | + F(TSC) | F(MSR) | F(PAE) | F(MCE) | + F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | + F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | + F(PAT) | F(PSE36) | 0 /* Reserved */ | + f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | + F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | + 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); + /* cpuid 1.ecx */ + const u32 kvm_supported_word4_x86_features = + F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | + 0 /* DS-CPL, VMX, SMX, EST */ | + 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | + 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | + 0 /* Reserved, DCA */ | F(XMM4_1) | + F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | + 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | + F(F16C); + /* cpuid 0x80000001.ecx */ + const u32 kvm_supported_word6_x86_features = + F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | + F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | + F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | + 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); + + /* all calls to cpuid_count() should be made on the same cpu */ + get_cpu(); + do_cpuid_1_ent(entry, function, index); + ++*nent; + + switch (function) { + case 0: + entry->eax = min(entry->eax, (u32)0xd); + break; + case 1: + entry->edx &= kvm_supported_word0_x86_features; + cpuid_mask(&entry->edx, 0); + entry->ecx &= kvm_supported_word4_x86_features; + cpuid_mask(&entry->ecx, 4); + /* we support x2apic emulation even if host does not support + * it since we emulate x2apic in software */ + entry->ecx |= F(X2APIC); + break; + /* function 2 entries are STATEFUL. That is, repeated cpuid commands + * may return different values. This forces us to get_cpu() before + * issuing the first command, and also to emulate this annoying behavior + * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ + case 2: { + int t, times = entry->eax & 0xff; + + entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; + for (t = 1; t < times && *nent < maxnent; ++t) { + do_cpuid_1_ent(&entry[t], function, 0); + entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + ++*nent; + } + break; + } + /* function 4 and 0xb have additional index. */ + case 4: { + int i, cache_type; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + /* read more entries until cache_type is zero */ + for (i = 1; *nent < maxnent; ++i) { + cache_type = entry[i - 1].eax & 0x1f; + if (!cache_type) + break; + do_cpuid_1_ent(&entry[i], function, i); + entry[i].flags |= + KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } + case 0xb: { + int i, level_type; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + /* read more entries until level_type is zero */ + for (i = 1; *nent < maxnent; ++i) { + level_type = entry[i - 1].ecx & 0xff00; + if (!level_type) + break; + do_cpuid_1_ent(&entry[i], function, i); + entry[i].flags |= + KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } + case 0xd: { + int i; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + for (i = 1; *nent < maxnent; ++i) { + if (entry[i - 1].eax == 0 && i != 2) + break; + do_cpuid_1_ent(&entry[i], function, i); + entry[i].flags |= + KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } + case KVM_CPUID_SIGNATURE: { + char signature[12] = "KVMKVMKVM\0\0"; + u32 *sigptr = (u32 *)signature; + entry->eax = 0; + entry->ebx = sigptr[0]; + entry->ecx = sigptr[1]; + entry->edx = sigptr[2]; + break; + } + case KVM_CPUID_FEATURES: + entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | + (1 << KVM_FEATURE_NOP_IO_DELAY) | + (1 << KVM_FEATURE_CLOCKSOURCE2) | + (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); + entry->ebx = 0; + entry->ecx = 0; + entry->edx = 0; + break; + case 0x80000000: + entry->eax = min(entry->eax, 0x8000001a); + break; + case 0x80000001: + entry->edx &= kvm_supported_word1_x86_features; + cpuid_mask(&entry->edx, 1); + entry->ecx &= kvm_supported_word6_x86_features; + cpuid_mask(&entry->ecx, 6); + break; + } + + kvm_x86_ops->set_supported_cpuid(function, entry); + + put_cpu(); +} + +#undef F + +static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 *entries) +{ + struct kvm_cpuid_entry2 *cpuid_entries; + int limit, nent = 0, r = -E2BIG; + u32 func; + + if (cpuid->nent < 1) + goto out; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + cpuid->nent = KVM_MAX_CPUID_ENTRIES; + r = -ENOMEM; + cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); + if (!cpuid_entries) + goto out; + + do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); + limit = cpuid_entries[0].eax; + for (func = 1; func <= limit && nent < cpuid->nent; ++func) + do_cpuid_ent(&cpuid_entries[nent], func, 0, + &nent, cpuid->nent); + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); + limit = cpuid_entries[nent - 1].eax; + for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) + do_cpuid_ent(&cpuid_entries[nent], func, 0, + &nent, cpuid->nent); + + + + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent, + cpuid->nent); + + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent, + cpuid->nent); + + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + r = -EFAULT; + if (copy_to_user(entries, cpuid_entries, + nent * sizeof(struct kvm_cpuid_entry2))) + goto out_free; + cpuid->nent = nent; + r = 0; + +out_free: + vfree(cpuid_entries); +out: + return r; +} + +static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s) +{ + memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); + + return 0; +} + +static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s) +{ + memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); + kvm_apic_post_state_restore(vcpu); + update_cr8_intercept(vcpu); + + return 0; +} + +static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, + struct kvm_interrupt *irq) +{ + if (irq->irq < 0 || irq->irq >= 256) + return -EINVAL; + if (irqchip_in_kernel(vcpu->kvm)) + return -ENXIO; + + kvm_queue_interrupt(vcpu, irq->irq, false); + kvm_make_request(KVM_REQ_EVENT, vcpu); + + return 0; +} + +static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) +{ + kvm_inject_nmi(vcpu); + + return 0; +} + +static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, + struct kvm_tpr_access_ctl *tac) +{ + if (tac->flags) + return -EINVAL; + vcpu->arch.tpr_access_reporting = !!tac->enabled; + return 0; +} + +static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, + u64 mcg_cap) +{ + int r; + unsigned bank_num = mcg_cap & 0xff, bank; + + r = -EINVAL; + if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) + goto out; + if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) + goto out; + r = 0; + vcpu->arch.mcg_cap = mcg_cap; + /* Init IA32_MCG_CTL to all 1s */ + if (mcg_cap & MCG_CTL_P) + vcpu->arch.mcg_ctl = ~(u64)0; + /* Init IA32_MCi_CTL to all 1s */ + for (bank = 0; bank < bank_num; bank++) + vcpu->arch.mce_banks[bank*4] = ~(u64)0; +out: + return r; +} + +static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, + struct kvm_x86_mce *mce) +{ + u64 mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; + u64 *banks = vcpu->arch.mce_banks; + + if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) + return -EINVAL; + /* + * if IA32_MCG_CTL is not all 1s, the uncorrected error + * reporting is disabled + */ + if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && + vcpu->arch.mcg_ctl != ~(u64)0) + return 0; + banks += 4 * mce->bank; + /* + * if IA32_MCi_CTL is not all 1s, the uncorrected error + * reporting is disabled for the bank + */ + if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0) + return 0; + if (mce->status & MCI_STATUS_UC) { + if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || + !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { + printk(KERN_DEBUG "kvm: set_mce: " + "injects mce exception while " + "previous one is in progress!\n"); + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + return 0; + } + if (banks[1] & MCI_STATUS_VAL) + mce->status |= MCI_STATUS_OVER; + banks[2] = mce->addr; + banks[3] = mce->misc; + vcpu->arch.mcg_status = mce->mcg_status; + banks[1] = mce->status; + kvm_queue_exception(vcpu, MC_VECTOR); + } else if (!(banks[1] & MCI_STATUS_VAL) + || !(banks[1] & MCI_STATUS_UC)) { + if (banks[1] & MCI_STATUS_VAL) + mce->status |= MCI_STATUS_OVER; + banks[2] = mce->addr; + banks[3] = mce->misc; + banks[1] = mce->status; + } else + banks[1] |= MCI_STATUS_OVER; + return 0; +} + +static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + events->exception.injected = + vcpu->arch.exception.pending && + !kvm_exception_is_soft(vcpu->arch.exception.nr); + events->exception.nr = vcpu->arch.exception.nr; + events->exception.has_error_code = vcpu->arch.exception.has_error_code; + events->exception.pad = 0; + events->exception.error_code = vcpu->arch.exception.error_code; + + events->interrupt.injected = + vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft; + events->interrupt.nr = vcpu->arch.interrupt.nr; + events->interrupt.soft = 0; + events->interrupt.shadow = + kvm_x86_ops->get_interrupt_shadow(vcpu, + KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI); + + events->nmi.injected = vcpu->arch.nmi_injected; + events->nmi.pending = vcpu->arch.nmi_pending; + events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); + events->nmi.pad = 0; + + events->sipi_vector = vcpu->arch.sipi_vector; + + events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING + | KVM_VCPUEVENT_VALID_SIPI_VECTOR + | KVM_VCPUEVENT_VALID_SHADOW); + memset(&events->reserved, 0, sizeof(events->reserved)); +} + +static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING + | KVM_VCPUEVENT_VALID_SIPI_VECTOR + | KVM_VCPUEVENT_VALID_SHADOW)) + return -EINVAL; + + vcpu->arch.exception.pending = events->exception.injected; + vcpu->arch.exception.nr = events->exception.nr; + vcpu->arch.exception.has_error_code = events->exception.has_error_code; + vcpu->arch.exception.error_code = events->exception.error_code; + + vcpu->arch.interrupt.pending = events->interrupt.injected; + vcpu->arch.interrupt.nr = events->interrupt.nr; + vcpu->arch.interrupt.soft = events->interrupt.soft; + if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) + kvm_pic_clear_isr_ack(vcpu->kvm); + if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) + kvm_x86_ops->set_interrupt_shadow(vcpu, + events->interrupt.shadow); + + vcpu->arch.nmi_injected = events->nmi.injected; + if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) + vcpu->arch.nmi_pending = events->nmi.pending; + kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); + + if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) + vcpu->arch.sipi_vector = events->sipi_vector; + + kvm_make_request(KVM_REQ_EVENT, vcpu); + + return 0; +} + +static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, + struct kvm_debugregs *dbgregs) +{ + memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); + dbgregs->dr6 = vcpu->arch.dr6; + dbgregs->dr7 = vcpu->arch.dr7; + dbgregs->flags = 0; + memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved)); +} + +static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, + struct kvm_debugregs *dbgregs) +{ + if (dbgregs->flags) + return -EINVAL; + + memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); + vcpu->arch.dr6 = dbgregs->dr6; + vcpu->arch.dr7 = dbgregs->dr7; + + return 0; +} + +static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, + struct kvm_xsave *guest_xsave) +{ + if (kvm_cpu_has_xsave) + memcpy(guest_xsave->region, + &vcpu->arch.guest_fpu.state->xsave, + kvm_xstate_size); + else { + memcpy(guest_xsave->region, + &vcpu->arch.guest_fpu.state->fxsave, + sizeof(struct kvm_i387_fxsave_struct)); + *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = + XSTATE_FPSSE; + } +} + +static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, + struct kvm_xsave *guest_xsave) +{ + u64 xstate_bv = + *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; + + if (kvm_cpu_has_xsave) + memcpy(&vcpu->arch.guest_fpu.state->xsave, + guest_xsave->region, kvm_xstate_size); + else { + if (xstate_bv & ~XSTATE_FPSSE) + return -EINVAL; + memcpy(&vcpu->arch.guest_fpu.state->fxsave, + guest_xsave->region, sizeof(struct kvm_i387_fxsave_struct)); + } + return 0; +} + +static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, + struct kvm_xcrs *guest_xcrs) +{ + if (!kvm_cpu_has_xsave) { + guest_xcrs->nr_xcrs = 0; + return; + } + + guest_xcrs->nr_xcrs = 1; + guest_xcrs->flags = 0; + guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK; + guest_xcrs->xcrs[0].value = vcpu->arch.xcr0; +} + +static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, + struct kvm_xcrs *guest_xcrs) +{ + int i, r = 0; + + if (!kvm_cpu_has_xsave) + return -EINVAL; + + if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) + return -EINVAL; + + for (i = 0; i < guest_xcrs->nr_xcrs; i++) + /* Only support XCR0 currently */ + if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) { + r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, + guest_xcrs->xcrs[0].value); + break; + } + if (r) + r = -EINVAL; + return r; +} + +long kvm_arch_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void *argp = (void *)arg; + int r; + union { + struct kvm_lapic_state *lapic; + struct kvm_xsave *xsave; + struct kvm_xcrs *xcrs; + void *buffer; + } u; + + u.buffer = NULL; + switch (ioctl) { + case KVM_GET_LAPIC: { + r = -EINVAL; + if (!vcpu->arch.apic) + goto out; + u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); + + r = -ENOMEM; + if (!u.lapic) + goto out; + r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state))) + goto out; + r = 0; + break; + } + case KVM_SET_LAPIC: { + r = -EINVAL; + if (!vcpu->arch.apic) + goto out; + u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); + r = -ENOMEM; + if (!u.lapic) + goto out; + r = -EFAULT; + if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state))) + goto out; + r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); + if (r) + goto out; + r = 0; + break; + } + case KVM_INTERRUPT: { + struct kvm_interrupt irq; + + r = -EFAULT; + if (copy_from_user(&irq, argp, sizeof irq)) + goto out; + r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); + if (r) + goto out; + r = 0; + break; + } + case KVM_NMI: { + r = kvm_vcpu_ioctl_nmi(vcpu); + if (r) + goto out; + r = 0; + break; + } + case KVM_SET_CPUID: { + struct kvm_cpuid *cpuid_arg = argp; + struct kvm_cpuid cpuid; + + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + goto out; + r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); + if (r) + goto out; + break; + } + case KVM_SET_CPUID2: { + struct kvm_cpuid2 *cpuid_arg = argp; + struct kvm_cpuid2 cpuid; + + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + goto out; + r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, + cpuid_arg->entries); + if (r) + goto out; + break; + } + case KVM_GET_CPUID2: { + struct kvm_cpuid2 *cpuid_arg = argp; + struct kvm_cpuid2 cpuid; + + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + goto out; + r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, + cpuid_arg->entries); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) + goto out; + r = 0; + break; + } + case KVM_GET_MSRS: + r = msr_io(vcpu, argp, kvm_get_msr, 1); + break; + case KVM_SET_MSRS: + r = msr_io(vcpu, argp, do_set_msr, 0); + break; + case KVM_TPR_ACCESS_REPORTING: { + struct kvm_tpr_access_ctl tac; + + r = -EFAULT; + if (copy_from_user(&tac, argp, sizeof tac)) + goto out; + r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &tac, sizeof tac)) + goto out; + r = 0; + break; + }; + case KVM_SET_VAPIC_ADDR: { + struct kvm_vapic_addr va; + + r = -EINVAL; + if (!irqchip_in_kernel(vcpu->kvm)) + goto out; + r = -EFAULT; + if (copy_from_user(&va, argp, sizeof va)) + goto out; + r = 0; + kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); + break; + } + case KVM_X86_SETUP_MCE: { + u64 mcg_cap; + + r = -EFAULT; + if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap)) + goto out; + r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); + break; + } + case KVM_X86_SET_MCE: { + struct kvm_x86_mce mce; + + r = -EFAULT; + if (copy_from_user(&mce, argp, sizeof mce)) + goto out; + r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); + break; + } + case KVM_GET_VCPU_EVENTS: { + struct kvm_vcpu_events events; + + kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events); + + r = -EFAULT; + if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events))) + break; + r = 0; + break; + } + case KVM_SET_VCPU_EVENTS: { + struct kvm_vcpu_events events; + + r = -EFAULT; + if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events))) + break; + + r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); + break; + } + case KVM_GET_DEBUGREGS: { + struct kvm_debugregs dbgregs; + + kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs); + + r = -EFAULT; + if (copy_to_user(argp, &dbgregs, + sizeof(struct kvm_debugregs))) + break; + r = 0; + break; + } + case KVM_SET_DEBUGREGS: { + struct kvm_debugregs dbgregs; + + r = -EFAULT; + if (copy_from_user(&dbgregs, argp, + sizeof(struct kvm_debugregs))) + break; + + r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); + break; + } + case KVM_GET_XSAVE: { + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + r = -ENOMEM; + if (!u.xsave) + break; + + kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); + + r = -EFAULT; + if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave))) + break; + r = 0; + break; + } + case KVM_SET_XSAVE: { + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + r = -ENOMEM; + if (!u.xsave) + break; + + r = -EFAULT; + if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave))) + break; + + r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); + break; + } + case KVM_GET_XCRS: { + u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + r = -ENOMEM; + if (!u.xcrs) + break; + + kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); + + r = -EFAULT; + if (copy_to_user(argp, u.xcrs, + sizeof(struct kvm_xcrs))) + break; + r = 0; + break; + } + case KVM_SET_XCRS: { + u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + r = -ENOMEM; + if (!u.xcrs) + break; + + r = -EFAULT; + if (copy_from_user(u.xcrs, argp, + sizeof(struct kvm_xcrs))) + break; + + r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); + break; + } + default: + r = -EINVAL; + } +out: + kfree(u.buffer); + return r; +} + +static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) +{ + int ret; + + if (addr > (unsigned int)(-3 * PAGE_SIZE)) + return -1; + ret = kvm_x86_ops->set_tss_addr(kvm, addr); + return ret; +} + +static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, + u64 ident_addr) +{ + kvm->arch.ept_identity_map_addr = ident_addr; + return 0; +} + +static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, + u32 kvm_nr_mmu_pages) +{ + if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) + return -EINVAL; + + mutex_lock(&kvm->slots_lock); + spin_lock(&kvm->mmu_lock); + + kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); + kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; + + spin_unlock(&kvm->mmu_lock); + mutex_unlock(&kvm->slots_lock); + return 0; +} + +static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) +{ + return kvm->arch.n_max_mmu_pages; +} + +static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) +{ + int r; + + r = 0; + switch (chip->chip_id) { + case KVM_IRQCHIP_PIC_MASTER: + memcpy(&chip->chip.pic, + &pic_irqchip(kvm)->pics[0], + sizeof(struct kvm_pic_state)); + break; + case KVM_IRQCHIP_PIC_SLAVE: + memcpy(&chip->chip.pic, + &pic_irqchip(kvm)->pics[1], + sizeof(struct kvm_pic_state)); + break; + case KVM_IRQCHIP_IOAPIC: + r = kvm_get_ioapic(kvm, &chip->chip.ioapic); + break; + default: + r = -EINVAL; + break; + } + return r; +} + +static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) +{ + int r; + + r = 0; + switch (chip->chip_id) { + case KVM_IRQCHIP_PIC_MASTER: + spin_lock(&pic_irqchip(kvm)->lock); + memcpy(&pic_irqchip(kvm)->pics[0], + &chip->chip.pic, + sizeof(struct kvm_pic_state)); + spin_unlock(&pic_irqchip(kvm)->lock); + break; + case KVM_IRQCHIP_PIC_SLAVE: + spin_lock(&pic_irqchip(kvm)->lock); + memcpy(&pic_irqchip(kvm)->pics[1], + &chip->chip.pic, + sizeof(struct kvm_pic_state)); + spin_unlock(&pic_irqchip(kvm)->lock); + break; + case KVM_IRQCHIP_IOAPIC: + r = kvm_set_ioapic(kvm, &chip->chip.ioapic); + break; + default: + r = -EINVAL; + break; + } + kvm_pic_update_irq(pic_irqchip(kvm)); + return r; +} + +static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) +{ + int r = 0; + + mutex_lock(&kvm->arch.vpit->pit_state.lock); + memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); + mutex_unlock(&kvm->arch.vpit->pit_state.lock); + return r; +} + +static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) +{ + int r = 0; + + mutex_lock(&kvm->arch.vpit->pit_state.lock); + memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); + kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); + mutex_unlock(&kvm->arch.vpit->pit_state.lock); + return r; +} + +static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) +{ + int r = 0; + + mutex_lock(&kvm->arch.vpit->pit_state.lock); + memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, + sizeof(ps->channels)); + ps->flags = kvm->arch.vpit->pit_state.flags; + mutex_unlock(&kvm->arch.vpit->pit_state.lock); + memset(&ps->reserved, 0, sizeof(ps->reserved)); + return r; +} + +static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) +{ + int r = 0, start = 0; + u32 prev_legacy, cur_legacy; + mutex_lock(&kvm->arch.vpit->pit_state.lock); + prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; + cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; + if (!prev_legacy && cur_legacy) + start = 1; + memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels, + sizeof(kvm->arch.vpit->pit_state.channels)); + kvm->arch.vpit->pit_state.flags = ps->flags; + kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); + mutex_unlock(&kvm->arch.vpit->pit_state.lock); + return r; +} + +static int kvm_vm_ioctl_reinject(struct kvm *kvm, + struct kvm_reinject_control *control) +{ + if (!kvm->arch.vpit) + return -ENXIO; + mutex_lock(&kvm->arch.vpit->pit_state.lock); + kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; + mutex_unlock(&kvm->arch.vpit->pit_state.lock); + return 0; +} + +/* + * Get (and clear) the dirty memory log for a memory slot. + */ +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log) +{ + int r, i; + struct kvm_memory_slot *memslot; + unsigned long n; + unsigned long is_dirty = 0; + + mutex_lock(&kvm->slots_lock); + + r = -EINVAL; + if (log->slot >= KVM_MEMORY_SLOTS) + goto out; + + memslot = &kvm->memslots->memslots[log->slot]; + r = -ENOENT; + if (!memslot->dirty_bitmap) + goto out; + + n = kvm_dirty_bitmap_bytes(memslot); + + for (i = 0; !is_dirty && i < n/sizeof(long); i++) + is_dirty = memslot->dirty_bitmap[i]; + + /* If nothing is dirty, don't bother messing with page tables. */ + if (is_dirty) { + struct kvm_memslots *slots, *old_slots; + unsigned long *dirty_bitmap; + + dirty_bitmap = memslot->dirty_bitmap_head; + if (memslot->dirty_bitmap == dirty_bitmap) + dirty_bitmap += n / sizeof(long); + memset(dirty_bitmap, 0, n); + + r = -ENOMEM; + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!slots) + goto out; + memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); + slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; + slots->generation++; + + old_slots = kvm->memslots; + rcu_assign_pointer(kvm->memslots, slots); + kvm_synchronize_srcu_expedited(&kvm->srcu); + dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; + kfree(old_slots); + + spin_lock(&kvm->mmu_lock); + kvm_mmu_slot_remove_write_access(kvm, log->slot); + spin_unlock(&kvm->mmu_lock); + + r = -EFAULT; + if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) + goto out; + } else { + r = -EFAULT; + if (clear_user(log->dirty_bitmap, n)) + goto out; + } + + r = 0; +out: + mutex_unlock(&kvm->slots_lock); + return r; +} + +long kvm_arch_vm_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm *kvm = filp->private_data; + void *argp = (void *)arg; + int r = -ENOTTY; + /* + * This union makes it completely explicit to gcc-3.x + * that these two variables' stack usage should be + * combined, not added together. + */ + union { + struct kvm_pit_state ps; + struct kvm_pit_state2 ps2; + struct kvm_pit_config pit_config; + } u; + + switch (ioctl) { + case KVM_SET_TSS_ADDR: + r = kvm_vm_ioctl_set_tss_addr(kvm, arg); + if (r < 0) + goto out; + break; + case KVM_SET_IDENTITY_MAP_ADDR: { + u64 ident_addr; + + r = -EFAULT; + if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) + goto out; + r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); + if (r < 0) + goto out; + break; + } + case KVM_SET_NR_MMU_PAGES: + r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); + if (r) + goto out; + break; + case KVM_GET_NR_MMU_PAGES: + r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); + break; + case KVM_CREATE_IRQCHIP: { + struct kvm_pic *vpic; + + mutex_lock(&kvm->lock); + r = -EEXIST; + if (kvm->arch.vpic) + goto create_irqchip_unlock; + r = -ENOMEM; + vpic = kvm_create_pic(kvm); + if (vpic) { + r = kvm_ioapic_init(kvm); + if (r) { + mutex_lock(&kvm->slots_lock); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, + &vpic->dev); + mutex_unlock(&kvm->slots_lock); + kfree(vpic); + goto create_irqchip_unlock; + } + } else + goto create_irqchip_unlock; + smp_wmb(); + kvm->arch.vpic = vpic; + smp_wmb(); + r = kvm_setup_default_irq_routing(kvm); + if (r) { + mutex_lock(&kvm->slots_lock); + mutex_lock(&kvm->irq_lock); + kvm_ioapic_destroy(kvm); + kvm_destroy_pic(kvm); + mutex_unlock(&kvm->irq_lock); + mutex_unlock(&kvm->slots_lock); + } + create_irqchip_unlock: + mutex_unlock(&kvm->lock); + break; + } + case KVM_CREATE_PIT: + u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; + goto create_pit; + case KVM_CREATE_PIT2: + r = -EFAULT; + if (copy_from_user(&u.pit_config, argp, + sizeof(struct kvm_pit_config))) + goto out; + create_pit: + mutex_lock(&kvm->slots_lock); + r = -EEXIST; + if (kvm->arch.vpit) + goto create_pit_unlock; + r = -ENOMEM; + kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags); + if (kvm->arch.vpit) + r = 0; + create_pit_unlock: + mutex_unlock(&kvm->slots_lock); + break; + case KVM_IRQ_LINE_STATUS: + case KVM_IRQ_LINE: { + struct kvm_irq_level irq_event; + + r = -EFAULT; + if (copy_from_user(&irq_event, argp, sizeof irq_event)) + goto out; + r = -ENXIO; + if (irqchip_in_kernel(kvm)) { + __s32 status; + status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + irq_event.irq, irq_event.level); + if (ioctl == KVM_IRQ_LINE_STATUS) { + r = -EFAULT; + irq_event.status = status; + if (copy_to_user(argp, &irq_event, + sizeof irq_event)) + goto out; + } + r = 0; + } + break; + } + case KVM_GET_IRQCHIP: { + /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ + struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); + + r = -ENOMEM; + if (!chip) + goto out; + r = -EFAULT; + if (copy_from_user(chip, argp, sizeof *chip)) + goto get_irqchip_out; + r = -ENXIO; + if (!irqchip_in_kernel(kvm)) + goto get_irqchip_out; + r = kvm_vm_ioctl_get_irqchip(kvm, chip); + if (r) + goto get_irqchip_out; + r = -EFAULT; + if (copy_to_user(argp, chip, sizeof *chip)) + goto get_irqchip_out; + r = 0; + get_irqchip_out: + kfree(chip); + if (r) + goto out; + break; + } + case KVM_SET_IRQCHIP: { + /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ + struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); + + r = -ENOMEM; + if (!chip) + goto out; + r = -EFAULT; + if (copy_from_user(chip, argp, sizeof *chip)) + goto set_irqchip_out; + r = -ENXIO; + if (!irqchip_in_kernel(kvm)) + goto set_irqchip_out; + r = kvm_vm_ioctl_set_irqchip(kvm, chip); + if (r) + goto set_irqchip_out; + r = 0; + set_irqchip_out: + kfree(chip); + if (r) + goto out; + break; + } + case KVM_GET_PIT: { + r = -EFAULT; + if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) + goto out; + r = -ENXIO; + if (!kvm->arch.vpit) + goto out; + r = kvm_vm_ioctl_get_pit(kvm, &u.ps); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) + goto out; + r = 0; + break; + } + case KVM_SET_PIT: { + r = -EFAULT; + if (copy_from_user(&u.ps, argp, sizeof u.ps)) + goto out; + r = -ENXIO; + if (!kvm->arch.vpit) + goto out; + r = kvm_vm_ioctl_set_pit(kvm, &u.ps); + if (r) + goto out; + r = 0; + break; + } + case KVM_GET_PIT2: { + r = -ENXIO; + if (!kvm->arch.vpit) + goto out; + r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &u.ps2, sizeof(u.ps2))) + goto out; + r = 0; + break; + } + case KVM_SET_PIT2: { + r = -EFAULT; + if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) + goto out; + r = -ENXIO; + if (!kvm->arch.vpit) + goto out; + r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); + if (r) + goto out; + r = 0; + break; + } + case KVM_REINJECT_CONTROL: { + struct kvm_reinject_control control; + r = -EFAULT; + if (copy_from_user(&control, argp, sizeof(control))) + goto out; + r = kvm_vm_ioctl_reinject(kvm, &control); + if (r) + goto out; + r = 0; + break; + } + case KVM_XEN_HVM_CONFIG: { + r = -EFAULT; + if (copy_from_user(&kvm->arch.xen_hvm_config, argp, + sizeof(struct kvm_xen_hvm_config))) + goto out; + r = -EINVAL; + if (kvm->arch.xen_hvm_config.flags) + goto out; + r = 0; + break; + } + case KVM_SET_CLOCK: { + struct kvm_clock_data user_ns; + u64 now_ns; + s64 delta; + + r = -EFAULT; + if (copy_from_user(&user_ns, argp, sizeof(user_ns))) + goto out; + + r = -EINVAL; + if (user_ns.flags) + goto out; + + r = 0; + local_irq_disable(); + now_ns = get_kernel_ns(); + delta = user_ns.clock - now_ns; + local_irq_enable(); + kvm->arch.kvmclock_offset = delta; + break; + } + case KVM_GET_CLOCK: { + struct kvm_clock_data user_ns; + u64 now_ns; + + local_irq_disable(); + now_ns = get_kernel_ns(); + user_ns.clock = kvm->arch.kvmclock_offset + now_ns; + local_irq_enable(); + user_ns.flags = 0; + memset(&user_ns.pad, 0, sizeof(user_ns.pad)); + + r = -EFAULT; + if (copy_to_user(argp, &user_ns, sizeof(user_ns))) + goto out; + r = 0; + break; + } + + default: + ; + } +out: + return r; +} + +static void kvm_init_msr_list(void) +{ + u32 dummy[2]; + unsigned i, j; + + /* skip the first msrs in the list. KVM-specific */ + for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { + if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) + continue; + if (j < i) + msrs_to_save[j] = msrs_to_save[i]; + j++; + } + num_msrs_to_save = j; +} + +static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, + const void *v) +{ + if (vcpu->arch.apic && + !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) + return 0; + + return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); +} + +static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) +{ + if (vcpu->arch.apic && + !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) + return 0; + + return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); +} + +static void kvm_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + kvm_x86_ops->set_segment(vcpu, var, seg); +} + +void kvm_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + kvm_x86_ops->get_segment(vcpu, var, seg); +} + +static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) +{ + return gpa; +} + +static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) +{ + gpa_t t_gpa; + struct x86_exception exception; + + BUG_ON(!mmu_is_nested(vcpu)); + + /* NPT walks are always user-walks */ + access |= PFERR_USER_MASK; + t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception); + + return t_gpa; +} + +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, + struct x86_exception *exception) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); +} + + gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, + struct x86_exception *exception) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + access |= PFERR_FETCH_MASK; + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); +} + +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, + struct x86_exception *exception) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + access |= PFERR_WRITE_MASK; + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); +} + +/* uses this to access any guest's mapped memory without checking CPL */ +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, + struct x86_exception *exception) +{ + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception); +} + +static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 access, + struct x86_exception *exception) +{ + void *data = val; + int r = X86EMUL_CONTINUE; + + while (bytes) { + gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, + exception); + unsigned offset = addr & (PAGE_SIZE-1); + unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); + int ret; + + if (gpa == UNMAPPED_GVA) + return X86EMUL_PROPAGATE_FAULT; + ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); + if (ret < 0) { + r = X86EMUL_IO_NEEDED; + goto out; + } + + bytes -= toread; + data += toread; + addr += toread; + } +out: + return r; +} + +/* used for instruction fetching */ +static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, + struct x86_exception *exception) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, + access | PFERR_FETCH_MASK, + exception); +} + +static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, + struct x86_exception *exception) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, + exception); +} + +static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, + struct x86_exception *exception) +{ + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); +} + +static int kvm_write_guest_virt_system(gva_t addr, void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu, + struct x86_exception *exception) +{ + void *data = val; + int r = X86EMUL_CONTINUE; + + while (bytes) { + gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, + PFERR_WRITE_MASK, + exception); + unsigned offset = addr & (PAGE_SIZE-1); + unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); + int ret; + + if (gpa == UNMAPPED_GVA) + return X86EMUL_PROPAGATE_FAULT; + ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); + if (ret < 0) { + r = X86EMUL_IO_NEEDED; + goto out; + } + + bytes -= towrite; + data += towrite; + addr += towrite; + } +out: + return r; +} + +static int emulator_read_emulated(unsigned long addr, + void *val, + unsigned int bytes, + struct x86_exception *exception, + struct kvm_vcpu *vcpu) +{ + gpa_t gpa; + + if (vcpu->mmio_read_completed) { + memcpy(val, vcpu->mmio_data, bytes); + trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, + vcpu->mmio_phys_addr, *(u64 *)val); + vcpu->mmio_read_completed = 0; + return X86EMUL_CONTINUE; + } + + gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception); + + if (gpa == UNMAPPED_GVA) + return X86EMUL_PROPAGATE_FAULT; + + /* For APIC access vmexit */ + if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + goto mmio; + + if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception) + == X86EMUL_CONTINUE) + return X86EMUL_CONTINUE; + +mmio: + /* + * Is this MMIO handled locally? + */ + if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { + trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val); + return X86EMUL_CONTINUE; + } + + trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); + + vcpu->mmio_needed = 1; + vcpu->run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; + vcpu->run->mmio.len = vcpu->mmio_size = bytes; + vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; + + return X86EMUL_IO_NEEDED; +} + +int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, + const void *val, int bytes) +{ + int ret; + + ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); + if (ret < 0) + return 0; + kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); + return 1; +} + +static int emulator_write_emulated_onepage(unsigned long addr, + const void *val, + unsigned int bytes, + struct x86_exception *exception, + struct kvm_vcpu *vcpu) +{ + gpa_t gpa; + + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); + + if (gpa == UNMAPPED_GVA) + return X86EMUL_PROPAGATE_FAULT; + + /* For APIC access vmexit */ + if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + goto mmio; + + if (emulator_write_phys(vcpu, gpa, val, bytes)) + return X86EMUL_CONTINUE; + +mmio: + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); + /* + * Is this MMIO handled locally? + */ + if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) + return X86EMUL_CONTINUE; + + vcpu->mmio_needed = 1; + vcpu->run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; + vcpu->run->mmio.len = vcpu->mmio_size = bytes; + vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; + memcpy(vcpu->run->mmio.data, val, bytes); + + return X86EMUL_CONTINUE; +} + +int emulator_write_emulated(unsigned long addr, + const void *val, + unsigned int bytes, + struct x86_exception *exception, + struct kvm_vcpu *vcpu) +{ + /* Crossing a page boundary? */ + if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { + int rc, now; + + now = -addr & ~PAGE_MASK; + rc = emulator_write_emulated_onepage(addr, val, now, exception, + vcpu); + if (rc != X86EMUL_CONTINUE) + return rc; + addr += now; + val += now; + bytes -= now; + } + return emulator_write_emulated_onepage(addr, val, bytes, exception, + vcpu); +} + +#define CMPXCHG_TYPE(t, ptr, old, new) \ + (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) + +#ifdef CONFIG_X86_64 +# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new) +#else +# define CMPXCHG64(ptr, old, new) \ + (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) +#endif + +static int emulator_cmpxchg_emulated(unsigned long addr, + const void *old, + const void *new, + unsigned int bytes, + struct x86_exception *exception, + struct kvm_vcpu *vcpu) +{ + gpa_t gpa; + struct page *page; + char *kaddr; + bool exchanged; + + /* guests cmpxchg8b have to be emulated atomically */ + if (bytes > 8 || (bytes & (bytes - 1))) + goto emul_write; + + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); + + if (gpa == UNMAPPED_GVA || + (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + goto emul_write; + + if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) + goto emul_write; + + page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + if (is_error_page(page)) { + kvm_release_page_clean(page); + goto emul_write; + } + + kaddr = kmap_atomic(page, KM_USER0); + kaddr += offset_in_page(gpa); + switch (bytes) { + case 1: + exchanged = CMPXCHG_TYPE(u8, kaddr, old, new); + break; + case 2: + exchanged = CMPXCHG_TYPE(u16, kaddr, old, new); + break; + case 4: + exchanged = CMPXCHG_TYPE(u32, kaddr, old, new); + break; + case 8: + exchanged = CMPXCHG64(kaddr, old, new); + break; + default: + BUG(); + } + kunmap_atomic(kaddr, KM_USER0); + kvm_release_page_dirty(page); + + if (!exchanged) + return X86EMUL_CMPXCHG_FAILED; + + kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1); + + return X86EMUL_CONTINUE; + +emul_write: + printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); + + return emulator_write_emulated(addr, new, bytes, exception, vcpu); +} + +static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) +{ + /* TODO: String I/O for in kernel device */ + int r; + + if (vcpu->arch.pio.in) + r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, + vcpu->arch.pio.size, pd); + else + r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, + vcpu->arch.pio.port, vcpu->arch.pio.size, + pd); + return r; +} + + +static int emulator_pio_in_emulated(int size, unsigned short port, void *val, + unsigned int count, struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.pio.count) + goto data_avail; + + trace_kvm_pio(0, port, size, count); + + vcpu->arch.pio.port = port; + vcpu->arch.pio.in = 1; + vcpu->arch.pio.count = count; + vcpu->arch.pio.size = size; + + if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { + data_avail: + memcpy(val, vcpu->arch.pio_data, size * count); + vcpu->arch.pio.count = 0; + return 1; + } + + vcpu->run->exit_reason = KVM_EXIT_IO; + vcpu->run->io.direction = KVM_EXIT_IO_IN; + vcpu->run->io.size = size; + vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; + vcpu->run->io.count = count; + vcpu->run->io.port = port; + + return 0; +} + +static int emulator_pio_out_emulated(int size, unsigned short port, + const void *val, unsigned int count, + struct kvm_vcpu *vcpu) +{ + trace_kvm_pio(1, port, size, count); + + vcpu->arch.pio.port = port; + vcpu->arch.pio.in = 0; + vcpu->arch.pio.count = count; + vcpu->arch.pio.size = size; + + memcpy(vcpu->arch.pio_data, val, size * count); + + if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { + vcpu->arch.pio.count = 0; + return 1; + } + + vcpu->run->exit_reason = KVM_EXIT_IO; + vcpu->run->io.direction = KVM_EXIT_IO_OUT; + vcpu->run->io.size = size; + vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; + vcpu->run->io.count = count; + vcpu->run->io.port = port; + + return 0; +} + +static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + return kvm_x86_ops->get_segment_base(vcpu, seg); +} + +int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) +{ + kvm_mmu_invlpg(vcpu, address); + return X86EMUL_CONTINUE; +} + +int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) +{ + if (!need_emulate_wbinvd(vcpu)) + return X86EMUL_CONTINUE; + + if (kvm_x86_ops->has_wbinvd_exit()) { + int cpu = get_cpu(); + + cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); + smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, + wbinvd_ipi, NULL, 1); + put_cpu(); + cpumask_clear(vcpu->arch.wbinvd_dirty_mask); + } else + wbinvd(); + return X86EMUL_CONTINUE; +} +EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); + +int emulate_clts(struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); + kvm_x86_ops->fpu_activate(vcpu); + return X86EMUL_CONTINUE; +} + +int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu) +{ + return _kvm_get_dr(vcpu, dr, dest); +} + +int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) +{ + + return __kvm_set_dr(vcpu, dr, value); +} + +static u64 mk_cr_64(u64 curr_cr, u32 new_val) +{ + return (curr_cr & ~((1ULL << 32) - 1)) | new_val; +} + +static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) +{ + unsigned long value; + + switch (cr) { + case 0: + value = kvm_read_cr0(vcpu); + break; + case 2: + value = vcpu->arch.cr2; + break; + case 3: + value = kvm_read_cr3(vcpu); + break; + case 4: + value = kvm_read_cr4(vcpu); + break; + case 8: + value = kvm_get_cr8(vcpu); + break; + default: + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + return 0; + } + + return value; +} + +static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) +{ + int res = 0; + + switch (cr) { + case 0: + res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); + break; + case 2: + vcpu->arch.cr2 = val; + break; + case 3: + res = kvm_set_cr3(vcpu, val); + break; + case 4: + res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); + break; + case 8: + res = kvm_set_cr8(vcpu, val); + break; + default: + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + res = -1; + } + + return res; +} + +static int emulator_get_cpl(struct kvm_vcpu *vcpu) +{ + return kvm_x86_ops->get_cpl(vcpu); +} + +static void emulator_get_gdt(struct kvm_desc_ptr *dt, struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->get_gdt(vcpu, dt); +} + +static void emulator_get_idt(struct kvm_desc_ptr *dt, struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->get_idt(vcpu, dt); +} + +static unsigned long emulator_get_cached_segment_base(int seg, + struct kvm_vcpu *vcpu) +{ + return get_segment_base(vcpu, seg); +} + +static bool emulator_get_cached_descriptor(struct kvm_desc_struct *desc, int seg, + struct kvm_vcpu *vcpu) +{ + struct kvm_segment var; + + kvm_get_segment(vcpu, &var, seg); + + if (var.unusable) + return false; + + if (var.g) + var.limit >>= 12; + kvm_set_desc_limit(desc, var.limit); + kvm_set_desc_base(desc, (unsigned long)var.base); + desc->type = var.type; + desc->s = var.s; + desc->dpl = var.dpl; + desc->p = var.present; + desc->avl = var.avl; + desc->l = var.l; + desc->d = var.db; + desc->g = var.g; + + return true; +} + +static void emulator_set_cached_descriptor(struct kvm_desc_struct *desc, int seg, + struct kvm_vcpu *vcpu) +{ + struct kvm_segment var; + + /* needed to preserve selector */ + kvm_get_segment(vcpu, &var, seg); + + var.base = kvm_get_desc_base(desc); + var.limit = kvm_get_desc_limit(desc); + if (desc->g) + var.limit = (var.limit << 12) | 0xfff; + var.type = desc->type; + var.present = desc->p; + var.dpl = desc->dpl; + var.db = desc->d; + var.s = desc->s; + var.l = desc->l; + var.g = desc->g; + var.avl = desc->avl; + var.present = desc->p; + var.unusable = !var.present; + var.padding = 0; + + kvm_set_segment(vcpu, &var, seg); + return; +} + +static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu) +{ + struct kvm_segment kvm_seg; + + kvm_get_segment(vcpu, &kvm_seg, seg); + return kvm_seg.selector; +} + +static void emulator_set_segment_selector(u16 sel, int seg, + struct kvm_vcpu *vcpu) +{ + struct kvm_segment kvm_seg; + + kvm_get_segment(vcpu, &kvm_seg, seg); + kvm_seg.selector = sel; + kvm_set_segment(vcpu, &kvm_seg, seg); +} + +static struct x86_emulate_ops emulate_ops = { + .read_std = kvm_read_guest_virt_system, + .write_std = kvm_write_guest_virt_system, + .fetch = kvm_fetch_guest_virt, + .read_emulated = emulator_read_emulated, + .write_emulated = emulator_write_emulated, + .cmpxchg_emulated = emulator_cmpxchg_emulated, + .pio_in_emulated = emulator_pio_in_emulated, + .pio_out_emulated = emulator_pio_out_emulated, + .get_cached_descriptor = emulator_get_cached_descriptor, + .set_cached_descriptor = emulator_set_cached_descriptor, + .get_segment_selector = emulator_get_segment_selector, + .set_segment_selector = emulator_set_segment_selector, + .get_cached_segment_base = emulator_get_cached_segment_base, + .get_gdt = emulator_get_gdt, + .get_idt = emulator_get_idt, + .get_cr = emulator_get_cr, + .set_cr = emulator_set_cr, + .cpl = emulator_get_cpl, + .get_dr = emulator_get_dr, + .set_dr = emulator_set_dr, + .set_msr = kvm_set_msr, + .get_msr = kvm_get_msr, +}; + +static void cache_all_regs(struct kvm_vcpu *vcpu) +{ + kvm_register_read(vcpu, VCPU_REGS_RAX); + kvm_register_read(vcpu, VCPU_REGS_RSP); + kvm_register_read(vcpu, VCPU_REGS_RIP); + vcpu->arch.regs_dirty = ~0; +} + +static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) +{ + u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); + /* + * an sti; sti; sequence only disable interrupts for the first + * instruction. So, if the last instruction, be it emulated or + * not, left the system with the INT_STI flag enabled, it + * means that the last instruction is an sti. We should not + * leave the flag on in this case. The same goes for mov ss + */ + if (!(int_shadow & mask)) + kvm_x86_ops->set_interrupt_shadow(vcpu, mask); +} + +static void inject_emulated_exception(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + if (ctxt->exception.vector == PF_VECTOR) + kvm_propagate_fault(vcpu, &ctxt->exception); + else if (ctxt->exception.error_code_valid) + kvm_queue_exception_e(vcpu, ctxt->exception.vector, + ctxt->exception.error_code); + else + kvm_queue_exception(vcpu, ctxt->exception.vector); +} + +static void init_emulate_ctxt(struct kvm_vcpu *vcpu) +{ + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + int cs_db, cs_l; + + cache_all_regs(vcpu); + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + + vcpu->arch.emulate_ctxt.vcpu = vcpu; + vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); + vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); + vcpu->arch.emulate_ctxt.mode = + (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : + (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) + ? X86EMUL_MODE_VM86 : cs_l + ? X86EMUL_MODE_PROT64 : cs_db + ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + memset(c, 0, sizeof(struct decode_cache)); + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); +} + +int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq) +{ + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + int ret; + + init_emulate_ctxt(vcpu); + + vcpu->arch.emulate_ctxt.decode.op_bytes = 2; + vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; + vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip; + ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq); + + if (ret != X86EMUL_CONTINUE) + return EMULATE_FAIL; + + vcpu->arch.emulate_ctxt.eip = c->eip; + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + + if (irq == NMI_VECTOR) + vcpu->arch.nmi_pending = false; + else + vcpu->arch.interrupt.pending = false; + + return EMULATE_DONE; +} +EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); + +static int handle_emulation_failure(struct kvm_vcpu *vcpu) +{ + int r = EMULATE_DONE; + + ++vcpu->stat.insn_emulation_fail; + trace_kvm_emulate_insn_failed(vcpu); + if (!is_guest_mode(vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + r = EMULATE_FAIL; + } + kvm_queue_exception(vcpu, UD_VECTOR); + + return r; +} + +static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa; + + if (tdp_enabled) + return false; + + /* + * if emulation was due to access to shadowed page table + * and it failed try to unshadow page and re-entetr the + * guest to let CPU execute the instruction. + */ + if (kvm_mmu_unprotect_page_virt(vcpu, gva)) + return true; + + gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL); + + if (gpa == UNMAPPED_GVA) + return true; /* let cpu generate fault */ + + if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT))) + return true; + + return false; +} + +int x86_emulate_instruction(struct kvm_vcpu *vcpu, + unsigned long cr2, + int emulation_type, + void *insn, + int insn_len) +{ + int r; + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + + kvm_clear_exception_queue(vcpu); + vcpu->arch.mmio_fault_cr2 = cr2; + /* + * TODO: fix emulate.c to use guest_read/write_register + * instead of direct ->regs accesses, can save hundred cycles + * on Intel for instructions that don't read/change RSP, for + * for example. + */ + cache_all_regs(vcpu); + + if (!(emulation_type & EMULTYPE_NO_DECODE)) { + init_emulate_ctxt(vcpu); + vcpu->arch.emulate_ctxt.interruptibility = 0; + vcpu->arch.emulate_ctxt.have_exception = false; + vcpu->arch.emulate_ctxt.perm_ok = false; + + r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); + if (r == X86EMUL_PROPAGATE_FAULT) + goto done; + + trace_kvm_emulate_insn_start(vcpu); + + /* Only allow emulation of specific instructions on #UD + * (namely VMMCALL, sysenter, sysexit, syscall)*/ + if (emulation_type & EMULTYPE_TRAP_UD) { + if (!c->twobyte) + return EMULATE_FAIL; + switch (c->b) { + case 0x01: /* VMMCALL */ + if (c->modrm_mod != 3 || c->modrm_rm != 1) + return EMULATE_FAIL; + break; + case 0x34: /* sysenter */ + case 0x35: /* sysexit */ + if (c->modrm_mod != 0 || c->modrm_rm != 0) + return EMULATE_FAIL; + break; + case 0x05: /* syscall */ + if (c->modrm_mod != 0 || c->modrm_rm != 0) + return EMULATE_FAIL; + break; + default: + return EMULATE_FAIL; + } + + if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) + return EMULATE_FAIL; + } + + ++vcpu->stat.insn_emulation; + if (r) { + if (reexecute_instruction(vcpu, cr2)) + return EMULATE_DONE; + if (emulation_type & EMULTYPE_SKIP) + return EMULATE_FAIL; + return handle_emulation_failure(vcpu); + } + } + + if (emulation_type & EMULTYPE_SKIP) { + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); + return EMULATE_DONE; + } + + /* this is needed for vmware backdor interface to work since it + changes registers values during IO operation */ + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + +restart: + r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); + + if (r == EMULATION_FAILED) { + if (reexecute_instruction(vcpu, cr2)) + return EMULATE_DONE; + + return handle_emulation_failure(vcpu); + } + +done: + if (vcpu->arch.emulate_ctxt.have_exception) { + inject_emulated_exception(vcpu); + r = EMULATE_DONE; + } else if (vcpu->arch.pio.count) { + if (!vcpu->arch.pio.in) + vcpu->arch.pio.count = 0; + r = EMULATE_DO_MMIO; + } else if (vcpu->mmio_needed) { + if (vcpu->mmio_is_write) + vcpu->mmio_needed = 0; + r = EMULATE_DO_MMIO; + } else if (r == EMULATION_RESTART) + goto restart; + else + r = EMULATE_DONE; + + toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); + kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_make_request(KVM_REQ_EVENT, vcpu); + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + + return r; +} +EXPORT_SYMBOL_GPL(x86_emulate_instruction); + +int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) +{ + unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); + int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu); + /* do not return to emulator after return from userspace */ + vcpu->arch.pio.count = 0; + return ret; +} +EXPORT_SYMBOL_GPL(kvm_fast_pio_out); + +static void tsc_bad(void *info) +{ + kvm___this_cpu_write(cpu_tsc_khz, 0); +} + +static void tsc_khz_changed(void *data) +{ + struct cpufreq_freqs *freq = data; + unsigned long khz = 0; + + if (data) + khz = freq->new; + else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + khz = cpufreq_quick_get(raw_smp_processor_id()); + if (!khz) + khz = tsc_khz; + kvm___this_cpu_write(cpu_tsc_khz, khz); +} + +static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int i, send_ipi = 0; + + /* + * We allow guests to temporarily run on slowing clocks, + * provided we notify them after, or to run on accelerating + * clocks, provided we notify them before. Thus time never + * goes backwards. + * + * However, we have a problem. We can't atomically update + * the frequency of a given CPU from this function; it is + * merely a notifier, which can be called from any CPU. + * Changing the TSC frequency at arbitrary points in time + * requires a recomputation of local variables related to + * the TSC for each VCPU. We must flag these local variables + * to be updated and be sure the update takes place with the + * new frequency before any guests proceed. + * + * Unfortunately, the combination of hotplug CPU and frequency + * change creates an intractable locking scenario; the order + * of when these callouts happen is undefined with respect to + * CPU hotplug, and they can race with each other. As such, + * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is + * undefined; you can actually have a CPU frequency change take + * place in between the computation of X and the setting of the + * variable. To protect against this problem, all updates of + * the per_cpu tsc_khz variable are done in an interrupt + * protected IPI, and all callers wishing to update the value + * must wait for a synchronous IPI to complete (which is trivial + * if the caller is on the CPU already). This establishes the + * necessary total order on variable updates. + * + * Note that because a guest time update may take place + * anytime after the setting of the VCPU's request bit, the + * correct TSC value must be set before the request. However, + * to ensure the update actually makes it to any guest which + * starts running in hardware virtualization between the set + * and the acquisition of the spinlock, we must also ping the + * CPU after setting the request bit. + * + */ + + if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) + return 0; + if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) + return 0; + + smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); + + spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) { + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->cpu != freq->cpu) + continue; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + if (vcpu->cpu != smp_processor_id()) + send_ipi = 1; + } + } + spin_unlock(&kvm_lock); + + if (freq->old < freq->new && send_ipi) { + /* + * We upscale the frequency. Must make the guest + * doesn't see old kvmclock values while running with + * the new frequency, otherwise we risk the guest sees + * time go backwards. + * + * In case we update the frequency for another cpu + * (which might be in guest context) send an interrupt + * to kick the cpu out of guest context. Next time + * guest context is entered kvmclock will be updated, + * so the guest will not see stale values. + */ + smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); + } + return 0; +} + +static struct notifier_block kvmclock_cpufreq_notifier_block = { + .notifier_call = kvmclock_cpufreq_notifier +}; + +static int kvmclock_cpu_notifier(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); + break; + case CPU_DOWN_PREPARE: + smp_call_function_single(cpu, tsc_bad, NULL, 1); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block kvmclock_cpu_notifier_block = { + .notifier_call = kvmclock_cpu_notifier, + .priority = -INT_MAX +}; + +static void kvm_timer_init(void) +{ + int cpu; + + max_tsc_khz = tsc_khz; + register_hotcpu_notifier(&kvmclock_cpu_notifier_block); + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { +#ifdef CONFIG_CPU_FREQ + struct cpufreq_policy policy; + memset(&policy, 0, sizeof(policy)); + cpu = get_cpu(); + cpufreq_get_policy(&policy, cpu); + if (policy.cpuinfo.max_freq) + max_tsc_khz = policy.cpuinfo.max_freq; + put_cpu(); +#endif + cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + } + pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); + for_each_online_cpu(cpu) + smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); +} + +static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); + +static int kvm_is_in_guest(void) +{ + return percpu_read(current_vcpu) != NULL; +} + +static int kvm_is_user_mode(void) +{ + int user_mode = 3; + + if (percpu_read(current_vcpu)) + user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu)); + + return user_mode != 0; +} + +static unsigned long kvm_get_guest_ip(void) +{ + unsigned long ip = 0; + + if (percpu_read(current_vcpu)) + ip = kvm_rip_read(percpu_read(current_vcpu)); + + return ip; +} + +static struct perf_guest_info_callbacks kvm_guest_cbs = { + .is_in_guest = kvm_is_in_guest, + .is_user_mode = kvm_is_user_mode, + .get_guest_ip = kvm_get_guest_ip, +}; + +void kvm_before_handle_nmi(struct kvm_vcpu *vcpu) +{ + percpu_write(current_vcpu, vcpu); +} +EXPORT_SYMBOL_GPL(kvm_before_handle_nmi); + +void kvm_after_handle_nmi(struct kvm_vcpu *vcpu) +{ + percpu_write(current_vcpu, NULL); +} +EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); + +int kvm_arch_init(void *opaque) +{ + int r; + struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; + + if (kvm_x86_ops) { + printk(KERN_ERR "kvm: already loaded the other module\n"); + r = -EEXIST; + goto out; + } + + if (!ops->cpu_has_kvm_support()) { + printk(KERN_ERR "kvm: no hardware support\n"); + r = -EOPNOTSUPP; + goto out; + } + if (ops->disabled_by_bios()) { + printk(KERN_ERR "kvm: disabled by bios\n"); +#ifndef KVM_TBOOT_ENABLED_WORKS + + printk(KERN_ERR "kvm: if TXT is enabled in the BIOS, disable it\n"); +#endif + r = -EOPNOTSUPP; + goto out; + } + + r = kvm_mmu_module_init(); + if (r) + goto out; + + kvm_init_msr_list(); + + kvm_xstate_size_init(); + + kvm_x86_ops = ops; + kvm_mmu_set_nonpresent_ptes(0ull, 0ull); + kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, + PT_DIRTY_MASK, PT64_NX_MASK, 0); + + kvm_timer_init(); + + perf_register_guest_info_callbacks(&kvm_guest_cbs); + + if (kvm_cpu_has_xsave) + host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + + return 0; + +out: + return r; +} + +void kvm_arch_exit(void) +{ + perf_unregister_guest_info_callbacks(&kvm_guest_cbs); + + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); + kvm_x86_ops = NULL; + kvm_mmu_module_exit(); +} + +int kvm_emulate_halt(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.halt_exits; + if (irqchip_in_kernel(vcpu->kvm)) { + vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + return 1; + } else { + vcpu->run->exit_reason = KVM_EXIT_HLT; + return 0; + } +} +EXPORT_SYMBOL_GPL(kvm_emulate_halt); + +static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, + unsigned long a1) +{ + if (is_long_mode(vcpu)) + return a0; + else + return a0 | ((gpa_t)a1 << 32); +} + +int kvm_hv_hypercall(struct kvm_vcpu *vcpu) +{ + u64 param, ingpa, outgpa, ret; + uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; + bool fast, longmode; + int cs_db, cs_l; + + /* + * hypercall generates UD from non zero cpl and real mode + * per HYPER-V spec + */ + if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 0; + } + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + longmode = is_long_mode(vcpu) && cs_l == 1; + + if (!longmode) { + param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); + ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); + outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); + } +#ifdef CONFIG_X86_64 + else { + param = kvm_register_read(vcpu, VCPU_REGS_RCX); + ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); + outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); + } +#endif + + code = param & 0xffff; + fast = (param >> 16) & 0x1; + rep_cnt = (param >> 32) & 0xfff; + rep_idx = (param >> 48) & 0xfff; + + trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); + + switch (code) { + case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: + kvm_vcpu_on_spin(vcpu); + break; + default: + res = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + } + + ret = res | (((u64)rep_done & 0xfff) << 32); + if (longmode) { + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); + } else { + kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); + kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); + } + + return 1; +} + +int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) +{ + unsigned long nr, a0, a1, a2, a3, ret; + int r = 1; + + if (kvm_hv_hypercall_enabled(vcpu->kvm)) + return kvm_hv_hypercall(vcpu); + + nr = kvm_register_read(vcpu, VCPU_REGS_RAX); + a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); + a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); + a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); + a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); + + trace_kvm_hypercall(nr, a0, a1, a2, a3); + + if (!is_long_mode(vcpu)) { + nr &= 0xFFFFFFFF; + a0 &= 0xFFFFFFFF; + a1 &= 0xFFFFFFFF; + a2 &= 0xFFFFFFFF; + a3 &= 0xFFFFFFFF; + } + + if (kvm_x86_ops->get_cpl(vcpu) != 0) { + ret = -KVM_EPERM; + goto out; + } + + switch (nr) { + case KVM_HC_VAPIC_POLL_IRQ: + ret = 0; + break; + case KVM_HC_MMU_OP: + r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); + break; + default: + ret = -KVM_ENOSYS; + break; + } +out: + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); + ++vcpu->stat.hypercalls; + return r; +} +EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); + +int kvm_fix_hypercall(struct kvm_vcpu *vcpu) +{ + char instruction[3]; + unsigned long rip = kvm_rip_read(vcpu); + + /* + * Blow out the MMU to ensure that no other VCPU has an active mapping + * to ensure that the updated hypercall appears atomically across all + * VCPUs. + */ + kvm_mmu_zap_all(vcpu->kvm); + + kvm_x86_ops->patch_hypercall(vcpu, instruction); + + return emulator_write_emulated(rip, instruction, 3, NULL, vcpu); +} + +void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) +{ + struct kvm_desc_ptr dt = { limit, base }; + + kvm_x86_ops->set_gdt(vcpu, &dt); +} + +void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) +{ + struct kvm_desc_ptr dt = { limit, base }; + + kvm_x86_ops->set_idt(vcpu, &dt); +} + +static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) +{ + struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; + int j, nent = vcpu->arch.cpuid_nent; + + e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; + /* when no next entry is found, the current entry[i] is reselected */ + for (j = i + 1; ; j = (j + 1) % nent) { + struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; + if (ej->function == e->function) { + ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; + return j; + } + } + return 0; /* silence gcc, even though control never reaches here */ +} + +/* find an entry with matching function, matching index (if needed), and that + * should be read next (if it's stateful) */ +static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, + u32 function, u32 index) +{ + if (e->function != function) + return 0; + if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) + return 0; + if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && + !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) + return 0; + return 1; +} + +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function, u32 index) +{ + int i; + struct kvm_cpuid_entry2 *best = NULL; + + for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { + struct kvm_cpuid_entry2 *e; + + e = &vcpu->arch.cpuid_entries[i]; + if (is_matching_cpuid_entry(e, function, index)) { + if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) + move_to_next_stateful_cpuid_entry(vcpu, i); + best = e; + break; + } + /* + * Both basic or both extended? + */ + if (((e->function ^ function) & 0x80000000) == 0) + if (!best || e->function > best->function) + best = e; + } + return best; +} +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); + +int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); + if (!best || best->eax < 0x80000008) + goto not_found; + best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + if (best) + return best->eax & 0xff; +not_found: + return 36; +} + +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +{ + u32 function, index; + struct kvm_cpuid_entry2 *best; + + function = kvm_register_read(vcpu, VCPU_REGS_RAX); + index = kvm_register_read(vcpu, VCPU_REGS_RCX); + kvm_register_write(vcpu, VCPU_REGS_RAX, 0); + kvm_register_write(vcpu, VCPU_REGS_RBX, 0); + kvm_register_write(vcpu, VCPU_REGS_RCX, 0); + kvm_register_write(vcpu, VCPU_REGS_RDX, 0); + best = kvm_find_cpuid_entry(vcpu, function, index); + if (best) { + kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); + kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); + kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); + kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); + } + kvm_x86_ops->skip_emulated_instruction(vcpu); + trace_kvm_cpuid(function, + kvm_register_read(vcpu, VCPU_REGS_RAX), + kvm_register_read(vcpu, VCPU_REGS_RBX), + kvm_register_read(vcpu, VCPU_REGS_RCX), + kvm_register_read(vcpu, VCPU_REGS_RDX)); +} +EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); + +/* + * Check if userspace requested an interrupt window, and that the + * interrupt window is open. + * + * No need to exit to userspace if we already have an interrupt queued. + */ +static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) +{ + return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && + vcpu->run->request_interrupt_window && + kvm_arch_interrupt_allowed(vcpu)); +} + +static void post_kvm_run_save(struct kvm_vcpu *vcpu) +{ + struct kvm_run *kvm_run = vcpu->run; + + kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; + kvm_run->cr8 = kvm_get_cr8(vcpu); + kvm_run->apic_base = kvm_get_apic_base(vcpu); + if (irqchip_in_kernel(vcpu->kvm)) + kvm_run->ready_for_interrupt_injection = 1; + else + kvm_run->ready_for_interrupt_injection = + kvm_arch_interrupt_allowed(vcpu) && + !kvm_cpu_has_interrupt(vcpu) && + !kvm_event_needs_reinjection(vcpu); +} + +static void vapic_enter(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + struct page *page; + + if (!apic || !apic->vapic_addr) + return; + + page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); + + vcpu->arch.apic->vapic_page = page; +} + +static void vapic_exit(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + int idx; + + if (!apic || !apic->vapic_addr) + return; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_release_page_dirty(apic->vapic_page); + mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); + srcu_read_unlock(&vcpu->kvm->srcu, idx); +} + +static void update_cr8_intercept(struct kvm_vcpu *vcpu) +{ + int max_irr, tpr; + + if (!kvm_x86_ops->update_cr8_intercept) + return; + + if (!vcpu->arch.apic) + return; + + if (!vcpu->arch.apic->vapic_addr) + max_irr = kvm_lapic_find_highest_irr(vcpu); + else + max_irr = -1; + + if (max_irr != -1) + max_irr >>= 4; + + tpr = kvm_lapic_get_cr8(vcpu); + + kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); +} + +static void inject_pending_event(struct kvm_vcpu *vcpu) +{ + /* try to reinject previous events if any */ + if (vcpu->arch.exception.pending) { + trace_kvm_inj_exception(vcpu->arch.exception.nr, + vcpu->arch.exception.has_error_code, + vcpu->arch.exception.error_code); + kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, + vcpu->arch.exception.has_error_code, + vcpu->arch.exception.error_code, + vcpu->arch.exception.reinject); + return; + } + + if (vcpu->arch.nmi_injected) { + kvm_x86_ops->set_nmi(vcpu); + return; + } + + if (vcpu->arch.interrupt.pending) { + kvm_x86_ops->set_irq(vcpu); + return; + } + + /* try to inject new event if pending */ + if (vcpu->arch.nmi_pending) { + if (kvm_x86_ops->nmi_allowed(vcpu)) { + vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_injected = true; + kvm_x86_ops->set_nmi(vcpu); + } + } else if (kvm_cpu_has_interrupt(vcpu)) { + if (kvm_x86_ops->interrupt_allowed(vcpu)) { + kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), + false); + kvm_x86_ops->set_irq(vcpu); + } + } +} + +static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) +{ + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && + !vcpu->guest_xcr0_loaded) { + /* kvm_set_xcr() also depends on this */ + xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); + vcpu->guest_xcr0_loaded = 1; + } +} + +static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) +{ + if (vcpu->guest_xcr0_loaded) { + if (vcpu->arch.xcr0 != host_xcr0) + xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); + vcpu->guest_xcr0_loaded = 0; + } +} + +static int vcpu_enter_guest(struct kvm_vcpu *vcpu) +{ + int r; + bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && + vcpu->run->request_interrupt_window; + + if (vcpu->requests) { + if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) + kvm_mmu_unload(vcpu); + if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) + __kvm_migrate_timers(vcpu); + if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { + r = kvm_guest_time_update(vcpu); + if (unlikely(r)) + goto out; + } + if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) + kvm_mmu_sync_roots(vcpu); + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) + kvm_x86_ops->tlb_flush(vcpu); + if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; + r = 0; + goto out; + } + if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + r = 0; + goto out; + } + if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) { + vcpu->fpu_active = 0; + kvm_x86_ops->fpu_deactivate(vcpu); + } + if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { + /* Page is swapped out. Do synthetic halt */ + vcpu->arch.apf.halted = true; + r = 1; + goto out; + } + } + + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) + goto out; + + if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { + inject_pending_event(vcpu); + + /* enable NMI/IRQ window open exits if needed */ + if (vcpu->arch.nmi_pending) + kvm_x86_ops->enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + kvm_x86_ops->enable_irq_window(vcpu); + + if (kvm_lapic_enabled(vcpu)) { + update_cr8_intercept(vcpu); + kvm_lapic_sync_to_vapic(vcpu); + } + } + + preempt_disable(); + + kvm_x86_ops->prepare_guest_switch(vcpu); + if (vcpu->fpu_active) + kvm_load_guest_fpu(vcpu); + kvm_load_guest_xcr0(vcpu); + + atomic_set(&vcpu->guest_mode, 1); + smp_wmb(); + + local_irq_disable(); + + if (!atomic_read(&vcpu->guest_mode) || vcpu->requests + || need_resched() || signal_pending(current)) { + atomic_set(&vcpu->guest_mode, 0); + smp_wmb(); + local_irq_enable(); + preempt_enable(); + kvm_x86_ops->cancel_injection(vcpu); + r = 1; + goto out; + } + + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + + kvm_guest_enter(); + + if (unlikely(vcpu->arch.switch_db_regs)) { + set_debugreg(0, 7); + set_debugreg(vcpu->arch.eff_db[0], 0); + set_debugreg(vcpu->arch.eff_db[1], 1); + set_debugreg(vcpu->arch.eff_db[2], 2); + set_debugreg(vcpu->arch.eff_db[3], 3); + } + + trace_kvm_entry(vcpu->vcpu_id); + kvm_x86_ops->run(vcpu); + + /* + * If the guest has used debug registers, at least dr7 + * will be disabled while returning to the host. + * If we don't have active breakpoints in the host, we don't + * care about the messed up debug address registers. But if + * we have some of them active, restore the old state. + */ + if (hw_breakpoint_active()) + hw_breakpoint_restore(); + + kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); + + atomic_set(&vcpu->guest_mode, 0); + smp_wmb(); + local_irq_enable(); + + ++vcpu->stat.exits; + + /* + * We must have an instruction between local_irq_enable() and + * kvm_guest_exit(), so the timer interrupt isn't delayed by + * the interrupt shadow. The stat.exits increment will do nicely. + * But we need to prevent reordering, hence this barrier(): + */ + barrier(); + + kvm_guest_exit(); + + preempt_enable(); + + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + + /* + * Profile KVM exit RIPs: + */ + if (unlikely(prof_on == KVM_PROFILING)) { + unsigned long rip = kvm_rip_read(vcpu); + profile_hit(KVM_PROFILING, (void *)rip); + } + + + kvm_lapic_sync_from_vapic(vcpu); + + r = kvm_x86_ops->handle_exit(vcpu); +out: + return r; +} + + +static int __vcpu_run(struct kvm_vcpu *vcpu) +{ + int r; + struct kvm *kvm = vcpu->kvm; + + if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { + pr_debug("vcpu %d received sipi with vector # %x\n", + vcpu->vcpu_id, vcpu->arch.sipi_vector); + kvm_lapic_reset(vcpu); + r = kvm_arch_vcpu_reset(vcpu); + if (r) + return r; + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + } + + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + vapic_enter(vcpu); + + r = 1; + while (r > 0) { + if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && + !vcpu->arch.apf.halted) + r = vcpu_enter_guest(vcpu); + else { + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_block(vcpu); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) + { + switch(vcpu->arch.mp_state) { + case KVM_MP_STATE_HALTED: + vcpu->arch.mp_state = + KVM_MP_STATE_RUNNABLE; + case KVM_MP_STATE_RUNNABLE: + vcpu->arch.apf.halted = false; + break; + case KVM_MP_STATE_SIPI_RECEIVED: + default: + r = -EINTR; + break; + } + } + } + + if (r <= 0) + break; + + clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + if (kvm_cpu_has_pending_timer(vcpu)) + kvm_inject_pending_timer_irqs(vcpu); + + if (dm_request_for_irq_injection(vcpu)) { + r = -EINTR; + vcpu->run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.request_irq_exits; + } + + kvm_check_async_pf_completion(vcpu); + + if (signal_pending(current)) { + r = -EINTR; + vcpu->run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; + } + if (need_resched()) { + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + kvm_resched(vcpu); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + } + } + + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + + vapic_exit(vcpu); + + return r; +} + +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + int r; + sigset_t sigsaved; + + if (!tsk_used_math(current) && kvm_init_fpu(current)) + return -ENOMEM; + + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + + if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { + kvm_vcpu_block(vcpu); + clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + r = -EAGAIN; + goto out; + } + + /* re-sync apic's tpr */ + if (!irqchip_in_kernel(vcpu->kvm)) { + if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { + r = -EINVAL; + goto out; + } + } + + if (vcpu->arch.pio.count || vcpu->mmio_needed) { + if (vcpu->mmio_needed) { + memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); + vcpu->mmio_read_completed = 1; + vcpu->mmio_needed = 0; + } + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + if (r != EMULATE_DONE) { + r = 0; + goto out; + } + } + if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) + kvm_register_write(vcpu, VCPU_REGS_RAX, + kvm_run->hypercall.ret); + + r = __vcpu_run(vcpu); + +out: + post_kvm_run_save(vcpu); + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + return r; +} + +int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); + regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); + regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); + regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); + regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); + regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); + regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); + regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); +#ifdef CONFIG_X86_64 + regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); + regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); + regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); + regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); + regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); + regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); + regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); + regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); +#endif + + regs->rip = kvm_rip_read(vcpu); + regs->rflags = kvm_get_rflags(vcpu); + + return 0; +} + +int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); + kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); + kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); + kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); + kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); + kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); + kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); + kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); +#ifdef CONFIG_X86_64 + kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); + kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); + kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); + kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); + kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); + kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); + kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); + kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); +#endif + + kvm_rip_write(vcpu, regs->rip); + kvm_set_rflags(vcpu, regs->rflags); + + vcpu->arch.exception.pending = false; + + kvm_make_request(KVM_REQ_EVENT, vcpu); + + return 0; +} + +void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +{ + struct kvm_segment cs; + + kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); + *db = cs.db; + *l = cs.l; +} +EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); + +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + struct kvm_desc_ptr dt; + + kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); + kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + + kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + + kvm_x86_ops->get_idt(vcpu, &dt); + sregs->idt.limit = dt.size; + sregs->idt.base = dt.address; + kvm_x86_ops->get_gdt(vcpu, &dt); + sregs->gdt.limit = dt.size; + sregs->gdt.base = dt.address; + + sregs->cr0 = kvm_read_cr0(vcpu); + sregs->cr2 = vcpu->arch.cr2; + sregs->cr3 = kvm_read_cr3(vcpu); + sregs->cr4 = kvm_read_cr4(vcpu); + sregs->cr8 = kvm_get_cr8(vcpu); + sregs->efer = vcpu->arch.efer; + sregs->apic_base = kvm_get_apic_base(vcpu); + + memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); + + if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) + set_bit(vcpu->arch.interrupt.nr, + (unsigned long *)sregs->interrupt_bitmap); + + return 0; +} + +int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + mp_state->mp_state = vcpu->arch.mp_state; + return 0; +} + +int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + vcpu->arch.mp_state = mp_state->mp_state; + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; +} + +int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, + bool has_error_code, u32 error_code) +{ + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + int ret; + + init_emulate_ctxt(vcpu); + + ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, + tss_selector, reason, has_error_code, + error_code); + + if (ret) + return EMULATE_FAIL; + + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_make_request(KVM_REQ_EVENT, vcpu); + return EMULATE_DONE; +} +EXPORT_SYMBOL_GPL(kvm_task_switch); + +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + int mmu_reset_needed = 0; + int pending_vec, max_bits; + struct kvm_desc_ptr dt; + + dt.size = sregs->idt.limit; + dt.address = sregs->idt.base; + kvm_x86_ops->set_idt(vcpu, &dt); + dt.size = sregs->gdt.limit; + dt.address = sregs->gdt.base; + kvm_x86_ops->set_gdt(vcpu, &dt); + + vcpu->arch.cr2 = sregs->cr2; + mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; + vcpu->arch.cr3 = sregs->cr3; + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + + kvm_set_cr8(vcpu, sregs->cr8); + + mmu_reset_needed |= vcpu->arch.efer != sregs->efer; + kvm_x86_ops->set_efer(vcpu, sregs->efer); + kvm_set_apic_base(vcpu, sregs->apic_base); + + mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; + kvm_x86_ops->set_cr0(vcpu, sregs->cr0); + vcpu->arch.cr0 = sregs->cr0; + + mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; + kvm_x86_ops->set_cr4(vcpu, sregs->cr4); + if (sregs->cr4 & X86_CR4_OSXSAVE) + update_cpuid(vcpu); + if (!is_long_mode(vcpu) && is_pae(vcpu)) { + load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); + mmu_reset_needed = 1; + } + + if (mmu_reset_needed) + kvm_mmu_reset_context(vcpu); + + max_bits = (sizeof sregs->interrupt_bitmap) << 3; + pending_vec = find_first_bit( + (const unsigned long *)sregs->interrupt_bitmap, max_bits); + if (pending_vec < max_bits) { + kvm_queue_interrupt(vcpu, pending_vec, false); + pr_debug("Set back pending irq %d\n", pending_vec); + if (irqchip_in_kernel(vcpu->kvm)) + kvm_pic_clear_isr_ack(vcpu->kvm); + } + + kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); + kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + + kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + + update_cr8_intercept(vcpu); + + /* Older userspace won't unhalt the vcpu on reset. */ + if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && + sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && + !is_protmode(vcpu)) + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + + kvm_make_request(KVM_REQ_EVENT, vcpu); + + return 0; +} + +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) +{ + unsigned long rflags; + int i, r; + + if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { + r = -EBUSY; + if (vcpu->arch.exception.pending) + goto out; + if (dbg->control & KVM_GUESTDBG_INJECT_DB) + kvm_queue_exception(vcpu, DB_VECTOR); + else + kvm_queue_exception(vcpu, BP_VECTOR); + } + + /* + * Read rflags as long as potentially injected trace flags are still + * filtered out. + */ + rflags = kvm_get_rflags(vcpu); + + vcpu->guest_debug = dbg->control; + if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE)) + vcpu->guest_debug = 0; + + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { + for (i = 0; i < KVM_NR_DB_REGS; ++i) + vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; + vcpu->arch.switch_db_regs = + (dbg->arch.debugreg[7] & DR7_BP_EN_MASK); + } else { + for (i = 0; i < KVM_NR_DB_REGS; i++) + vcpu->arch.eff_db[i] = vcpu->arch.db[i]; + vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); + } + + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + + get_segment_base(vcpu, VCPU_SREG_CS); + + /* + * Trigger an rflags update that will inject or remove the trace + * flags. + */ + kvm_set_rflags(vcpu, rflags); + + kvm_x86_ops->set_guest_debug(vcpu, dbg); + + r = 0; + +out: + + return r; +} + +/* + * Translate a guest virtual address to a guest physical address. + */ +int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, + struct kvm_translation *tr) +{ + unsigned long vaddr = tr->linear_address; + gpa_t gpa; + int idx; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + tr->physical_address = gpa; + tr->valid = gpa != UNMAPPED_GVA; + tr->writeable = 1; + tr->usermode = 0; + + return 0; +} + +int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + struct kvm_i387_fxsave_struct *fxsave = + &vcpu->arch.guest_fpu.state->fxsave; + + memcpy(fpu->fpr, fxsave->st_space, 128); + fpu->fcw = fxsave->cwd; + fpu->fsw = fxsave->swd; + fpu->ftwx = fxsave->twd; + fpu->last_opcode = fxsave->fop; + fpu->last_ip = fxsave->rip; + fpu->last_dp = fxsave->rdp; + memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); + + return 0; +} + +int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + struct kvm_i387_fxsave_struct *fxsave = + &vcpu->arch.guest_fpu.state->fxsave; + + memcpy(fxsave->st_space, fpu->fpr, 128); + fxsave->cwd = fpu->fcw; + fxsave->swd = fpu->fsw; + fxsave->twd = fpu->ftwx; + fxsave->fop = fpu->last_opcode; + fxsave->rip = fpu->last_ip; + fxsave->rdp = fpu->last_dp; + memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); + + return 0; +} + +int fx_init(struct kvm_vcpu *vcpu) +{ + int err; + + err = kvm_fpu_alloc(&vcpu->arch.guest_fpu); + if (err) + return err; + + kvm_fpu_finit(&vcpu->arch.guest_fpu); + + /* + * Ensure guest xcr0 is valid for loading + */ + vcpu->arch.xcr0 = XSTATE_FP; + + vcpu->arch.cr0 |= X86_CR0_ET; + + return 0; +} +EXPORT_SYMBOL_GPL(fx_init); + +static void fx_free(struct kvm_vcpu *vcpu) +{ + kvm_fpu_free(&vcpu->arch.guest_fpu); +} + +void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) +{ + if (vcpu->guest_fpu_loaded) + return; + + /* + * Restore all possible states in the guest, + * and assume host would use all available bits. + * Guest xcr0 would be loaded later. + */ + kvm_put_guest_xcr0(vcpu); + vcpu->guest_fpu_loaded = 1; + unlazy_fpu(current); + kvm_fpu_restore_checking(&vcpu->arch.guest_fpu); + trace_kvm_fpu(1); +} + +void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) +{ + kvm_put_guest_xcr0(vcpu); + + if (!vcpu->guest_fpu_loaded) + return; + + vcpu->guest_fpu_loaded = 0; + kvm_fpu_save_init(&vcpu->arch.guest_fpu); + ++vcpu->stat.fpu_reload; + kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); + trace_kvm_fpu(0); +} + +void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.time_page) { + kvm_release_page_dirty(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } + + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); + fx_free(vcpu); + kvm_x86_ops->vcpu_free(vcpu); +} + +struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, + unsigned int id) +{ + if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) + printk_once(KERN_WARNING + "kvm: SMP vm created on host with unstable TSC; " + "guest TSC will not be reliable\n"); + return kvm_x86_ops->vcpu_create(kvm, id); +} + +int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) +{ + int r; + + vcpu->arch.mtrr_state.have_fixed = 1; + vcpu_load(vcpu); + r = kvm_arch_vcpu_reset(vcpu); + if (r == 0) + r = kvm_mmu_setup(vcpu); + vcpu_put(vcpu); + if (r < 0) + goto free_vcpu; + + return 0; +free_vcpu: + kvm_x86_ops->vcpu_free(vcpu); + return r; +} + +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + vcpu->arch.apf.msr_val = 0; + + vcpu_load(vcpu); + kvm_mmu_unload(vcpu); + vcpu_put(vcpu); + + fx_free(vcpu); + kvm_x86_ops->vcpu_free(vcpu); +} + +int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) +{ + vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_injected = false; + + vcpu->arch.switch_db_regs = 0; + memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); + vcpu->arch.dr6 = DR6_FIXED_1; + vcpu->arch.dr7 = DR7_FIXED_1; + + kvm_make_request(KVM_REQ_EVENT, vcpu); + vcpu->arch.apf.msr_val = 0; + + kvm_clear_async_pf_completion_queue(vcpu); + kvm_async_pf_hash_reset(vcpu); + vcpu->arch.apf.halted = false; + + return kvm_x86_ops->vcpu_reset(vcpu); +} + +int kvm_arch_hardware_enable(void *garbage) +{ + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int i; + + kvm_shared_msr_cpu_online(); + list_for_each_entry(kvm, &vm_list, vm_list) + kvm_for_each_vcpu(i, vcpu, kvm) + if (vcpu->cpu == smp_processor_id()) + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + return kvm_x86_ops->hardware_enable(garbage); +} + +void kvm_arch_hardware_disable(void *garbage) +{ + kvm_x86_ops->hardware_disable(garbage); + drop_user_return_notifiers(garbage); +} + +int kvm_arch_hardware_setup(void) +{ + return kvm_x86_ops->hardware_setup(); +} + +void kvm_arch_hardware_unsetup(void) +{ + kvm_x86_ops->hardware_unsetup(); +} + +void kvm_arch_check_processor_compat(void *rtn) +{ + kvm_x86_ops->check_processor_compatibility(rtn); +} + +int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) +{ + struct page *page; + struct kvm *kvm; + int r; + + BUG_ON(vcpu->kvm == NULL); + kvm = vcpu->kvm; + + vcpu->arch.emulate_ctxt.ops = &emulate_ops; + vcpu->arch.walk_mmu = &vcpu->arch.mmu; + vcpu->arch.mmu.root_hpa = INVALID_PAGE; + vcpu->arch.mmu.translate_gpa = translate_gpa; + vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; + if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + else + vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) { + r = -ENOMEM; + goto fail; + } + vcpu->arch.pio_data = page_address(page); + + if (!kvm->arch.virtual_tsc_khz) + kvm_arch_set_tsc_khz(kvm, max_tsc_khz); + + r = kvm_mmu_create(vcpu); + if (r < 0) + goto fail_free_pio_data; + + if (irqchip_in_kernel(kvm)) { + r = kvm_create_lapic(vcpu); + if (r < 0) + goto fail_mmu_destroy; + } + + vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, + GFP_KERNEL); + if (!vcpu->arch.mce_banks) { + r = -ENOMEM; + goto fail_free_lapic; + } + vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; + + if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) + goto fail_free_mce_banks; + + kvm_async_pf_hash_reset(vcpu); + + return 0; +fail_free_mce_banks: + kfree(vcpu->arch.mce_banks); +fail_free_lapic: + kvm_free_lapic(vcpu); +fail_mmu_destroy: + kvm_mmu_destroy(vcpu); +fail_free_pio_data: + free_page((unsigned long)vcpu->arch.pio_data); +fail: + return r; +} + +void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + int idx; + + kfree(vcpu->arch.mce_banks); + kvm_free_lapic(vcpu); + idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_mmu_destroy(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + free_page((unsigned long)vcpu->arch.pio_data); +} + +int kvm_arch_init_vm(struct kvm *kvm) +{ + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); + + /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ + set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); + + spin_lock_init(&kvm->arch.tsc_write_lock); + + return 0; +} + +static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) +{ + vcpu_load(vcpu); + kvm_mmu_unload(vcpu); + vcpu_put(vcpu); +} + +static void kvm_free_vcpus(struct kvm *kvm) +{ + unsigned int i; + struct kvm_vcpu *vcpu; + + /* + * Unpin any mmu pages first. + */ + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_clear_async_pf_completion_queue(vcpu); + kvm_unload_vcpu_mmu(vcpu); + } + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_arch_vcpu_free(vcpu); + + mutex_lock(&kvm->lock); + for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) + kvm->vcpus[i] = NULL; + + atomic_set(&kvm->online_vcpus, 0); + mutex_unlock(&kvm->lock); +} + +void kvm_arch_sync_events(struct kvm *kvm) +{ + kvm_free_all_assigned_devices(kvm); + kvm_free_pit(kvm); +} + +void kvm_arch_destroy_vm(struct kvm *kvm) +{ + kvm_iommu_unmap_guest(kvm); + kfree(kvm->arch.vpic); + kfree(kvm->arch.vioapic); + kvm_free_vcpus(kvm); + if (kvm->arch.apic_access_page) + put_page(kvm->arch.apic_access_page); + if (kvm->arch.ept_identity_pagetable) + put_page(kvm->arch.ept_identity_pagetable); +} + +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, + int user_alloc) +{ + int npages = memslot->npages; + int map_flags = MAP_PRIVATE | MAP_ANONYMOUS; + + /* Prevent internal slot pages from being moved by fork()/COW. */ + if (memslot->id >= KVM_MEMORY_SLOTS) + map_flags = MAP_SHARED | MAP_ANONYMOUS; + + /*To keep backward compatibility with older userspace, + *x86 needs to hanlde !user_alloc case. + */ + if (!user_alloc) { + if (npages && !old.rmap) { + unsigned long userspace_addr; + + down_write(¤t->mm->mmap_sem); + userspace_addr = do_mmap(NULL, 0, + npages * PAGE_SIZE, + PROT_READ | PROT_WRITE, + map_flags, + 0); + up_write(¤t->mm->mmap_sem); + + if (IS_ERR((void *)userspace_addr)) + return PTR_ERR((void *)userspace_addr); + + memslot->userspace_addr = userspace_addr; + } + } + + + return 0; +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + + int npages = mem->memory_size >> PAGE_SHIFT; + + if (!user_alloc && !old.user_alloc && old.rmap && !npages) { + int ret; + + down_write(¤t->mm->mmap_sem); + ret = do_munmap(current->mm, old.userspace_addr, + old.npages * PAGE_SIZE); + up_write(¤t->mm->mmap_sem); + if (ret < 0) + printk(KERN_WARNING + "kvm_vm_ioctl_set_memory_region: " + "failed to munmap memory\n"); + } + + spin_lock(&kvm->mmu_lock); + if (!kvm->arch.n_requested_mmu_pages) { + unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); + kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); + } + + kvm_mmu_slot_remove_write_access(kvm, mem->slot); + spin_unlock(&kvm->mmu_lock); +} + +void kvm_arch_flush_shadow(struct kvm *kvm) +{ + kvm_mmu_zap_all(kvm); + kvm_reload_remote_mmus(kvm); +} + +int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && + !vcpu->arch.apf.halted) + || !list_empty_careful(&vcpu->async_pf.done) + || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED + || vcpu->arch.nmi_pending || + (kvm_arch_interrupt_allowed(vcpu) && + kvm_cpu_has_interrupt(vcpu)); +} + +void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +{ + int me; + int cpu = vcpu->cpu; + + if (waitqueue_active(&vcpu->wq)) { + wake_up_interruptible(&vcpu->wq); + ++vcpu->stat.halt_wakeup; + } + + me = get_cpu(); + if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) + if (atomic_xchg(&vcpu->guest_mode, 0)) + kvm_smp_send_reschedule(cpu); + put_cpu(); +} + +int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + return kvm_x86_ops->interrupt_allowed(vcpu); +} + +bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip) +{ + unsigned long current_rip = kvm_rip_read(vcpu) + + get_segment_base(vcpu, VCPU_SREG_CS); + + return current_rip == linear_rip; +} +EXPORT_SYMBOL_GPL(kvm_is_linear_rip); + +unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) +{ + unsigned long rflags; + + rflags = kvm_x86_ops->get_rflags(vcpu); + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + rflags &= ~X86_EFLAGS_TF; + return rflags; +} +EXPORT_SYMBOL_GPL(kvm_get_rflags); + +void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && + kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) + rflags |= X86_EFLAGS_TF; + kvm_x86_ops->set_rflags(vcpu, rflags); + kvm_make_request(KVM_REQ_EVENT, vcpu); +} +EXPORT_SYMBOL_GPL(kvm_set_rflags); + +void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) +{ + int r; + + if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) || + is_error_page(work->page)) + return; + + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) + return; + + if (!vcpu->arch.mmu.direct_map && + work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu)) + return; + + vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true); +} + +static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) +{ + return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU)); +} + +static inline u32 kvm_async_pf_next_probe(u32 key) +{ + return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1); +} + +static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + u32 key = kvm_async_pf_hash_fn(gfn); + + while (vcpu->arch.apf.gfns[key] != ~0) + key = kvm_async_pf_next_probe(key); + + vcpu->arch.apf.gfns[key] = gfn; +} + +static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + int i; + u32 key = kvm_async_pf_hash_fn(gfn); + + for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) && + (vcpu->arch.apf.gfns[key] != gfn && + vcpu->arch.apf.gfns[key] != ~0); i++) + key = kvm_async_pf_next_probe(key); + + return key; +} + +bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn; +} + +static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + u32 i, j, k; + + i = j = kvm_async_pf_gfn_slot(vcpu, gfn); + while (true) { + vcpu->arch.apf.gfns[i] = ~0; + do { + j = kvm_async_pf_next_probe(j); + if (vcpu->arch.apf.gfns[j] == ~0) + return; + k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]); + /* + * k lies cyclically in ]i,j] + * | i.k.j | + * |....j i.k.| or |.k..j i...| + */ + } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j)); + vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j]; + i = j; + } +} + +static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) +{ + + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, + sizeof(val)); +} + +void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work) +{ + struct x86_exception fault; + + trace_kvm_async_pf_not_present(work->arch.token, work->gva); + kvm_add_async_pf_gfn(vcpu, work->arch.gfn); + + if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || + (vcpu->arch.apf.send_user_only && + kvm_x86_ops->get_cpl(vcpu) == 0)) + kvm_make_request(KVM_REQ_APF_HALT, vcpu); + else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) { + fault.vector = PF_VECTOR; + fault.error_code_valid = true; + fault.error_code = 0; + fault.nested_page_fault = false; + fault.address = work->arch.token; + kvm_inject_page_fault(vcpu, &fault); + } +} + +void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work) +{ + struct x86_exception fault; + + trace_kvm_async_pf_ready(work->arch.token, work->gva); + if (is_error_page(work->page)) + work->arch.token = ~0; /* broadcast wakeup */ + else + kvm_del_async_pf_gfn(vcpu, work->arch.gfn); + + if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && + !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { + fault.vector = PF_VECTOR; + fault.error_code_valid = true; + fault.error_code = 0; + fault.nested_page_fault = false; + fault.address = work->arch.token; + kvm_inject_page_fault(vcpu, &fault); + } + vcpu->arch.apf.halted = false; +} + +bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED)) + return true; + else + return !kvm_event_needs_reinjection(vcpu) && + kvm_x86_ops->interrupt_allowed(vcpu); +} + +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); diff --git a/linux/x86/x86.h b/linux/x86/x86.h new file mode 100644 index 0000000..c600da8 --- /dev/null +++ b/linux/x86/x86.h @@ -0,0 +1,84 @@ +#ifndef ARCH_X86_KVM_X86_H +#define ARCH_X86_KVM_X86_H + +#include <linux/kvm_host.h> +#include "kvm_cache_regs.h" + +static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) +{ + vcpu->arch.exception.pending = false; +} + +static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector, + bool soft) +{ + vcpu->arch.interrupt.pending = true; + vcpu->arch.interrupt.soft = soft; + vcpu->arch.interrupt.nr = vector; +} + +static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu) +{ + vcpu->arch.interrupt.pending = false; +} + +static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending || + vcpu->arch.nmi_injected; +} + +static inline bool kvm_exception_is_soft(unsigned int nr) +{ + return (nr == BP_VECTOR) || (nr == OF_VECTOR); +} + +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function, u32 index); + +static inline bool is_protmode(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, X86_CR0_PE); +} + +static inline int is_long_mode(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 + return vcpu->arch.efer & EFER_LMA; +#else + return 0; +#endif +} + +static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; +} + +static inline int is_pae(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); +} + +static inline int is_pse(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, X86_CR4_PSE); +} + +static inline int is_paging(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, X86_CR0_PG); +} + +static inline u32 bit(int bitno) +{ + return 1 << (bitno & 31); +} + +void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); +void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); +int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq); + +void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); + +#endif |