summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJosh Wilsdon <jwilsdon@joyent.com>2011-04-06 16:35:09 -0700
committerJosh Wilsdon <jwilsdon@joyent.com>2011-04-06 16:35:09 -0700
commitc5e99aab98c3a8ddb8e0e2953c1a3e534d67ca4f (patch)
tree9beb629b7abbb44c7385607ac2b6f842ca4ac18b
parent8ddb3439b2f21ccfb06e6d0b4a8a1fb0f944cf07 (diff)
downloadillumos-kvm-c5e99aab98c3a8ddb8e0e2953c1a3e534d67ca4f.tar.gz
[HVM-29] remove silly .gitignore that was hiding files from the previous commit.
-rw-r--r--linux/.gitignore74
-rw-r--r--linux/include/asm-ia64/kvm.h304
-rw-r--r--linux/include/asm-ia64/kvm_host.h639
-rw-r--r--linux/include/asm-ia64/kvm_para.h71
-rw-r--r--linux/include/asm-x86/hyperv.h233
-rw-r--r--linux/include/asm-x86/kvm.h364
-rw-r--r--linux/include/asm-x86/kvm_emulate.h318
-rw-r--r--linux/include/asm-x86/kvm_host.h888
-rw-r--r--linux/include/asm-x86/kvm_para.h233
-rw-r--r--linux/include/asm-x86/svm.h392
-rw-r--r--linux/include/asm-x86/virtext.h172
-rw-r--r--linux/include/asm-x86/vmx.h469
-rw-r--r--linux/include/linux/kvm.h838
-rw-r--r--linux/include/linux/kvm_host.h778
-rw-r--r--linux/include/linux/kvm_para.h78
-rw-r--r--linux/include/linux/kvm_types.h117
-rw-r--r--linux/include/trace/events/kvm.h352
-rw-r--r--linux/usr/include/asm-ia64/kvm.h264
-rw-r--r--linux/usr/include/asm-ia64/kvm_para.h23
-rw-r--r--linux/usr/include/asm-x86/hyperv.h193
-rw-r--r--linux/usr/include/asm-x86/kvm.h324
-rw-r--r--linux/usr/include/asm-x86/kvm_para.h79
-rw-r--r--linux/usr/include/linux/kvm.h798
-rw-r--r--linux/usr/include/linux/kvm_para.h29
-rw-r--r--linux/x86/assigned-dev.c827
-rw-r--r--linux/x86/coalesced_mmio.c231
-rw-r--r--linux/x86/coalesced_mmio.h39
-rw-r--r--linux/x86/emulate.c3771
-rw-r--r--linux/x86/eventfd.c705
-rw-r--r--linux/x86/i8254.c803
-rw-r--r--linux/x86/i8254.h60
-rw-r--r--linux/x86/i8259.c645
-rw-r--r--linux/x86/ioapic.c481
-rw-r--r--linux/x86/ioapic.h83
-rw-r--r--linux/x86/iodev.h70
-rw-r--r--linux/x86/iommu.c356
-rw-r--r--linux/x86/irq.c136
-rw-r--r--linux/x86/irq.h106
-rw-r--r--linux/x86/irq_comm.c514
-rw-r--r--linux/x86/kvm_cache_regs.h109
-rw-r--r--linux/x86/kvm_main.c2679
-rw-r--r--linux/x86/kvm_timer.h16
-rw-r--r--linux/x86/lapic.c1328
-rw-r--r--linux/x86/lapic.h59
-rw-r--r--linux/x86/mmu.c3887
-rw-r--r--linux/x86/mmu.h79
-rw-r--r--linux/x86/mmu_audit.c344
-rw-r--r--linux/x86/mmutrace.h225
-rw-r--r--linux/x86/paging_tmpl.h839
-rw-r--r--linux/x86/svm.c3993
-rw-r--r--linux/x86/timer.c105
-rw-r--r--linux/x86/trace.h709
-rw-r--r--linux/x86/tss.h59
-rw-r--r--linux/x86/vmx.c4549
-rw-r--r--linux/x86/x86.c6404
-rw-r--r--linux/x86/x86.h84
56 files changed, 42251 insertions, 74 deletions
diff --git a/linux/.gitignore b/linux/.gitignore
deleted file mode 100644
index 38bff83..0000000
--- a/linux/.gitignore
+++ /dev/null
@@ -1,74 +0,0 @@
-*.o
-*.d
-*~
-*.flat
-*.a
-.*.cmd
-*.ko
-*.mod.c
-config.mak
-kvm-kmod-config.h
-modules.order
-Module.symvers
-Modules.symvers
-Module.markers
-.tmp_versions
-.tmp.kvm-kmod.*
-include-compat/asm
-include
-x86/modules.order
-x86/i825[49].[ch]
-x86/kvm_main.c
-x86/kvm_svm.h
-x86/vmx.[ch]
-x86/svm.[ch]
-x86/mmu.[ch]
-x86/paging_tmpl.h
-x86/ioapic.[ch]
-x86/iodev.h
-x86/irq.[ch]
-x86/lapic.[ch]
-x86/tss.h
-x86/x86.[ch]
-x86/coalesced_mmio.[ch]
-x86/kvm_cache_regs.h
-x86/irq_comm.c
-x86/timer.c
-x86/kvm_timer.h
-x86/iommu.c
-x86/svm-trace.h
-x86/trace-arch.h
-x86/trace.h
-x86/vmx-trace.h
-x86/assigned-dev.c
-x86/emulate.c
-x86/eventfd.c
-x86/mmutrace.h
-ia64/asm-offsets.c
-ia64/coalesced_mmio.[ch]
-ia64/ioapic.[ch]
-ia64/iodev.h
-ia64/iommu.c
-ia64/irq.h
-ia64/irq_comm.c
-ia64/kvm-ia64.c
-ia64/kvm_fw.c
-ia64/kvm_lib.c
-ia64/kvm_main.c
-ia64/kvm_minstate.h
-ia64/lapic.h
-ia64/memcpy.S
-ia64/memset.S
-ia64/misc.h
-ia64/mmio.c
-ia64/optvfault.S
-ia64/process.c
-ia64/trampoline.S
-ia64/vcpu.[ch]
-ia64/vmm.c
-ia64/vmm_ivt.S
-ia64/vti.h
-ia64/vtlb.c
-ia64/assigned-dev.c
-ia64/eventfd.c
-.stgit-*
diff --git a/linux/include/asm-ia64/kvm.h b/linux/include/asm-ia64/kvm.h
new file mode 100644
index 0000000..ce31fac
--- /dev/null
+++ b/linux/include/asm-ia64/kvm.h
@@ -0,0 +1,304 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+#ifndef __ASM_IA64_KVM_H
+#define __ASM_IA64_KVM_H
+
+/*
+ * kvm structure definitions for ia64
+ *
+ * Copyright (C) 2007 Xiantao Zhang <xiantao.zhang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <asm/types.h>
+#include <linux/ioctl.h>
+
+/* Select x86 specific features in <linux/kvm.h> */
+#define __KVM_HAVE_IOAPIC
+#define __KVM_HAVE_DEVICE_ASSIGNMENT
+
+/* Architectural interrupt line count. */
+#define KVM_NR_INTERRUPTS 256
+
+#define KVM_IOAPIC_NUM_PINS 48
+
+struct kvm_ioapic_state {
+ __u64 base_address;
+ __u32 ioregsel;
+ __u32 id;
+ __u32 irr;
+ __u32 pad;
+ union {
+ __u64 bits;
+ struct {
+ __u8 vector;
+ __u8 delivery_mode:3;
+ __u8 dest_mode:1;
+ __u8 delivery_status:1;
+ __u8 polarity:1;
+ __u8 remote_irr:1;
+ __u8 trig_mode:1;
+ __u8 mask:1;
+ __u8 reserve:7;
+ __u8 reserved[4];
+ __u8 dest_id;
+ } fields;
+ } redirtbl[KVM_IOAPIC_NUM_PINS];
+};
+
+#define KVM_IRQCHIP_PIC_MASTER 0
+#define KVM_IRQCHIP_PIC_SLAVE 1
+#define KVM_IRQCHIP_IOAPIC 2
+#define KVM_NR_IRQCHIPS 3
+
+#define KVM_CONTEXT_SIZE 8*1024
+
+struct kvm_fpreg {
+ union {
+ unsigned long bits[2];
+ long double __dummy; /* force 16-byte alignment */
+ } u;
+};
+
+union context {
+ /* 8K size */
+ char dummy[KVM_CONTEXT_SIZE];
+ struct {
+ unsigned long psr;
+ unsigned long pr;
+ unsigned long caller_unat;
+ unsigned long pad;
+ unsigned long gr[32];
+ unsigned long ar[128];
+ unsigned long br[8];
+ unsigned long cr[128];
+ unsigned long rr[8];
+ unsigned long ibr[8];
+ unsigned long dbr[8];
+ unsigned long pkr[8];
+ struct kvm_fpreg fr[128];
+ };
+};
+
+struct thash_data {
+ union {
+ struct {
+ unsigned long p : 1; /* 0 */
+ unsigned long rv1 : 1; /* 1 */
+ unsigned long ma : 3; /* 2-4 */
+ unsigned long a : 1; /* 5 */
+ unsigned long d : 1; /* 6 */
+ unsigned long pl : 2; /* 7-8 */
+ unsigned long ar : 3; /* 9-11 */
+ unsigned long ppn : 38; /* 12-49 */
+ unsigned long rv2 : 2; /* 50-51 */
+ unsigned long ed : 1; /* 52 */
+ unsigned long ig1 : 11; /* 53-63 */
+ };
+ struct {
+ unsigned long __rv1 : 53; /* 0-52 */
+ unsigned long contiguous : 1; /*53 */
+ unsigned long tc : 1; /* 54 TR or TC */
+ unsigned long cl : 1;
+ /* 55 I side or D side cache line */
+ unsigned long len : 4; /* 56-59 */
+ unsigned long io : 1; /* 60 entry is for io or not */
+ unsigned long nomap : 1;
+ /* 61 entry cann't be inserted into machine TLB.*/
+ unsigned long checked : 1;
+ /* 62 for VTLB/VHPT sanity check */
+ unsigned long invalid : 1;
+ /* 63 invalid entry */
+ };
+ unsigned long page_flags;
+ }; /* same for VHPT and TLB */
+
+ union {
+ struct {
+ unsigned long rv3 : 2;
+ unsigned long ps : 6;
+ unsigned long key : 24;
+ unsigned long rv4 : 32;
+ };
+ unsigned long itir;
+ };
+ union {
+ struct {
+ unsigned long ig2 : 12;
+ unsigned long vpn : 49;
+ unsigned long vrn : 3;
+ };
+ unsigned long ifa;
+ unsigned long vadr;
+ struct {
+ unsigned long tag : 63;
+ unsigned long ti : 1;
+ };
+ unsigned long etag;
+ };
+ union {
+ struct thash_data *next;
+ unsigned long rid;
+ unsigned long gpaddr;
+ };
+};
+
+#define NITRS 8
+#define NDTRS 8
+
+struct saved_vpd {
+ unsigned long vhpi;
+ unsigned long vgr[16];
+ unsigned long vbgr[16];
+ unsigned long vnat;
+ unsigned long vbnat;
+ unsigned long vcpuid[5];
+ unsigned long vpsr;
+ unsigned long vpr;
+ union {
+ unsigned long vcr[128];
+ struct {
+ unsigned long dcr;
+ unsigned long itm;
+ unsigned long iva;
+ unsigned long rsv1[5];
+ unsigned long pta;
+ unsigned long rsv2[7];
+ unsigned long ipsr;
+ unsigned long isr;
+ unsigned long rsv3;
+ unsigned long iip;
+ unsigned long ifa;
+ unsigned long itir;
+ unsigned long iipa;
+ unsigned long ifs;
+ unsigned long iim;
+ unsigned long iha;
+ unsigned long rsv4[38];
+ unsigned long lid;
+ unsigned long ivr;
+ unsigned long tpr;
+ unsigned long eoi;
+ unsigned long irr[4];
+ unsigned long itv;
+ unsigned long pmv;
+ unsigned long cmcv;
+ unsigned long rsv5[5];
+ unsigned long lrr0;
+ unsigned long lrr1;
+ unsigned long rsv6[46];
+ };
+ };
+};
+
+struct kvm_regs {
+ struct saved_vpd vpd;
+ /*Arch-regs*/
+ int mp_state;
+ unsigned long vmm_rr;
+ /* TR and TC. */
+ struct thash_data itrs[NITRS];
+ struct thash_data dtrs[NDTRS];
+ /* Bit is set if there is a tr/tc for the region. */
+ unsigned char itr_regions;
+ unsigned char dtr_regions;
+ unsigned char tc_regions;
+
+ char irq_check;
+ unsigned long saved_itc;
+ unsigned long itc_check;
+ unsigned long timer_check;
+ unsigned long timer_pending;
+ unsigned long last_itc;
+
+ unsigned long vrr[8];
+ unsigned long ibr[8];
+ unsigned long dbr[8];
+ unsigned long insvc[4]; /* Interrupt in service. */
+ unsigned long xtp;
+
+ unsigned long metaphysical_rr0; /* from kvm_arch (so is pinned) */
+ unsigned long metaphysical_rr4; /* from kvm_arch (so is pinned) */
+ unsigned long metaphysical_saved_rr0; /* from kvm_arch */
+ unsigned long metaphysical_saved_rr4; /* from kvm_arch */
+ unsigned long fp_psr; /*used for lazy float register */
+ unsigned long saved_gp;
+ /*for phycial emulation */
+
+ union context saved_guest;
+
+ unsigned long reserved[64]; /* for future use */
+};
+
+struct kvm_sregs {
+};
+
+struct kvm_fpu {
+};
+
+#define KVM_IA64_VCPU_STACK_SHIFT 16
+#define KVM_IA64_VCPU_STACK_SIZE (1UL << KVM_IA64_VCPU_STACK_SHIFT)
+
+struct kvm_ia64_vcpu_stack {
+ unsigned char stack[KVM_IA64_VCPU_STACK_SIZE];
+};
+
+struct kvm_debug_exit_arch {
+};
+
+/* for KVM_SET_GUEST_DEBUG */
+struct kvm_guest_debug_arch {
+};
+
+#endif
diff --git a/linux/include/asm-ia64/kvm_host.h b/linux/include/asm-ia64/kvm_host.h
new file mode 100644
index 0000000..91e5de5
--- /dev/null
+++ b/linux/include/asm-ia64/kvm_host.h
@@ -0,0 +1,639 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+/*
+ * kvm_host.h: used for kvm module, and hold ia64-specific sections.
+ *
+ * Copyright (C) 2007, Intel Corporation.
+ *
+ * Xiantao Zhang <xiantao.zhang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#ifndef __ASM_KVM_HOST_H
+#define __ASM_KVM_HOST_H
+
+#define KVM_MEMORY_SLOTS 32
+/* memory slots that does not exposed to userspace */
+#define KVM_PRIVATE_MEM_SLOTS 4
+
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+
+/* define exit reasons from vmm to kvm*/
+#define EXIT_REASON_VM_PANIC 0
+#define EXIT_REASON_MMIO_INSTRUCTION 1
+#define EXIT_REASON_PAL_CALL 2
+#define EXIT_REASON_SAL_CALL 3
+#define EXIT_REASON_SWITCH_RR6 4
+#define EXIT_REASON_VM_DESTROY 5
+#define EXIT_REASON_EXTERNAL_INTERRUPT 6
+#define EXIT_REASON_IPI 7
+#define EXIT_REASON_PTC_G 8
+#define EXIT_REASON_DEBUG 20
+
+/*Define vmm address space and vm data space.*/
+#define KVM_VMM_SIZE (__IA64_UL_CONST(16)<<20)
+#define KVM_VMM_SHIFT 24
+#define KVM_VMM_BASE 0xD000000000000000
+#define VMM_SIZE (__IA64_UL_CONST(8)<<20)
+
+/*
+ * Define vm_buffer, used by PAL Services, base address.
+ * Note: vm_buffer is in the VMM-BLOCK, the size must be < 8M
+ */
+#define KVM_VM_BUFFER_BASE (KVM_VMM_BASE + VMM_SIZE)
+#define KVM_VM_BUFFER_SIZE (__IA64_UL_CONST(8)<<20)
+
+/*
+ * kvm guest's data area looks as follow:
+ *
+ * +----------------------+ ------- KVM_VM_DATA_SIZE
+ * | vcpu[n]'s data | | ___________________KVM_STK_OFFSET
+ * | | | / |
+ * | .......... | | /vcpu's struct&stack |
+ * | .......... | | /---------------------|---- 0
+ * | vcpu[5]'s data | | / vpd |
+ * | vcpu[4]'s data | |/-----------------------|
+ * | vcpu[3]'s data | / vtlb |
+ * | vcpu[2]'s data | /|------------------------|
+ * | vcpu[1]'s data |/ | vhpt |
+ * | vcpu[0]'s data |____________________________|
+ * +----------------------+ |
+ * | memory dirty log | |
+ * +----------------------+ |
+ * | vm's data struct | |
+ * +----------------------+ |
+ * | | |
+ * | | |
+ * | | |
+ * | | |
+ * | | |
+ * | | |
+ * | | |
+ * | vm's p2m table | |
+ * | | |
+ * | | |
+ * | | | |
+ * vm's data->| | | |
+ * +----------------------+ ------- 0
+ * To support large memory, needs to increase the size of p2m.
+ * To support more vcpus, needs to ensure it has enough space to
+ * hold vcpus' data.
+ */
+
+#define KVM_VM_DATA_SHIFT 26
+#define KVM_VM_DATA_SIZE (__IA64_UL_CONST(1) << KVM_VM_DATA_SHIFT)
+#define KVM_VM_DATA_BASE (KVM_VMM_BASE + KVM_VM_DATA_SIZE)
+
+#define KVM_P2M_BASE KVM_VM_DATA_BASE
+#define KVM_P2M_SIZE (__IA64_UL_CONST(24) << 20)
+
+#define VHPT_SHIFT 16
+#define VHPT_SIZE (__IA64_UL_CONST(1) << VHPT_SHIFT)
+#define VHPT_NUM_ENTRIES (__IA64_UL_CONST(1) << (VHPT_SHIFT-5))
+
+#define VTLB_SHIFT 16
+#define VTLB_SIZE (__IA64_UL_CONST(1) << VTLB_SHIFT)
+#define VTLB_NUM_ENTRIES (1UL << (VHPT_SHIFT-5))
+
+#define VPD_SHIFT 16
+#define VPD_SIZE (__IA64_UL_CONST(1) << VPD_SHIFT)
+
+#define VCPU_STRUCT_SHIFT 16
+#define VCPU_STRUCT_SIZE (__IA64_UL_CONST(1) << VCPU_STRUCT_SHIFT)
+
+/*
+ * This must match KVM_IA64_VCPU_STACK_{SHIFT,SIZE} arch/ia64/include/asm/kvm.h
+ */
+#define KVM_STK_SHIFT 16
+#define KVM_STK_OFFSET (__IA64_UL_CONST(1)<< KVM_STK_SHIFT)
+
+#define KVM_VM_STRUCT_SHIFT 19
+#define KVM_VM_STRUCT_SIZE (__IA64_UL_CONST(1) << KVM_VM_STRUCT_SHIFT)
+
+#define KVM_MEM_DIRY_LOG_SHIFT 19
+#define KVM_MEM_DIRTY_LOG_SIZE (__IA64_UL_CONST(1) << KVM_MEM_DIRY_LOG_SHIFT)
+
+#ifndef __ASSEMBLY__
+
+/*Define the max vcpus and memory for Guests.*/
+#define KVM_MAX_VCPUS (KVM_VM_DATA_SIZE - KVM_P2M_SIZE - KVM_VM_STRUCT_SIZE -\
+ KVM_MEM_DIRTY_LOG_SIZE) / sizeof(struct kvm_vcpu_data)
+#define KVM_MAX_MEM_SIZE (KVM_P2M_SIZE >> 3 << PAGE_SHIFT)
+
+#define VMM_LOG_LEN 256
+
+#include <asm/types.h>
+#include <linux/mm.h>
+#include <linux/kvm.h>
+#include <linux/kvm_para.h>
+#include <linux/kvm_types.h>
+
+#include <asm/pal.h>
+#include <asm/sal.h>
+#include <asm/page.h>
+
+struct kvm_vcpu_data {
+ char vcpu_vhpt[VHPT_SIZE];
+ char vcpu_vtlb[VTLB_SIZE];
+ char vcpu_vpd[VPD_SIZE];
+ char vcpu_struct[VCPU_STRUCT_SIZE];
+};
+
+struct kvm_vm_data {
+ char kvm_p2m[KVM_P2M_SIZE];
+ char kvm_vm_struct[KVM_VM_STRUCT_SIZE];
+ char kvm_mem_dirty_log[KVM_MEM_DIRTY_LOG_SIZE];
+ struct kvm_vcpu_data vcpu_data[KVM_MAX_VCPUS];
+};
+
+#define VCPU_BASE(n) (KVM_VM_DATA_BASE + \
+ offsetof(struct kvm_vm_data, vcpu_data[n]))
+#define KVM_VM_BASE (KVM_VM_DATA_BASE + \
+ offsetof(struct kvm_vm_data, kvm_vm_struct))
+#define KVM_MEM_DIRTY_LOG_BASE KVM_VM_DATA_BASE + \
+ offsetof(struct kvm_vm_data, kvm_mem_dirty_log)
+
+#define VHPT_BASE(n) (VCPU_BASE(n) + offsetof(struct kvm_vcpu_data, vcpu_vhpt))
+#define VTLB_BASE(n) (VCPU_BASE(n) + offsetof(struct kvm_vcpu_data, vcpu_vtlb))
+#define VPD_BASE(n) (VCPU_BASE(n) + offsetof(struct kvm_vcpu_data, vcpu_vpd))
+#define VCPU_STRUCT_BASE(n) (VCPU_BASE(n) + \
+ offsetof(struct kvm_vcpu_data, vcpu_struct))
+
+/*IO section definitions*/
+#define IOREQ_READ 1
+#define IOREQ_WRITE 0
+
+#define STATE_IOREQ_NONE 0
+#define STATE_IOREQ_READY 1
+#define STATE_IOREQ_INPROCESS 2
+#define STATE_IORESP_READY 3
+
+/*Guest Physical address layout.*/
+#define GPFN_MEM (0UL << 60) /* Guest pfn is normal mem */
+#define GPFN_FRAME_BUFFER (1UL << 60) /* VGA framebuffer */
+#define GPFN_LOW_MMIO (2UL << 60) /* Low MMIO range */
+#define GPFN_PIB (3UL << 60) /* PIB base */
+#define GPFN_IOSAPIC (4UL << 60) /* IOSAPIC base */
+#define GPFN_LEGACY_IO (5UL << 60) /* Legacy I/O base */
+#define GPFN_GFW (6UL << 60) /* Guest Firmware */
+#define GPFN_PHYS_MMIO (7UL << 60) /* Directed MMIO Range */
+
+#define GPFN_IO_MASK (7UL << 60) /* Guest pfn is I/O type */
+#define GPFN_INV_MASK (1UL << 63) /* Guest pfn is invalid */
+#define INVALID_MFN (~0UL)
+#define MEM_G (1UL << 30)
+#define MEM_M (1UL << 20)
+#define MMIO_START (3 * MEM_G)
+#define MMIO_SIZE (512 * MEM_M)
+#define VGA_IO_START 0xA0000UL
+#define VGA_IO_SIZE 0x20000
+#define LEGACY_IO_START (MMIO_START + MMIO_SIZE)
+#define LEGACY_IO_SIZE (64 * MEM_M)
+#define IO_SAPIC_START 0xfec00000UL
+#define IO_SAPIC_SIZE 0x100000
+#define PIB_START 0xfee00000UL
+#define PIB_SIZE 0x200000
+#define GFW_START (4 * MEM_G - 16 * MEM_M)
+#define GFW_SIZE (16 * MEM_M)
+
+/*Deliver mode, defined for ioapic.c*/
+#define dest_Fixed IOSAPIC_FIXED
+#define dest_LowestPrio IOSAPIC_LOWEST_PRIORITY
+
+#define NMI_VECTOR 2
+#define ExtINT_VECTOR 0
+#define NULL_VECTOR (-1)
+#define IA64_SPURIOUS_INT_VECTOR 0x0f
+
+#define VCPU_LID(v) (((u64)(v)->vcpu_id) << 24)
+
+/*
+ *Delivery mode
+ */
+#define SAPIC_DELIV_SHIFT 8
+#define SAPIC_FIXED 0x0
+#define SAPIC_LOWEST_PRIORITY 0x1
+#define SAPIC_PMI 0x2
+#define SAPIC_NMI 0x4
+#define SAPIC_INIT 0x5
+#define SAPIC_EXTINT 0x7
+
+/*
+ * vcpu->requests bit members for arch
+ */
+#define KVM_REQ_PTC_G 32
+#define KVM_REQ_RESUME 33
+
+#define KVM_HPAGE_GFN_SHIFT(x) 0
+#define KVM_NR_PAGE_SIZES 1
+#define KVM_PAGES_PER_HPAGE(x) 1
+
+struct kvm;
+struct kvm_vcpu;
+
+struct kvm_mmio_req {
+ uint64_t addr; /* physical address */
+ uint64_t size; /* size in bytes */
+ uint64_t data; /* data (or paddr of data) */
+ uint8_t state:4;
+ uint8_t dir:1; /* 1=read, 0=write */
+};
+
+/*Pal data struct */
+struct kvm_pal_call{
+ /*In area*/
+ uint64_t gr28;
+ uint64_t gr29;
+ uint64_t gr30;
+ uint64_t gr31;
+ /*Out area*/
+ struct ia64_pal_retval ret;
+};
+
+/* Sal data structure */
+struct kvm_sal_call{
+ /*In area*/
+ uint64_t in0;
+ uint64_t in1;
+ uint64_t in2;
+ uint64_t in3;
+ uint64_t in4;
+ uint64_t in5;
+ uint64_t in6;
+ uint64_t in7;
+ struct sal_ret_values ret;
+};
+
+/*Guest change rr6*/
+struct kvm_switch_rr6 {
+ uint64_t old_rr;
+ uint64_t new_rr;
+};
+
+union ia64_ipi_a{
+ unsigned long val;
+ struct {
+ unsigned long rv : 3;
+ unsigned long ir : 1;
+ unsigned long eid : 8;
+ unsigned long id : 8;
+ unsigned long ib_base : 44;
+ };
+};
+
+union ia64_ipi_d {
+ unsigned long val;
+ struct {
+ unsigned long vector : 8;
+ unsigned long dm : 3;
+ unsigned long ig : 53;
+ };
+};
+
+/*ipi check exit data*/
+struct kvm_ipi_data{
+ union ia64_ipi_a addr;
+ union ia64_ipi_d data;
+};
+
+/*global purge data*/
+struct kvm_ptc_g {
+ unsigned long vaddr;
+ unsigned long rr;
+ unsigned long ps;
+ struct kvm_vcpu *vcpu;
+};
+
+/*Exit control data */
+struct exit_ctl_data{
+ uint32_t exit_reason;
+ uint32_t vm_status;
+ union {
+ struct kvm_mmio_req ioreq;
+ struct kvm_pal_call pal_data;
+ struct kvm_sal_call sal_data;
+ struct kvm_switch_rr6 rr_data;
+ struct kvm_ipi_data ipi_data;
+ struct kvm_ptc_g ptc_g_data;
+ } u;
+};
+
+union pte_flags {
+ unsigned long val;
+ struct {
+ unsigned long p : 1; /*0 */
+ unsigned long : 1; /* 1 */
+ unsigned long ma : 3; /* 2-4 */
+ unsigned long a : 1; /* 5 */
+ unsigned long d : 1; /* 6 */
+ unsigned long pl : 2; /* 7-8 */
+ unsigned long ar : 3; /* 9-11 */
+ unsigned long ppn : 38; /* 12-49 */
+ unsigned long : 2; /* 50-51 */
+ unsigned long ed : 1; /* 52 */
+ };
+};
+
+union ia64_pta {
+ unsigned long val;
+ struct {
+ unsigned long ve : 1;
+ unsigned long reserved0 : 1;
+ unsigned long size : 6;
+ unsigned long vf : 1;
+ unsigned long reserved1 : 6;
+ unsigned long base : 49;
+ };
+};
+
+struct thash_cb {
+ /* THASH base information */
+ struct thash_data *hash; /* hash table pointer */
+ union ia64_pta pta;
+ int num;
+};
+
+struct kvm_vcpu_stat {
+};
+
+struct kvm_vcpu_arch {
+ int launched;
+ int last_exit;
+ int last_run_cpu;
+ int vmm_tr_slot;
+ int vm_tr_slot;
+ int sn_rtc_tr_slot;
+
+#define KVM_MP_STATE_RUNNABLE 0
+#define KVM_MP_STATE_UNINITIALIZED 1
+#define KVM_MP_STATE_INIT_RECEIVED 2
+#define KVM_MP_STATE_HALTED 3
+ int mp_state;
+
+#define MAX_PTC_G_NUM 3
+ int ptc_g_count;
+ struct kvm_ptc_g ptc_g_data[MAX_PTC_G_NUM];
+
+ /*halt timer to wake up sleepy vcpus*/
+ struct hrtimer hlt_timer;
+ long ht_active;
+
+ struct kvm_lapic *apic; /* kernel irqchip context */
+ struct vpd *vpd;
+
+ /* Exit data for vmm_transition*/
+ struct exit_ctl_data exit_data;
+
+ cpumask_t cache_coherent_map;
+
+ unsigned long vmm_rr;
+ unsigned long host_rr6;
+ unsigned long psbits[8];
+ unsigned long cr_iipa;
+ unsigned long cr_isr;
+ unsigned long vsa_base;
+ unsigned long dirty_log_lock_pa;
+ unsigned long __gp;
+ /* TR and TC. */
+ struct thash_data itrs[NITRS];
+ struct thash_data dtrs[NDTRS];
+ /* Bit is set if there is a tr/tc for the region. */
+ unsigned char itr_regions;
+ unsigned char dtr_regions;
+ unsigned char tc_regions;
+ /* purge all */
+ unsigned long ptce_base;
+ unsigned long ptce_count[2];
+ unsigned long ptce_stride[2];
+ /* itc/itm */
+ unsigned long last_itc;
+ long itc_offset;
+ unsigned long itc_check;
+ unsigned long timer_check;
+ unsigned int timer_pending;
+ unsigned int timer_fired;
+
+ unsigned long vrr[8];
+ unsigned long ibr[8];
+ unsigned long dbr[8];
+ unsigned long insvc[4]; /* Interrupt in service. */
+ unsigned long xtp;
+
+ unsigned long metaphysical_rr0; /* from kvm_arch (so is pinned) */
+ unsigned long metaphysical_rr4; /* from kvm_arch (so is pinned) */
+ unsigned long metaphysical_saved_rr0; /* from kvm_arch */
+ unsigned long metaphysical_saved_rr4; /* from kvm_arch */
+ unsigned long fp_psr; /*used for lazy float register */
+ unsigned long saved_gp;
+ /*for phycial emulation */
+ int mode_flags;
+ struct thash_cb vtlb;
+ struct thash_cb vhpt;
+ char irq_check;
+ char irq_new_pending;
+
+ unsigned long opcode;
+ unsigned long cause;
+ char log_buf[VMM_LOG_LEN];
+ union context host;
+ union context guest;
+};
+
+struct kvm_vm_stat {
+ u64 remote_tlb_flush;
+};
+
+struct kvm_sal_data {
+ unsigned long boot_ip;
+ unsigned long boot_gp;
+};
+
+struct kvm_arch {
+ spinlock_t dirty_log_lock;
+
+ unsigned long vm_base;
+ unsigned long metaphysical_rr0;
+ unsigned long metaphysical_rr4;
+ unsigned long vmm_init_rr;
+
+ int is_sn2;
+
+ struct kvm_ioapic *vioapic;
+ struct kvm_vm_stat stat;
+ struct kvm_sal_data rdv_sal_data;
+
+ struct list_head assigned_dev_head;
+ struct iommu_domain *iommu_domain;
+ int iommu_flags;
+
+ unsigned long irq_sources_bitmap;
+ unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
+};
+
+union cpuid3_t {
+ u64 value;
+ struct {
+ u64 number : 8;
+ u64 revision : 8;
+ u64 model : 8;
+ u64 family : 8;
+ u64 archrev : 8;
+ u64 rv : 24;
+ };
+};
+
+struct kvm_pt_regs {
+ /* The following registers are saved by SAVE_MIN: */
+ unsigned long b6; /* scratch */
+ unsigned long b7; /* scratch */
+
+ unsigned long ar_csd; /* used by cmp8xchg16 (scratch) */
+ unsigned long ar_ssd; /* reserved for future use (scratch) */
+
+ unsigned long r8; /* scratch (return value register 0) */
+ unsigned long r9; /* scratch (return value register 1) */
+ unsigned long r10; /* scratch (return value register 2) */
+ unsigned long r11; /* scratch (return value register 3) */
+
+ unsigned long cr_ipsr; /* interrupted task's psr */
+ unsigned long cr_iip; /* interrupted task's instruction pointer */
+ unsigned long cr_ifs; /* interrupted task's function state */
+
+ unsigned long ar_unat; /* interrupted task's NaT register (preserved) */
+ unsigned long ar_pfs; /* prev function state */
+ unsigned long ar_rsc; /* RSE configuration */
+ /* The following two are valid only if cr_ipsr.cpl > 0: */
+ unsigned long ar_rnat; /* RSE NaT */
+ unsigned long ar_bspstore; /* RSE bspstore */
+
+ unsigned long pr; /* 64 predicate registers (1 bit each) */
+ unsigned long b0; /* return pointer (bp) */
+ unsigned long loadrs; /* size of dirty partition << 16 */
+
+ unsigned long r1; /* the gp pointer */
+ unsigned long r12; /* interrupted task's memory stack pointer */
+ unsigned long r13; /* thread pointer */
+
+ unsigned long ar_fpsr; /* floating point status (preserved) */
+ unsigned long r15; /* scratch */
+
+ /* The remaining registers are NOT saved for system calls. */
+ unsigned long r14; /* scratch */
+ unsigned long r2; /* scratch */
+ unsigned long r3; /* scratch */
+ unsigned long r16; /* scratch */
+ unsigned long r17; /* scratch */
+ unsigned long r18; /* scratch */
+ unsigned long r19; /* scratch */
+ unsigned long r20; /* scratch */
+ unsigned long r21; /* scratch */
+ unsigned long r22; /* scratch */
+ unsigned long r23; /* scratch */
+ unsigned long r24; /* scratch */
+ unsigned long r25; /* scratch */
+ unsigned long r26; /* scratch */
+ unsigned long r27; /* scratch */
+ unsigned long r28; /* scratch */
+ unsigned long r29; /* scratch */
+ unsigned long r30; /* scratch */
+ unsigned long r31; /* scratch */
+ unsigned long ar_ccv; /* compare/exchange value (scratch) */
+
+ /*
+ * Floating point registers that the kernel considers scratch:
+ */
+ struct ia64_fpreg f6; /* scratch */
+ struct ia64_fpreg f7; /* scratch */
+ struct ia64_fpreg f8; /* scratch */
+ struct ia64_fpreg f9; /* scratch */
+ struct ia64_fpreg f10; /* scratch */
+ struct ia64_fpreg f11; /* scratch */
+
+ unsigned long r4; /* preserved */
+ unsigned long r5; /* preserved */
+ unsigned long r6; /* preserved */
+ unsigned long r7; /* preserved */
+ unsigned long eml_unat; /* used for emulating instruction */
+ unsigned long pad0; /* alignment pad */
+};
+
+static inline struct kvm_pt_regs *vcpu_regs(struct kvm_vcpu *v)
+{
+ return (struct kvm_pt_regs *) ((unsigned long) v + KVM_STK_OFFSET) - 1;
+}
+
+typedef int kvm_vmm_entry(void);
+typedef void kvm_tramp_entry(union context *host, union context *guest);
+
+struct kvm_vmm_info{
+ struct module *module;
+ kvm_vmm_entry *vmm_entry;
+ kvm_tramp_entry *tramp_entry;
+ unsigned long vmm_ivt;
+ unsigned long patch_mov_ar;
+ unsigned long patch_mov_ar_sn2;
+};
+
+int kvm_highest_pending_irq(struct kvm_vcpu *vcpu);
+int kvm_emulate_halt(struct kvm_vcpu *vcpu);
+int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
+void kvm_sal_emul(struct kvm_vcpu *vcpu);
+
+#define __KVM_HAVE_ARCH_VM_ALLOC 1
+struct kvm *kvm_arch_alloc_vm(void);
+void kvm_arch_free_vm(struct kvm *kvm);
+
+#endif /* __ASSEMBLY__*/
+
+#endif
diff --git a/linux/include/asm-ia64/kvm_para.h b/linux/include/asm-ia64/kvm_para.h
new file mode 100644
index 0000000..2e2f499
--- /dev/null
+++ b/linux/include/asm-ia64/kvm_para.h
@@ -0,0 +1,71 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+#ifndef __IA64_KVM_PARA_H
+#define __IA64_KVM_PARA_H
+
+/*
+ * Copyright (C) 2007 Xiantao Zhang <xiantao.zhang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#ifdef __KERNEL__
+
+static inline unsigned int kvm_arch_para_features(void)
+{
+ return 0;
+}
+
+#endif
+
+#endif
diff --git a/linux/include/asm-x86/hyperv.h b/linux/include/asm-x86/hyperv.h
new file mode 100644
index 0000000..8b44b46
--- /dev/null
+++ b/linux/include/asm-x86/hyperv.h
@@ -0,0 +1,233 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+#ifndef _ASM_X86_HYPERV_H
+#define _ASM_X86_HYPERV_H
+
+#include <linux/types.h>
+
+/*
+ * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
+ * is set by CPUID(HvCpuIdFunctionVersionAndFeatures).
+ */
+#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000
+#define HYPERV_CPUID_INTERFACE 0x40000001
+#define HYPERV_CPUID_VERSION 0x40000002
+#define HYPERV_CPUID_FEATURES 0x40000003
+#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
+#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
+
+#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000
+#define HYPERV_CPUID_MIN 0x40000005
+#define HYPERV_CPUID_MAX 0x4000ffff
+
+/*
+ * Feature identification. EAX indicates which features are available
+ * to the partition based upon the current partition privileges.
+ */
+
+/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */
+#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0)
+/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/
+#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1)
+/*
+ * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
+ * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available
+ */
+#define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2)
+/*
+ * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through
+ * HV_X64_MSR_STIMER3_COUNT) available
+ */
+#define HV_X64_MSR_SYNTIMER_AVAILABLE (1 << 3)
+/*
+ * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR)
+ * are available
+ */
+#define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4)
+/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/
+#define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5)
+/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/
+#define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6)
+/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/
+#define HV_X64_MSR_RESET_AVAILABLE (1 << 7)
+ /*
+ * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE,
+ * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE,
+ * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available
+ */
+#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8)
+
+/*
+ * Feature identification: EBX indicates which flags were specified at
+ * partition creation. The format is the same as the partition creation
+ * flag structure defined in section Partition Creation Flags.
+ */
+#define HV_X64_CREATE_PARTITIONS (1 << 0)
+#define HV_X64_ACCESS_PARTITION_ID (1 << 1)
+#define HV_X64_ACCESS_MEMORY_POOL (1 << 2)
+#define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3)
+#define HV_X64_POST_MESSAGES (1 << 4)
+#define HV_X64_SIGNAL_EVENTS (1 << 5)
+#define HV_X64_CREATE_PORT (1 << 6)
+#define HV_X64_CONNECT_PORT (1 << 7)
+#define HV_X64_ACCESS_STATS (1 << 8)
+#define HV_X64_DEBUGGING (1 << 11)
+#define HV_X64_CPU_POWER_MANAGEMENT (1 << 12)
+#define HV_X64_CONFIGURE_PROFILER (1 << 13)
+
+/*
+ * Feature identification. EDX indicates which miscellaneous features
+ * are available to the partition.
+ */
+/* The MWAIT instruction is available (per section MONITOR / MWAIT) */
+#define HV_X64_MWAIT_AVAILABLE (1 << 0)
+/* Guest debugging support is available */
+#define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1)
+/* Performance Monitor support is available*/
+#define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2)
+/* Support for physical CPU dynamic partitioning events is available*/
+#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3)
+/*
+ * Support for passing hypercall input parameter block via XMM
+ * registers is available
+ */
+#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4)
+/* Support for a virtual guest idle state is available */
+#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5)
+
+/*
+ * Implementation recommendations. Indicates which behaviors the hypervisor
+ * recommends the OS implement for optimal performance.
+ */
+ /*
+ * Recommend using hypercall for address space switches rather
+ * than MOV to CR3 instruction
+ */
+#define HV_X64_MWAIT_RECOMMENDED (1 << 0)
+/* Recommend using hypercall for local TLB flushes rather
+ * than INVLPG or MOV to CR3 instructions */
+#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1)
+/*
+ * Recommend using hypercall for remote TLB flushes rather
+ * than inter-processor interrupts
+ */
+#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2)
+/*
+ * Recommend using MSRs for accessing APIC registers
+ * EOI, ICR and TPR rather than their memory-mapped counterparts
+ */
+#define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3)
+/* Recommend using the hypervisor-provided MSR to initiate a system RESET */
+#define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4)
+/*
+ * Recommend using relaxed timing for this partition. If used,
+ * the VM should disable any watchdog timeouts that rely on the
+ * timely delivery of external interrupts
+ */
+#define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5)
+
+/* MSR used to identify the guest OS. */
+#define HV_X64_MSR_GUEST_OS_ID 0x40000000
+
+/* MSR used to setup pages used to communicate with the hypervisor. */
+#define HV_X64_MSR_HYPERCALL 0x40000001
+
+/* MSR used to provide vcpu index */
+#define HV_X64_MSR_VP_INDEX 0x40000002
+
+/* MSR used to read the per-partition time reference counter */
+#define HV_X64_MSR_TIME_REF_COUNT 0x40000020
+
+/* Define the virtual APIC registers */
+#define HV_X64_MSR_EOI 0x40000070
+#define HV_X64_MSR_ICR 0x40000071
+#define HV_X64_MSR_TPR 0x40000072
+#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073
+
+/* Define synthetic interrupt controller model specific registers. */
+#define HV_X64_MSR_SCONTROL 0x40000080
+#define HV_X64_MSR_SVERSION 0x40000081
+#define HV_X64_MSR_SIEFP 0x40000082
+#define HV_X64_MSR_SIMP 0x40000083
+#define HV_X64_MSR_EOM 0x40000084
+#define HV_X64_MSR_SINT0 0x40000090
+#define HV_X64_MSR_SINT1 0x40000091
+#define HV_X64_MSR_SINT2 0x40000092
+#define HV_X64_MSR_SINT3 0x40000093
+#define HV_X64_MSR_SINT4 0x40000094
+#define HV_X64_MSR_SINT5 0x40000095
+#define HV_X64_MSR_SINT6 0x40000096
+#define HV_X64_MSR_SINT7 0x40000097
+#define HV_X64_MSR_SINT8 0x40000098
+#define HV_X64_MSR_SINT9 0x40000099
+#define HV_X64_MSR_SINT10 0x4000009A
+#define HV_X64_MSR_SINT11 0x4000009B
+#define HV_X64_MSR_SINT12 0x4000009C
+#define HV_X64_MSR_SINT13 0x4000009D
+#define HV_X64_MSR_SINT14 0x4000009E
+#define HV_X64_MSR_SINT15 0x4000009F
+
+
+#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001
+#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12
+#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \
+ (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
+
+/* Declare the various hypercall operations. */
+#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT 0x0008
+
+#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001
+#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12
+#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \
+ (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
+
+#define HV_PROCESSOR_POWER_STATE_C0 0
+#define HV_PROCESSOR_POWER_STATE_C1 1
+#define HV_PROCESSOR_POWER_STATE_C2 2
+#define HV_PROCESSOR_POWER_STATE_C3 3
+
+/* hypercall status code */
+#define HV_STATUS_SUCCESS 0
+#define HV_STATUS_INVALID_HYPERCALL_CODE 2
+#define HV_STATUS_INVALID_HYPERCALL_INPUT 3
+#define HV_STATUS_INVALID_ALIGNMENT 4
+
+#endif
diff --git a/linux/include/asm-x86/kvm.h b/linux/include/asm-x86/kvm.h
new file mode 100644
index 0000000..12ddb51
--- /dev/null
+++ b/linux/include/asm-x86/kvm.h
@@ -0,0 +1,364 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+#ifndef _ASM_X86_KVM_H
+#define _ASM_X86_KVM_H
+
+/*
+ * KVM x86 specific structures and definitions
+ *
+ */
+
+#include <asm/types.h>
+#include <linux/ioctl.h>
+
+/* Select x86 specific features in <linux/kvm.h> */
+#define __KVM_HAVE_PIT
+#define __KVM_HAVE_IOAPIC
+#define __KVM_HAVE_DEVICE_ASSIGNMENT
+#define __KVM_HAVE_MSI
+#define __KVM_HAVE_USER_NMI
+#define __KVM_HAVE_GUEST_DEBUG
+#define __KVM_HAVE_MSIX
+#define __KVM_HAVE_MCE
+#define __KVM_HAVE_PIT_STATE2
+#define __KVM_HAVE_XEN_HVM
+#define __KVM_HAVE_VCPU_EVENTS
+#define __KVM_HAVE_DEBUGREGS
+#define __KVM_HAVE_XSAVE
+#define __KVM_HAVE_XCRS
+
+/* Architectural interrupt line count. */
+#define KVM_NR_INTERRUPTS 256
+
+struct kvm_memory_alias {
+ __u32 slot; /* this has a different namespace than memory slots */
+ __u32 flags;
+ __u64 guest_phys_addr;
+ __u64 memory_size;
+ __u64 target_phys_addr;
+};
+
+/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
+struct kvm_pic_state {
+ __u8 last_irr; /* edge detection */
+ __u8 irr; /* interrupt request register */
+ __u8 imr; /* interrupt mask register */
+ __u8 isr; /* interrupt service register */
+ __u8 priority_add; /* highest irq priority */
+ __u8 irq_base;
+ __u8 read_reg_select;
+ __u8 poll;
+ __u8 special_mask;
+ __u8 init_state;
+ __u8 auto_eoi;
+ __u8 rotate_on_auto_eoi;
+ __u8 special_fully_nested_mode;
+ __u8 init4; /* true if 4 byte init */
+ __u8 elcr; /* PIIX edge/trigger selection */
+ __u8 elcr_mask;
+};
+
+#define KVM_IOAPIC_NUM_PINS 24
+struct kvm_ioapic_state {
+ __u64 base_address;
+ __u32 ioregsel;
+ __u32 id;
+ __u32 irr;
+ __u32 pad;
+ union {
+ __u64 bits;
+ struct {
+ __u8 vector;
+ __u8 delivery_mode:3;
+ __u8 dest_mode:1;
+ __u8 delivery_status:1;
+ __u8 polarity:1;
+ __u8 remote_irr:1;
+ __u8 trig_mode:1;
+ __u8 mask:1;
+ __u8 reserve:7;
+ __u8 reserved[4];
+ __u8 dest_id;
+ } fields;
+ } redirtbl[KVM_IOAPIC_NUM_PINS];
+};
+
+#define KVM_IRQCHIP_PIC_MASTER 0
+#define KVM_IRQCHIP_PIC_SLAVE 1
+#define KVM_IRQCHIP_IOAPIC 2
+#define KVM_NR_IRQCHIPS 3
+
+/* for KVM_GET_REGS and KVM_SET_REGS */
+struct kvm_regs {
+ /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
+ __u64 rax, rbx, rcx, rdx;
+ __u64 rsi, rdi, rsp, rbp;
+ __u64 r8, r9, r10, r11;
+ __u64 r12, r13, r14, r15;
+ __u64 rip, rflags;
+};
+
+/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
+#define KVM_APIC_REG_SIZE 0x400
+struct kvm_lapic_state {
+ char regs[KVM_APIC_REG_SIZE];
+};
+
+struct kvm_segment {
+ __u64 base;
+ __u32 limit;
+ __u16 selector;
+ __u8 type;
+ __u8 present, dpl, db, s, l, g, avl;
+ __u8 unusable;
+ __u8 padding;
+};
+
+struct kvm_dtable {
+ __u64 base;
+ __u16 limit;
+ __u16 padding[3];
+};
+
+
+/* for KVM_GET_SREGS and KVM_SET_SREGS */
+struct kvm_sregs {
+ /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
+ struct kvm_segment cs, ds, es, fs, gs, ss;
+ struct kvm_segment tr, ldt;
+ struct kvm_dtable gdt, idt;
+ __u64 cr0, cr2, cr3, cr4, cr8;
+ __u64 efer;
+ __u64 apic_base;
+ __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
+};
+
+/* for KVM_GET_FPU and KVM_SET_FPU */
+struct kvm_fpu {
+ __u8 fpr[8][16];
+ __u16 fcw;
+ __u16 fsw;
+ __u8 ftwx; /* in fxsave format */
+ __u8 pad1;
+ __u16 last_opcode;
+ __u64 last_ip;
+ __u64 last_dp;
+ __u8 xmm[16][16];
+ __u32 mxcsr;
+ __u32 pad2;
+};
+
+struct kvm_msr_entry {
+ __u32 index;
+ __u32 reserved;
+ __u64 data;
+};
+
+/* for KVM_GET_MSRS and KVM_SET_MSRS */
+struct kvm_msrs {
+ __u32 nmsrs; /* number of msrs in entries */
+ __u32 pad;
+
+ struct kvm_msr_entry entries[0];
+};
+
+/* for KVM_GET_MSR_INDEX_LIST */
+struct kvm_msr_list {
+ __u32 nmsrs; /* number of msrs in entries */
+ __u32 indices[0];
+};
+
+
+struct kvm_cpuid_entry {
+ __u32 function;
+ __u32 eax;
+ __u32 ebx;
+ __u32 ecx;
+ __u32 edx;
+ __u32 padding;
+};
+
+/* for KVM_SET_CPUID */
+struct kvm_cpuid {
+ __u32 nent;
+ __u32 padding;
+ struct kvm_cpuid_entry entries[0];
+};
+
+struct kvm_cpuid_entry2 {
+ __u32 function;
+ __u32 index;
+ __u32 flags;
+ __u32 eax;
+ __u32 ebx;
+ __u32 ecx;
+ __u32 edx;
+ __u32 padding[3];
+};
+
+#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
+#define KVM_CPUID_FLAG_STATEFUL_FUNC 2
+#define KVM_CPUID_FLAG_STATE_READ_NEXT 4
+
+/* for KVM_SET_CPUID2 */
+struct kvm_cpuid2 {
+ __u32 nent;
+ __u32 padding;
+ struct kvm_cpuid_entry2 entries[0];
+};
+
+/* for KVM_GET_PIT and KVM_SET_PIT */
+struct kvm_pit_channel_state {
+ __u32 count; /* can be 65536 */
+ __u16 latched_count;
+ __u8 count_latched;
+ __u8 status_latched;
+ __u8 status;
+ __u8 read_state;
+ __u8 write_state;
+ __u8 write_latch;
+ __u8 rw_mode;
+ __u8 mode;
+ __u8 bcd;
+ __u8 gate;
+ __s64 count_load_time;
+};
+
+struct kvm_debug_exit_arch {
+ __u32 exception;
+ __u32 pad;
+ __u64 pc;
+ __u64 dr6;
+ __u64 dr7;
+};
+
+#define KVM_GUESTDBG_USE_SW_BP 0x00010000
+#define KVM_GUESTDBG_USE_HW_BP 0x00020000
+#define KVM_GUESTDBG_INJECT_DB 0x00040000
+#define KVM_GUESTDBG_INJECT_BP 0x00080000
+
+/* for KVM_SET_GUEST_DEBUG */
+struct kvm_guest_debug_arch {
+ __u64 debugreg[8];
+};
+
+struct kvm_pit_state {
+ struct kvm_pit_channel_state channels[3];
+};
+
+#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001
+
+struct kvm_pit_state2 {
+ struct kvm_pit_channel_state channels[3];
+ __u32 flags;
+ __u32 reserved[9];
+};
+
+struct kvm_reinject_control {
+ __u8 pit_reinject;
+ __u8 reserved[31];
+};
+
+/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */
+#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001
+#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002
+#define KVM_VCPUEVENT_VALID_SHADOW 0x00000004
+
+/* Interrupt shadow states */
+#define KVM_X86_SHADOW_INT_MOV_SS 0x01
+#define KVM_X86_SHADOW_INT_STI 0x02
+
+/* for KVM_GET/SET_VCPU_EVENTS */
+struct kvm_vcpu_events {
+ struct {
+ __u8 injected;
+ __u8 nr;
+ __u8 has_error_code;
+ __u8 pad;
+ __u32 error_code;
+ } exception;
+ struct {
+ __u8 injected;
+ __u8 nr;
+ __u8 soft;
+ __u8 shadow;
+ } interrupt;
+ struct {
+ __u8 injected;
+ __u8 pending;
+ __u8 masked;
+ __u8 pad;
+ } nmi;
+ __u32 sipi_vector;
+ __u32 flags;
+ __u32 reserved[10];
+};
+
+/* for KVM_GET/SET_DEBUGREGS */
+struct kvm_debugregs {
+ __u64 db[4];
+ __u64 dr6;
+ __u64 dr7;
+ __u64 flags;
+ __u64 reserved[9];
+};
+
+/* for KVM_CAP_XSAVE */
+struct kvm_xsave {
+ __u32 region[1024];
+};
+
+#define KVM_MAX_XCRS 16
+
+struct kvm_xcr {
+ __u32 xcr;
+ __u32 reserved;
+ __u64 value;
+};
+
+struct kvm_xcrs {
+ __u32 nr_xcrs;
+ __u32 flags;
+ struct kvm_xcr xcrs[KVM_MAX_XCRS];
+ __u64 padding[16];
+};
+
+#endif /* _ASM_X86_KVM_H */
diff --git a/linux/include/asm-x86/kvm_emulate.h b/linux/include/asm-x86/kvm_emulate.h
new file mode 100644
index 0000000..a57ee7e
--- /dev/null
+++ b/linux/include/asm-x86/kvm_emulate.h
@@ -0,0 +1,318 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+/******************************************************************************
+ * x86_emulate.h
+ *
+ * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
+ *
+ * Copyright (c) 2005 Keir Fraser
+ *
+ * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
+ */
+
+#ifndef _ASM_X86_KVM_X86_EMULATE_H
+#define _ASM_X86_KVM_X86_EMULATE_H
+
+
+
+struct x86_emulate_ctxt;
+
+struct x86_exception {
+ u8 vector;
+ bool error_code_valid;
+ u16 error_code;
+ bool nested_page_fault;
+ u64 address; /* cr2 or nested page fault gpa */
+};
+
+/*
+ * x86_emulate_ops:
+ *
+ * These operations represent the instruction emulator's interface to memory.
+ * There are two categories of operation: those that act on ordinary memory
+ * regions (*_std), and those that act on memory regions known to require
+ * special treatment or emulation (*_emulated).
+ *
+ * The emulator assumes that an instruction accesses only one 'emulated memory'
+ * location, that this location is the given linear faulting address (cr2), and
+ * that this is one of the instruction's data operands. Instruction fetches and
+ * stack operations are assumed never to access emulated memory. The emulator
+ * automatically deduces which operand of a string-move operation is accessing
+ * emulated memory, and assumes that the other operand accesses normal memory.
+ *
+ * NOTES:
+ * 1. The emulator isn't very smart about emulated vs. standard memory.
+ * 'Emulated memory' access addresses should be checked for sanity.
+ * 'Normal memory' accesses may fault, and the caller must arrange to
+ * detect and handle reentrancy into the emulator via recursive faults.
+ * Accesses may be unaligned and may cross page boundaries.
+ * 2. If the access fails (cannot emulate, or a standard access faults) then
+ * it is up to the memop to propagate the fault to the guest VM via
+ * some out-of-band mechanism, unknown to the emulator. The memop signals
+ * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will
+ * then immediately bail.
+ * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only
+ * cmpxchg8b_emulated need support 8-byte accesses.
+ * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
+ */
+/* Access completed successfully: continue emulation as normal. */
+#define X86EMUL_CONTINUE 0
+/* Access is unhandleable: bail from emulation and return error to caller. */
+#define X86EMUL_UNHANDLEABLE 1
+/* Terminate emulation but return success to the caller. */
+#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
+#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */
+#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */
+#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */
+
+struct x86_emulate_ops {
+ /*
+ * read_std: Read bytes of standard (non-emulated/special) memory.
+ * Used for descriptor reading.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ */
+ int (*read_std)(unsigned long addr, void *val,
+ unsigned int bytes, struct kvm_vcpu *vcpu,
+ struct x86_exception *fault);
+
+ /*
+ * write_std: Write bytes of standard (non-emulated/special) memory.
+ * Used for descriptor writing.
+ * @addr: [IN ] Linear address to which to write.
+ * @val: [OUT] Value write to memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to write to memory.
+ */
+ int (*write_std)(unsigned long addr, void *val,
+ unsigned int bytes, struct kvm_vcpu *vcpu,
+ struct x86_exception *fault);
+ /*
+ * fetch: Read bytes of standard (non-emulated/special) memory.
+ * Used for instruction fetch.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ */
+ int (*fetch)(unsigned long addr, void *val,
+ unsigned int bytes, struct kvm_vcpu *vcpu,
+ struct x86_exception *fault);
+
+ /*
+ * read_emulated: Read bytes from emulated/special memory area.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ */
+ int (*read_emulated)(unsigned long addr,
+ void *val,
+ unsigned int bytes,
+ struct x86_exception *fault,
+ struct kvm_vcpu *vcpu);
+
+ /*
+ * write_emulated: Write bytes to emulated/special memory area.
+ * @addr: [IN ] Linear address to which to write.
+ * @val: [IN ] Value to write to memory (low-order bytes used as
+ * required).
+ * @bytes: [IN ] Number of bytes to write to memory.
+ */
+ int (*write_emulated)(unsigned long addr,
+ const void *val,
+ unsigned int bytes,
+ struct x86_exception *fault,
+ struct kvm_vcpu *vcpu);
+
+ /*
+ * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
+ * emulated/special memory area.
+ * @addr: [IN ] Linear address to access.
+ * @old: [IN ] Value expected to be current at @addr.
+ * @new: [IN ] Value to write to @addr.
+ * @bytes: [IN ] Number of bytes to access using CMPXCHG.
+ */
+ int (*cmpxchg_emulated)(unsigned long addr,
+ const void *old,
+ const void *new,
+ unsigned int bytes,
+ struct x86_exception *fault,
+ struct kvm_vcpu *vcpu);
+
+ int (*pio_in_emulated)(int size, unsigned short port, void *val,
+ unsigned int count, struct kvm_vcpu *vcpu);
+
+ int (*pio_out_emulated)(int size, unsigned short port, const void *val,
+ unsigned int count, struct kvm_vcpu *vcpu);
+
+ bool (*get_cached_descriptor)(struct kvm_desc_struct *desc,
+ int seg, struct kvm_vcpu *vcpu);
+ void (*set_cached_descriptor)(struct kvm_desc_struct *desc,
+ int seg, struct kvm_vcpu *vcpu);
+ u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
+ void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
+ unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu);
+ void (*get_gdt)(struct kvm_desc_ptr *dt, struct kvm_vcpu *vcpu);
+ void (*get_idt)(struct kvm_desc_ptr *dt, struct kvm_vcpu *vcpu);
+ ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
+ int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
+ int (*cpl)(struct kvm_vcpu *vcpu);
+ int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu);
+ int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu);
+ int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+ int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
+};
+
+/* Type, address-of, and value of an instruction's operand. */
+struct operand {
+ enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
+ unsigned int bytes;
+ union {
+ unsigned long orig_val;
+ u64 orig_val64;
+ };
+ union {
+ unsigned long *reg;
+ struct segmented_address {
+ ulong ea;
+ unsigned seg;
+ } mem;
+ } addr;
+ union {
+ unsigned long val;
+ u64 val64;
+ char valptr[sizeof(unsigned long) + 2];
+ };
+};
+
+struct fetch_cache {
+ u8 data[15];
+ unsigned long start;
+ unsigned long end;
+};
+
+struct read_cache {
+ u8 data[1024];
+ unsigned long pos;
+ unsigned long end;
+};
+
+struct decode_cache {
+ u8 twobyte;
+ u8 b;
+ u8 lock_prefix;
+ u8 rep_prefix;
+ u8 op_bytes;
+ u8 ad_bytes;
+ u8 rex_prefix;
+ struct operand src;
+ struct operand src2;
+ struct operand dst;
+ bool has_seg_override;
+ u8 seg_override;
+ unsigned int d;
+ int (*execute)(struct x86_emulate_ctxt *ctxt);
+ unsigned long regs[NR_VCPU_REGS];
+ unsigned long eip;
+ /* modrm */
+ u8 modrm;
+ u8 modrm_mod;
+ u8 modrm_reg;
+ u8 modrm_rm;
+ u8 modrm_seg;
+ bool rip_relative;
+ struct fetch_cache fetch;
+ struct read_cache io_read;
+ struct read_cache mem_read;
+};
+
+struct x86_emulate_ctxt {
+ struct x86_emulate_ops *ops;
+
+ /* Register state before/after emulation. */
+ struct kvm_vcpu *vcpu;
+
+ unsigned long eflags;
+ unsigned long eip; /* eip before instruction emulation */
+ /* Emulated execution mode, represented by an X86EMUL_MODE value. */
+ int mode;
+ u32 cs_base;
+
+ /* interruptibility state, as a result of execution of STI or MOV SS */
+ int interruptibility;
+
+ bool perm_ok; /* do not check permissions if true */
+
+ bool have_exception;
+ struct x86_exception exception;
+
+ /* decode cache */
+ struct decode_cache decode;
+};
+
+/* Repeat String Operation Prefix */
+#define REPE_PREFIX 1
+#define REPNE_PREFIX 2
+
+/* Execution mode, passed to the emulator. */
+#define X86EMUL_MODE_REAL 0 /* Real mode. */
+#define X86EMUL_MODE_VM86 1 /* Virtual 8086 mode. */
+#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
+#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
+#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
+
+/* Host execution mode. */
+#if defined(CONFIG_X86_32)
+#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
+#elif defined(CONFIG_X86_64)
+#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
+#endif
+
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
+#define EMULATION_FAILED -1
+#define EMULATION_OK 0
+#define EMULATION_RESTART 1
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
+int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
+ u16 tss_selector, int reason,
+ bool has_error_code, u32 error_code);
+int emulate_int_real(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, int irq);
+#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/linux/include/asm-x86/kvm_host.h b/linux/include/asm-x86/kvm_host.h
new file mode 100644
index 0000000..0c5e1f7
--- /dev/null
+++ b/linux/include/asm-x86/kvm_host.h
@@ -0,0 +1,888 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This header defines architecture specific interfaces, x86 version
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef _ASM_X86_KVM_HOST_H
+#define _ASM_X86_KVM_HOST_H
+
+#include <asm/types.h>
+#include <linux/mm.h>
+#include <linux/mmu_notifier.h>
+#include <linux/tracepoint.h>
+#include <linux/cpumask.h>
+
+#include <linux/kvm.h>
+#include <linux/kvm_para.h>
+#include <linux/kvm_types.h>
+
+#include <asm/pvclock-abi.h>
+#include <asm/desc.h>
+#include <asm/mtrr.h>
+#include <asm/msr-index.h>
+
+#define KVM_MAX_VCPUS 64
+#define KVM_MEMORY_SLOTS 32
+/* memory slots that does not exposed to userspace */
+#define KVM_PRIVATE_MEM_SLOTS 4
+
+#define KVM_PIO_PAGE_OFFSET 1
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
+
+#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
+#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
+#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
+ 0xFFFFFF0000000000ULL)
+
+#define INVALID_PAGE (~(hpa_t)0)
+#define VALID_PAGE(x) ((x) != INVALID_PAGE)
+
+#define UNMAPPED_GVA (~(gpa_t)0)
+
+/* KVM Hugepage definitions for x86 */
+#define KVM_NR_PAGE_SIZES 3
+#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9)
+#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
+#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
+#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
+#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
+
+#define DE_VECTOR 0
+#define DB_VECTOR 1
+#define BP_VECTOR 3
+#define OF_VECTOR 4
+#define BR_VECTOR 5
+#define UD_VECTOR 6
+#define NM_VECTOR 7
+#define DF_VECTOR 8
+#define TS_VECTOR 10
+#define NP_VECTOR 11
+#define SS_VECTOR 12
+#define GP_VECTOR 13
+#define PF_VECTOR 14
+#define MF_VECTOR 16
+#define MC_VECTOR 18
+
+#define SELECTOR_TI_MASK (1 << 2)
+#define SELECTOR_RPL_MASK 0x03
+
+#define IOPL_SHIFT 12
+
+#define KVM_PERMILLE_MMU_PAGES 20
+#define KVM_MIN_ALLOC_MMU_PAGES 64
+#define KVM_MMU_HASH_SHIFT 10
+#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
+#define KVM_MIN_FREE_MMU_PAGES 5
+#define KVM_REFILL_PAGES 25
+#define KVM_MAX_CPUID_ENTRIES 80
+#define KVM_NR_FIXED_MTRR_REGION 88
+#define KVM_NR_VAR_MTRR 8
+
+#define ASYNC_PF_PER_VCPU 64
+
+extern spinlock_t kvm_lock;
+extern struct list_head vm_list;
+
+struct kvm_vcpu;
+struct kvm;
+struct kvm_async_pf;
+
+enum kvm_reg {
+ VCPU_REGS_RAX = 0,
+ VCPU_REGS_RCX = 1,
+ VCPU_REGS_RDX = 2,
+ VCPU_REGS_RBX = 3,
+ VCPU_REGS_RSP = 4,
+ VCPU_REGS_RBP = 5,
+ VCPU_REGS_RSI = 6,
+ VCPU_REGS_RDI = 7,
+#ifdef CONFIG_X86_64
+ VCPU_REGS_R8 = 8,
+ VCPU_REGS_R9 = 9,
+ VCPU_REGS_R10 = 10,
+ VCPU_REGS_R11 = 11,
+ VCPU_REGS_R12 = 12,
+ VCPU_REGS_R13 = 13,
+ VCPU_REGS_R14 = 14,
+ VCPU_REGS_R15 = 15,
+#endif
+ VCPU_REGS_RIP,
+ NR_VCPU_REGS
+};
+
+enum kvm_reg_ex {
+ VCPU_EXREG_PDPTR = NR_VCPU_REGS,
+ VCPU_EXREG_CR3,
+};
+
+enum {
+ VCPU_SREG_ES,
+ VCPU_SREG_CS,
+ VCPU_SREG_SS,
+ VCPU_SREG_DS,
+ VCPU_SREG_FS,
+ VCPU_SREG_GS,
+ VCPU_SREG_TR,
+ VCPU_SREG_LDTR,
+};
+
+#include <asm/kvm_emulate.h>
+
+#define KVM_NR_MEM_OBJS 40
+
+#define KVM_NR_DB_REGS 4
+
+#define DR6_BD (1 << 13)
+#define DR6_BS (1 << 14)
+#define DR6_FIXED_1 0xffff0ff0
+#define DR6_VOLATILE 0x0000e00f
+
+#define DR7_BP_EN_MASK 0x000000ff
+#define DR7_GE (1 << 9)
+#define DR7_GD (1 << 13)
+#define DR7_FIXED_1 0x00000400
+#define DR7_VOLATILE 0xffff23ff
+
+/*
+ * We don't want allocation failures within the mmu code, so we preallocate
+ * enough memory for a single page fault in a cache.
+ */
+struct kvm_mmu_memory_cache {
+ int nobjs;
+ void *objects[KVM_NR_MEM_OBJS];
+};
+
+#define NR_PTE_CHAIN_ENTRIES 5
+
+struct kvm_pte_chain {
+ u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
+ struct hlist_node link;
+};
+
+/*
+ * kvm_mmu_page_role, below, is defined as:
+ *
+ * bits 0:3 - total guest paging levels (2-4, or zero for real mode)
+ * bits 4:7 - page table level for this shadow (1-4)
+ * bits 8:9 - page table quadrant for 2-level guests
+ * bit 16 - direct mapping of virtual to physical mapping at gfn
+ * used for real mode and two-dimensional paging
+ * bits 17:19 - common access permissions for all ptes in this shadow page
+ */
+union kvm_mmu_page_role {
+ unsigned word;
+ struct {
+ unsigned level:4;
+ unsigned cr4_pae:1;
+ unsigned quadrant:2;
+ unsigned pad_for_nice_hex_output:6;
+ unsigned direct:1;
+ unsigned access:3;
+ unsigned invalid:1;
+ unsigned nxe:1;
+ unsigned cr0_wp:1;
+ };
+};
+
+struct kvm_mmu_page {
+ struct list_head link;
+ struct hlist_node hash_link;
+
+ /*
+ * The following two entries are used to key the shadow page in the
+ * hash table.
+ */
+ gfn_t gfn;
+ union kvm_mmu_page_role role;
+
+ u64 *spt;
+ /* hold the gfn of each spte inside spt */
+ gfn_t *gfns;
+ /*
+ * One bit set per slot which has memory
+ * in this shadow page.
+ */
+ DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
+ bool multimapped; /* More than one parent_pte? */
+ bool unsync;
+ int root_count; /* Currently serving as active root */
+ unsigned int unsync_children;
+ union {
+ u64 *parent_pte; /* !multimapped */
+ struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
+ };
+ DECLARE_BITMAP(unsync_child_bitmap, 512);
+};
+
+struct kvm_pv_mmu_op_buffer {
+ void *ptr;
+ unsigned len;
+ unsigned processed;
+ char buf[512] __aligned(sizeof(long));
+};
+
+struct kvm_pio_request {
+ unsigned long count;
+ int in;
+ int port;
+ int size;
+};
+
+/*
+ * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
+ * 32-bit). The kvm_mmu structure abstracts the details of the current mmu
+ * mode.
+ */
+struct kvm_mmu {
+ void (*new_cr3)(struct kvm_vcpu *vcpu);
+ void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
+ unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
+ int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err,
+ bool prefault);
+ void (*inject_page_fault)(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault);
+ void (*free)(struct kvm_vcpu *vcpu);
+ gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
+ struct x86_exception *exception);
+ gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
+ void (*prefetch_page)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *page);
+ int (*sync_page)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp);
+ void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
+ hpa_t root_hpa;
+ int root_level;
+ int shadow_root_level;
+ union kvm_mmu_page_role base_role;
+ bool direct_map;
+
+ u64 *pae_root;
+ u64 *lm_root;
+ u64 rsvd_bits_mask[2][4];
+
+ bool nx;
+
+ u64 pdptrs[4]; /* pae */
+};
+
+struct kvm_vcpu_arch {
+ /*
+ * rip and regs accesses must go through
+ * kvm_{register,rip}_{read,write} functions.
+ */
+ unsigned long regs[NR_VCPU_REGS];
+ u32 regs_avail;
+ u32 regs_dirty;
+
+ unsigned long cr0;
+ unsigned long cr0_guest_owned_bits;
+ unsigned long cr2;
+ unsigned long cr3;
+ unsigned long cr4;
+ unsigned long cr4_guest_owned_bits;
+ unsigned long cr8;
+ u32 hflags;
+ u64 efer;
+ u64 apic_base;
+ struct kvm_lapic *apic; /* kernel irqchip context */
+ int32_t apic_arb_prio;
+ int mp_state;
+ int sipi_vector;
+ u64 ia32_misc_enable_msr;
+ bool tpr_access_reporting;
+
+ /*
+ * Paging state of the vcpu
+ *
+ * If the vcpu runs in guest mode with two level paging this still saves
+ * the paging mode of the l1 guest. This context is always used to
+ * handle faults.
+ */
+ struct kvm_mmu mmu;
+
+ /*
+ * Paging state of an L2 guest (used for nested npt)
+ *
+ * This context will save all necessary information to walk page tables
+ * of the an L2 guest. This context is only initialized for page table
+ * walking and not for faulting since we never handle l2 page faults on
+ * the host.
+ */
+ struct kvm_mmu nested_mmu;
+
+ /*
+ * Pointer to the mmu context currently used for
+ * gva_to_gpa translations.
+ */
+ struct kvm_mmu *walk_mmu;
+
+ /* only needed in kvm_pv_mmu_op() path, but it's hot so
+ * put it here to avoid allocation */
+ struct kvm_pv_mmu_op_buffer mmu_op_buffer;
+
+ struct kvm_mmu_memory_cache mmu_pte_chain_cache;
+ struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
+ struct kvm_mmu_memory_cache mmu_page_cache;
+ struct kvm_mmu_memory_cache mmu_page_header_cache;
+
+ gfn_t last_pt_write_gfn;
+ int last_pt_write_count;
+ u64 *last_pte_updated;
+ gfn_t last_pte_gfn;
+
+ struct {
+ gfn_t gfn; /* presumed gfn during guest pte update */
+ pfn_t pfn; /* pfn corresponding to that gfn */
+ unsigned long mmu_seq;
+ } update_pte;
+
+ struct kvm_compat_fpu guest_fpu;
+ u64 xcr0;
+
+ gva_t mmio_fault_cr2;
+ struct kvm_pio_request pio;
+ void *pio_data;
+
+ u8 event_exit_inst_len;
+
+ struct kvm_queued_exception {
+ bool pending;
+ bool has_error_code;
+ bool reinject;
+ u8 nr;
+ u32 error_code;
+ } exception;
+
+ struct kvm_queued_interrupt {
+ bool pending;
+ bool soft;
+ u8 nr;
+ } interrupt;
+
+ int halt_request; /* real mode on Intel only */
+
+ int cpuid_nent;
+ struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
+ /* emulate context */
+
+ struct x86_emulate_ctxt emulate_ctxt;
+
+ gpa_t time;
+ struct kvm_pvclock_vcpu_time_info hv_clock;
+ unsigned int hw_tsc_khz;
+ unsigned int time_offset;
+ struct page *time_page;
+ u64 last_host_tsc;
+ u64 last_guest_tsc;
+ u64 last_kernel_ns;
+ u64 last_tsc_nsec;
+ u64 last_tsc_write;
+ bool tsc_catchup;
+
+ bool nmi_pending;
+ bool nmi_injected;
+
+ struct mtrr_state_type mtrr_state;
+ u32 pat;
+
+ int switch_db_regs;
+ unsigned long db[KVM_NR_DB_REGS];
+ unsigned long dr6;
+ unsigned long dr7;
+ unsigned long eff_db[KVM_NR_DB_REGS];
+
+ u64 mcg_cap;
+ u64 mcg_status;
+ u64 mcg_ctl;
+ u64 *mce_banks;
+
+ /* used for guest single stepping over the given code position */
+ unsigned long singlestep_rip;
+
+ /* fields used by HYPER-V emulation */
+ u64 hv_vapic;
+
+ cpumask_var_t wbinvd_dirty_mask;
+
+ struct {
+ bool halted;
+ gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
+ struct gfn_to_hva_cache data;
+ u64 msr_val;
+ u32 id;
+ bool send_user_only;
+ } apf;
+};
+
+struct kvm_arch {
+ unsigned int n_used_mmu_pages;
+ unsigned int n_requested_mmu_pages;
+ unsigned int n_max_mmu_pages;
+ atomic_t invlpg_counter;
+ struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
+ /*
+ * Hash table of struct kvm_mmu_page.
+ */
+ struct list_head active_mmu_pages;
+ struct list_head assigned_dev_head;
+ struct iommu_domain *iommu_domain;
+ int iommu_flags;
+ struct kvm_pic *vpic;
+ struct kvm_ioapic *vioapic;
+ struct kvm_pit *vpit;
+ int vapics_in_nmi_mode;
+
+ unsigned int tss_addr;
+ struct page *apic_access_page;
+
+ gpa_t wall_clock;
+
+ struct page *ept_identity_pagetable;
+ bool ept_identity_pagetable_done;
+ gpa_t ept_identity_map_addr;
+
+ unsigned long irq_sources_bitmap;
+ s64 kvmclock_offset;
+ spinlock_t tsc_write_lock;
+ u64 last_tsc_nsec;
+ u64 last_tsc_offset;
+ u64 last_tsc_write;
+ u32 virtual_tsc_khz;
+ u32 virtual_tsc_mult;
+ s8 virtual_tsc_shift;
+
+ struct kvm_xen_hvm_config xen_hvm_config;
+
+ /* fields used by HYPER-V emulation */
+ u64 hv_guest_os_id;
+ u64 hv_hypercall;
+
+ #ifdef CONFIG_KVM_MMU_AUDIT
+ int audit_point;
+ #endif
+};
+
+struct kvm_vm_stat {
+ u32 mmu_shadow_zapped;
+ u32 mmu_pte_write;
+ u32 mmu_pte_updated;
+ u32 mmu_pde_zapped;
+ u32 mmu_flooded;
+ u32 mmu_recycled;
+ u32 mmu_cache_miss;
+ u32 mmu_unsync;
+ u32 remote_tlb_flush;
+ u32 lpages;
+};
+
+struct kvm_vcpu_stat {
+ u32 pf_fixed;
+ u32 pf_guest;
+ u32 tlb_flush;
+ u32 invlpg;
+
+ u32 exits;
+ u32 io_exits;
+ u32 mmio_exits;
+ u32 signal_exits;
+ u32 irq_window_exits;
+ u32 nmi_window_exits;
+ u32 halt_exits;
+ u32 halt_wakeup;
+ u32 request_irq_exits;
+ u32 irq_exits;
+ u32 host_state_reload;
+ u32 efer_reload;
+ u32 fpu_reload;
+ u32 insn_emulation;
+ u32 insn_emulation_fail;
+ u32 hypercalls;
+ u32 irq_injections;
+ u32 nmi_injections;
+};
+
+struct kvm_x86_ops {
+ int (*cpu_has_kvm_support)(void); /* __init */
+ int (*disabled_by_bios)(void); /* __init */
+ int (*hardware_enable)(void *dummy);
+ void (*hardware_disable)(void *dummy);
+ void (*check_processor_compatibility)(void *rtn);
+ int (*hardware_setup)(void); /* __init */
+ void (*hardware_unsetup)(void); /* __exit */
+ bool (*cpu_has_accelerated_tpr)(void);
+ void (*cpuid_update)(struct kvm_vcpu *vcpu);
+
+ /* Create, but do not attach this VCPU */
+ struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
+ void (*vcpu_free)(struct kvm_vcpu *vcpu);
+ int (*vcpu_reset)(struct kvm_vcpu *vcpu);
+
+ void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
+ void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
+ void (*vcpu_put)(struct kvm_vcpu *vcpu);
+
+ void (*set_guest_debug)(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug *dbg);
+ int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
+ int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+ u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
+ void (*get_segment)(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg);
+ int (*get_cpl)(struct kvm_vcpu *vcpu);
+ void (*set_segment)(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg);
+ void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
+ void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
+ void (*decache_cr3)(struct kvm_vcpu *vcpu);
+ void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
+ void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
+ void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
+ void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
+ void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
+ void (*get_idt)(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt);
+ void (*set_idt)(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt);
+ void (*get_gdt)(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt);
+ void (*set_gdt)(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt);
+ void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
+ void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
+ unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
+ void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+ void (*fpu_activate)(struct kvm_vcpu *vcpu);
+ void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
+
+ void (*tlb_flush)(struct kvm_vcpu *vcpu);
+
+ void (*run)(struct kvm_vcpu *vcpu);
+ int (*handle_exit)(struct kvm_vcpu *vcpu);
+ void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+ void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
+ u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
+ void (*patch_hypercall)(struct kvm_vcpu *vcpu,
+ unsigned char *hypercall_addr);
+ void (*set_irq)(struct kvm_vcpu *vcpu);
+ void (*set_nmi)(struct kvm_vcpu *vcpu);
+ void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
+ bool has_error_code, u32 error_code,
+ bool reinject);
+ void (*cancel_injection)(struct kvm_vcpu *vcpu);
+ int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
+ int (*nmi_allowed)(struct kvm_vcpu *vcpu);
+ bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
+ void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
+ void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
+ void (*enable_irq_window)(struct kvm_vcpu *vcpu);
+ void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
+ int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
+ int (*get_tdp_level)(void);
+ u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+ int (*get_lpage_level)(void);
+ bool (*rdtscp_supported)(void);
+ void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment);
+
+ void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
+
+ void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
+
+ bool (*has_wbinvd_exit)(void);
+
+ void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
+
+ void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
+ const struct trace_print_flags *exit_reasons_str;
+};
+
+struct kvm_arch_async_pf {
+ u32 token;
+ gfn_t gfn;
+ unsigned long cr3;
+ bool direct_map;
+};
+
+extern struct kvm_x86_ops *kvm_x86_ops;
+
+int kvm_mmu_module_init(void);
+void kvm_mmu_module_exit(void);
+
+void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
+int kvm_mmu_create(struct kvm_vcpu *vcpu);
+int kvm_mmu_setup(struct kvm_vcpu *vcpu);
+void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
+void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
+ u64 dirty_mask, u64 nx_mask, u64 x_mask);
+
+int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
+void kvm_mmu_zap_all(struct kvm *kvm);
+unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
+
+int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
+
+int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const void *val, int bytes);
+int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
+ gpa_t addr, unsigned long *ret);
+u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
+
+extern bool tdp_enabled;
+
+enum emulation_result {
+ EMULATE_DONE, /* no further processing */
+ EMULATE_DO_MMIO, /* kvm_run filled with mmio request */
+ EMULATE_FAIL, /* can't emulate this instruction */
+};
+
+#define EMULTYPE_NO_DECODE (1 << 0)
+#define EMULTYPE_TRAP_UD (1 << 1)
+#define EMULTYPE_SKIP (1 << 2)
+int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
+ int emulation_type, void *insn, int insn_len);
+
+static inline int emulate_instruction(struct kvm_vcpu *vcpu,
+ int emulation_type)
+{
+ return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
+}
+
+void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
+void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
+
+void kvm_enable_efer_bits(u64);
+int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
+int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+
+struct x86_emulate_ctxt;
+
+int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
+void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
+int kvm_emulate_halt(struct kvm_vcpu *vcpu);
+int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
+int emulate_clts(struct kvm_vcpu *vcpu);
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
+
+void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
+
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
+ bool has_error_code, u32 error_code);
+
+int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
+unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
+void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
+void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
+int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+
+unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
+void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+
+void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
+void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
+void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
+void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
+int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gfn_t gfn, void *data, int offset, int len,
+ u32 access);
+void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
+bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
+
+int kvm_pic_set_irq(void *opaque, int irq, int level);
+
+void kvm_inject_nmi(struct kvm_vcpu *vcpu);
+
+int fx_init(struct kvm_vcpu *vcpu);
+
+void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
+void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const u8 *new, int bytes,
+ bool guest_initiated);
+int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
+void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
+int kvm_mmu_load(struct kvm_vcpu *vcpu);
+void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
+
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
+
+int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
+
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
+ void *insn, int insn_len);
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
+
+void kvm_enable_tdp(void);
+void kvm_disable_tdp(void);
+
+int complete_pio(struct kvm_vcpu *vcpu);
+bool kvm_check_iopl(struct kvm_vcpu *vcpu);
+
+static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
+{
+ struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
+
+ return (struct kvm_mmu_page *)page_private(page);
+}
+
+static inline u16 kvm_read_ldt(void)
+{
+ u16 ldt;
+ asm("sldt %0" : "=g"(ldt));
+ return ldt;
+}
+
+static inline void kvm_load_ldt(u16 sel)
+{
+ asm("lldt %0" : : "rm"(sel));
+}
+
+#ifdef CONFIG_X86_64
+static inline unsigned long read_msr(unsigned long msr)
+{
+ u64 value;
+
+ rdmsrl(msr, value);
+ return value;
+}
+#endif
+
+static inline u32 get_rdx_init_val(void)
+{
+ return 0x600; /* P6 family */
+}
+
+static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
+{
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+}
+
+#define TSS_IOPB_BASE_OFFSET 0x66
+#define TSS_BASE_SIZE 0x68
+#define TSS_IOPB_SIZE (65536 / 8)
+#define TSS_REDIRECTION_SIZE (256 / 8)
+#define RMODE_TSS_SIZE \
+ (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
+
+enum {
+ TASK_SWITCH_CALL = 0,
+ TASK_SWITCH_IRET = 1,
+ TASK_SWITCH_JMP = 2,
+ TASK_SWITCH_GATE = 3,
+};
+
+#define HF_GIF_MASK (1 << 0)
+#define HF_HIF_MASK (1 << 1)
+#define HF_VINTR_MASK (1 << 2)
+#define HF_NMI_MASK (1 << 3)
+#define HF_IRET_MASK (1 << 4)
+#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
+
+/*
+ * Hardware virtualization extension instructions may fault if a
+ * reboot turns off virtualization while processes are running.
+ * Trap the fault and ignore the instruction if that happens.
+ */
+asmlinkage void kvm_spurious_fault(void);
+extern bool kvm_rebooting;
+
+#define __kvm_handle_fault_on_reboot(insn) \
+ "666: " insn "\n\t" \
+ "668: \n\t" \
+ ".pushsection .fixup, \"ax\" \n" \
+ "667: \n\t" \
+ "cmpb $0, kvm_rebooting \n\t" \
+ "jne 668b \n\t" \
+ __ASM_SIZE(push) " $666b \n\t" \
+ "call kvm_spurious_fault \n\t" \
+ ".popsection \n\t" \
+ ".pushsection __ex_table, \"a\" \n\t" \
+ _ASM_PTR " 666b, 667b \n\t" \
+ ".popsection"
+
+#define KVM_ARCH_WANT_MMU_NOTIFIER
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
+int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
+int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
+
+void kvm_define_shared_msr(unsigned index, u32 msr);
+void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
+
+bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
+
+void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work);
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work);
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work);
+bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
+extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+
+void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
+
+#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/linux/include/asm-x86/kvm_para.h b/linux/include/asm-x86/kvm_para.h
new file mode 100644
index 0000000..5f575a4
--- /dev/null
+++ b/linux/include/asm-x86/kvm_para.h
@@ -0,0 +1,233 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+#ifndef _ASM_X86_KVM_PARA_H
+#define _ASM_X86_KVM_PARA_H
+
+#include <linux/types.h>
+#include <asm/hyperv.h>
+
+/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
+ * should be used to determine that a VM is running under KVM.
+ */
+#define KVM_CPUID_SIGNATURE 0x40000000
+
+/* This CPUID returns a feature bitmap in eax. Before enabling a particular
+ * paravirtualization, the appropriate feature bit should be checked.
+ */
+#define KVM_CPUID_FEATURES 0x40000001
+#define KVM_FEATURE_CLOCKSOURCE 0
+#define KVM_FEATURE_NOP_IO_DELAY 1
+#define KVM_FEATURE_MMU_OP 2
+/* This indicates that the new set of kvmclock msrs
+ * are available. The use of 0x11 and 0x12 is deprecated
+ */
+#define KVM_FEATURE_CLOCKSOURCE2 3
+#define KVM_FEATURE_ASYNC_PF 4
+
+/* The last 8 bits are used to indicate how to interpret the flags field
+ * in pvclock structure. If no bits are set, all flags are ignored.
+ */
+#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24
+
+#define MSR_KVM_WALL_CLOCK 0x11
+#define MSR_KVM_SYSTEM_TIME 0x12
+
+/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
+#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
+#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
+#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
+
+#define KVM_MAX_MMU_OP_BATCH 32
+
+#define KVM_ASYNC_PF_ENABLED (1 << 0)
+#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1)
+
+/* Operations for KVM_HC_MMU_OP */
+#define KVM_MMU_OP_WRITE_PTE 1
+#define KVM_MMU_OP_FLUSH_TLB 2
+#define KVM_MMU_OP_RELEASE_PT 3
+
+/* Payload for KVM_HC_MMU_OP */
+struct kvm_mmu_op_header {
+ __u32 op;
+ __u32 pad;
+};
+
+struct kvm_mmu_op_write_pte {
+ struct kvm_mmu_op_header header;
+ __u64 pte_phys;
+ __u64 pte_val;
+};
+
+struct kvm_mmu_op_flush_tlb {
+ struct kvm_mmu_op_header header;
+};
+
+struct kvm_mmu_op_release_pt {
+ struct kvm_mmu_op_header header;
+ __u64 pt_phys;
+};
+
+#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
+#define KVM_PV_REASON_PAGE_READY 2
+
+struct kvm_vcpu_pv_apf_data {
+ __u32 reason;
+ __u8 pad[60];
+ __u32 enabled;
+};
+
+#ifdef __KERNEL__
+#include <asm/processor.h>
+
+extern void kvmclock_init(void);
+extern int kvm_register_clock(char *txt);
+
+
+/* This instruction is vmcall. On non-VT architectures, it will generate a
+ * trap that we will then rewrite to the appropriate instruction.
+ */
+#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
+
+/* For KVM hypercalls, a three-byte sequence of either the vmrun or the vmmrun
+ * instruction. The hypervisor may replace it with something else but only the
+ * instructions are guaranteed to be supported.
+ *
+ * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
+ * The hypercall number should be placed in rax and the return value will be
+ * placed in rax. No other registers will be clobbered unless explicited
+ * noted by the particular hypercall.
+ */
+
+static inline long kvm_hypercall0(unsigned int nr)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr)
+ : "memory");
+ return ret;
+}
+
+static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr), "b"(p1)
+ : "memory");
+ return ret;
+}
+
+static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
+ unsigned long p2)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr), "b"(p1), "c"(p2)
+ : "memory");
+ return ret;
+}
+
+static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
+ unsigned long p2, unsigned long p3)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr), "b"(p1), "c"(p2), "d"(p3)
+ : "memory");
+ return ret;
+}
+
+static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
+ unsigned long p2, unsigned long p3,
+ unsigned long p4)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4)
+ : "memory");
+ return ret;
+}
+
+static inline int kvm_para_available(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ char signature[13];
+
+ cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
+ memcpy(signature + 0, &ebx, 4);
+ memcpy(signature + 4, &ecx, 4);
+ memcpy(signature + 8, &edx, 4);
+ signature[12] = 0;
+
+ if (strcmp(signature, "KVMKVMKVM") == 0)
+ return 1;
+
+ return 0;
+}
+
+static inline unsigned int kvm_arch_para_features(void)
+{
+ return cpuid_eax(KVM_CPUID_FEATURES);
+}
+
+#ifdef CONFIG_KVM_GUEST
+void __init kvm_guest_init(void);
+void kvm_async_pf_task_wait(u32 token);
+void kvm_async_pf_task_wake(u32 token);
+u32 kvm_read_and_reset_pf_reason(void);
+#else
+#define kvm_guest_init() do { } while (0)
+#define kvm_async_pf_task_wait(T) do {} while(0)
+#define kvm_async_pf_task_wake(T) do {} while(0)
+static inline u32 kvm_read_and_reset_pf_reason(void)
+{
+ return 0;
+}
+#endif
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_KVM_PARA_H */
diff --git a/linux/include/asm-x86/svm.h b/linux/include/asm-x86/svm.h
new file mode 100644
index 0000000..5529318
--- /dev/null
+++ b/linux/include/asm-x86/svm.h
@@ -0,0 +1,392 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+#ifndef __SVM_H
+#define __SVM_H
+
+enum {
+ INTERCEPT_INTR,
+ INTERCEPT_NMI,
+ INTERCEPT_SMI,
+ INTERCEPT_INIT,
+ INTERCEPT_VINTR,
+ INTERCEPT_SELECTIVE_CR0,
+ INTERCEPT_STORE_IDTR,
+ INTERCEPT_STORE_GDTR,
+ INTERCEPT_STORE_LDTR,
+ INTERCEPT_STORE_TR,
+ INTERCEPT_LOAD_IDTR,
+ INTERCEPT_LOAD_GDTR,
+ INTERCEPT_LOAD_LDTR,
+ INTERCEPT_LOAD_TR,
+ INTERCEPT_RDTSC,
+ INTERCEPT_RDPMC,
+ INTERCEPT_PUSHF,
+ INTERCEPT_POPF,
+ INTERCEPT_CPUID,
+ INTERCEPT_RSM,
+ INTERCEPT_IRET,
+ INTERCEPT_INTn,
+ INTERCEPT_INVD,
+ INTERCEPT_PAUSE,
+ INTERCEPT_HLT,
+ INTERCEPT_INVLPG,
+ INTERCEPT_INVLPGA,
+ INTERCEPT_IOIO_PROT,
+ INTERCEPT_MSR_PROT,
+ INTERCEPT_TASK_SWITCH,
+ INTERCEPT_FERR_FREEZE,
+ INTERCEPT_SHUTDOWN,
+ INTERCEPT_VMRUN,
+ INTERCEPT_VMMCALL,
+ INTERCEPT_VMLOAD,
+ INTERCEPT_VMSAVE,
+ INTERCEPT_STGI,
+ INTERCEPT_CLGI,
+ INTERCEPT_SKINIT,
+ INTERCEPT_RDTSCP,
+ INTERCEPT_ICEBP,
+ INTERCEPT_WBINVD,
+ INTERCEPT_MONITOR,
+ INTERCEPT_MWAIT,
+ INTERCEPT_MWAIT_COND,
+ INTERCEPT_XSETBV,
+};
+
+
+struct __attribute__ ((__packed__)) vmcb_control_area {
+ u32 intercept_cr;
+ u32 intercept_dr;
+ u32 intercept_exceptions;
+ u64 intercept;
+ u8 reserved_1[42];
+ u16 pause_filter_count;
+ u64 iopm_base_pa;
+ u64 msrpm_base_pa;
+ u64 tsc_offset;
+ u32 asid;
+ u8 tlb_ctl;
+ u8 reserved_2[3];
+ u32 int_ctl;
+ u32 int_vector;
+ u32 int_state;
+ u8 reserved_3[4];
+ u32 exit_code;
+ u32 exit_code_hi;
+ u64 exit_info_1;
+ u64 exit_info_2;
+ u32 exit_int_info;
+ u32 exit_int_info_err;
+ u64 nested_ctl;
+ u8 reserved_4[16];
+ u32 event_inj;
+ u32 event_inj_err;
+ u64 nested_cr3;
+ u64 lbr_ctl;
+ u32 clean;
+ u32 reserved_5;
+ u64 next_rip;
+ u8 insn_len;
+ u8 insn_bytes[15];
+ u8 reserved_6[800];
+};
+
+
+#define TLB_CONTROL_DO_NOTHING 0
+#define TLB_CONTROL_FLUSH_ALL_ASID 1
+#define TLB_CONTROL_FLUSH_ASID 3
+#define TLB_CONTROL_FLUSH_ASID_LOCAL 7
+
+#define V_TPR_MASK 0x0f
+
+#define V_IRQ_SHIFT 8
+#define V_IRQ_MASK (1 << V_IRQ_SHIFT)
+
+#define V_INTR_PRIO_SHIFT 16
+#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)
+
+#define V_IGN_TPR_SHIFT 20
+#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT)
+
+#define V_INTR_MASKING_SHIFT 24
+#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT)
+
+#define SVM_INTERRUPT_SHADOW_MASK 1
+
+#define SVM_IOIO_STR_SHIFT 2
+#define SVM_IOIO_REP_SHIFT 3
+#define SVM_IOIO_SIZE_SHIFT 4
+#define SVM_IOIO_ASIZE_SHIFT 7
+
+#define SVM_IOIO_TYPE_MASK 1
+#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT)
+#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT)
+#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
+#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
+
+#define SVM_VM_CR_VALID_MASK 0x001fULL
+#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL
+#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL
+
+struct __attribute__ ((__packed__)) vmcb_seg {
+ u16 selector;
+ u16 attrib;
+ u32 limit;
+ u64 base;
+};
+
+struct __attribute__ ((__packed__)) vmcb_save_area {
+ struct vmcb_seg es;
+ struct vmcb_seg cs;
+ struct vmcb_seg ss;
+ struct vmcb_seg ds;
+ struct vmcb_seg fs;
+ struct vmcb_seg gs;
+ struct vmcb_seg gdtr;
+ struct vmcb_seg ldtr;
+ struct vmcb_seg idtr;
+ struct vmcb_seg tr;
+ u8 reserved_1[43];
+ u8 cpl;
+ u8 reserved_2[4];
+ u64 efer;
+ u8 reserved_3[112];
+ u64 cr4;
+ u64 cr3;
+ u64 cr0;
+ u64 dr7;
+ u64 dr6;
+ u64 rflags;
+ u64 rip;
+ u8 reserved_4[88];
+ u64 rsp;
+ u8 reserved_5[24];
+ u64 rax;
+ u64 star;
+ u64 lstar;
+ u64 cstar;
+ u64 sfmask;
+ u64 kernel_gs_base;
+ u64 sysenter_cs;
+ u64 sysenter_esp;
+ u64 sysenter_eip;
+ u64 cr2;
+ u8 reserved_6[32];
+ u64 g_pat;
+ u64 dbgctl;
+ u64 br_from;
+ u64 br_to;
+ u64 last_excp_from;
+ u64 last_excp_to;
+};
+
+struct __attribute__ ((__packed__)) vmcb {
+ struct vmcb_control_area control;
+ struct vmcb_save_area save;
+};
+
+#define SVM_CPUID_FEATURE_SHIFT 2
+#define SVM_CPUID_FUNC 0x8000000a
+
+#define SVM_VM_CR_SVM_DISABLE 4
+
+#define SVM_SELECTOR_S_SHIFT 4
+#define SVM_SELECTOR_DPL_SHIFT 5
+#define SVM_SELECTOR_P_SHIFT 7
+#define SVM_SELECTOR_AVL_SHIFT 8
+#define SVM_SELECTOR_L_SHIFT 9
+#define SVM_SELECTOR_DB_SHIFT 10
+#define SVM_SELECTOR_G_SHIFT 11
+
+#define SVM_SELECTOR_TYPE_MASK (0xf)
+#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT)
+#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT)
+#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT)
+#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT)
+#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT)
+#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT)
+#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT)
+
+#define SVM_SELECTOR_WRITE_MASK (1 << 1)
+#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
+#define SVM_SELECTOR_CODE_MASK (1 << 3)
+
+#define INTERCEPT_CR0_READ 0
+#define INTERCEPT_CR3_READ 3
+#define INTERCEPT_CR4_READ 4
+#define INTERCEPT_CR8_READ 8
+#define INTERCEPT_CR0_WRITE (16 + 0)
+#define INTERCEPT_CR3_WRITE (16 + 3)
+#define INTERCEPT_CR4_WRITE (16 + 4)
+#define INTERCEPT_CR8_WRITE (16 + 8)
+
+#define INTERCEPT_DR0_READ 0
+#define INTERCEPT_DR1_READ 1
+#define INTERCEPT_DR2_READ 2
+#define INTERCEPT_DR3_READ 3
+#define INTERCEPT_DR4_READ 4
+#define INTERCEPT_DR5_READ 5
+#define INTERCEPT_DR6_READ 6
+#define INTERCEPT_DR7_READ 7
+#define INTERCEPT_DR0_WRITE (16 + 0)
+#define INTERCEPT_DR1_WRITE (16 + 1)
+#define INTERCEPT_DR2_WRITE (16 + 2)
+#define INTERCEPT_DR3_WRITE (16 + 3)
+#define INTERCEPT_DR4_WRITE (16 + 4)
+#define INTERCEPT_DR5_WRITE (16 + 5)
+#define INTERCEPT_DR6_WRITE (16 + 6)
+#define INTERCEPT_DR7_WRITE (16 + 7)
+
+#define SVM_EVTINJ_VEC_MASK 0xff
+
+#define SVM_EVTINJ_TYPE_SHIFT 8
+#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT)
+
+#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT)
+#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT)
+#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT)
+#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT)
+
+#define SVM_EVTINJ_VALID (1 << 31)
+#define SVM_EVTINJ_VALID_ERR (1 << 11)
+
+#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
+#define SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK
+
+#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
+#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
+#define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT
+#define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT
+
+#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID
+#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR
+
+#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36
+#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
+#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
+
+#define SVM_EXITINFO_REG_MASK 0x0F
+
+#define SVM_EXIT_READ_CR0 0x000
+#define SVM_EXIT_READ_CR3 0x003
+#define SVM_EXIT_READ_CR4 0x004
+#define SVM_EXIT_READ_CR8 0x008
+#define SVM_EXIT_WRITE_CR0 0x010
+#define SVM_EXIT_WRITE_CR3 0x013
+#define SVM_EXIT_WRITE_CR4 0x014
+#define SVM_EXIT_WRITE_CR8 0x018
+#define SVM_EXIT_READ_DR0 0x020
+#define SVM_EXIT_READ_DR1 0x021
+#define SVM_EXIT_READ_DR2 0x022
+#define SVM_EXIT_READ_DR3 0x023
+#define SVM_EXIT_READ_DR4 0x024
+#define SVM_EXIT_READ_DR5 0x025
+#define SVM_EXIT_READ_DR6 0x026
+#define SVM_EXIT_READ_DR7 0x027
+#define SVM_EXIT_WRITE_DR0 0x030
+#define SVM_EXIT_WRITE_DR1 0x031
+#define SVM_EXIT_WRITE_DR2 0x032
+#define SVM_EXIT_WRITE_DR3 0x033
+#define SVM_EXIT_WRITE_DR4 0x034
+#define SVM_EXIT_WRITE_DR5 0x035
+#define SVM_EXIT_WRITE_DR6 0x036
+#define SVM_EXIT_WRITE_DR7 0x037
+#define SVM_EXIT_EXCP_BASE 0x040
+#define SVM_EXIT_INTR 0x060
+#define SVM_EXIT_NMI 0x061
+#define SVM_EXIT_SMI 0x062
+#define SVM_EXIT_INIT 0x063
+#define SVM_EXIT_VINTR 0x064
+#define SVM_EXIT_CR0_SEL_WRITE 0x065
+#define SVM_EXIT_IDTR_READ 0x066
+#define SVM_EXIT_GDTR_READ 0x067
+#define SVM_EXIT_LDTR_READ 0x068
+#define SVM_EXIT_TR_READ 0x069
+#define SVM_EXIT_IDTR_WRITE 0x06a
+#define SVM_EXIT_GDTR_WRITE 0x06b
+#define SVM_EXIT_LDTR_WRITE 0x06c
+#define SVM_EXIT_TR_WRITE 0x06d
+#define SVM_EXIT_RDTSC 0x06e
+#define SVM_EXIT_RDPMC 0x06f
+#define SVM_EXIT_PUSHF 0x070
+#define SVM_EXIT_POPF 0x071
+#define SVM_EXIT_CPUID 0x072
+#define SVM_EXIT_RSM 0x073
+#define SVM_EXIT_IRET 0x074
+#define SVM_EXIT_SWINT 0x075
+#define SVM_EXIT_INVD 0x076
+#define SVM_EXIT_PAUSE 0x077
+#define SVM_EXIT_HLT 0x078
+#define SVM_EXIT_INVLPG 0x079
+#define SVM_EXIT_INVLPGA 0x07a
+#define SVM_EXIT_IOIO 0x07b
+#define SVM_EXIT_MSR 0x07c
+#define SVM_EXIT_TASK_SWITCH 0x07d
+#define SVM_EXIT_FERR_FREEZE 0x07e
+#define SVM_EXIT_SHUTDOWN 0x07f
+#define SVM_EXIT_VMRUN 0x080
+#define SVM_EXIT_VMMCALL 0x081
+#define SVM_EXIT_VMLOAD 0x082
+#define SVM_EXIT_VMSAVE 0x083
+#define SVM_EXIT_STGI 0x084
+#define SVM_EXIT_CLGI 0x085
+#define SVM_EXIT_SKINIT 0x086
+#define SVM_EXIT_RDTSCP 0x087
+#define SVM_EXIT_ICEBP 0x088
+#define SVM_EXIT_WBINVD 0x089
+#define SVM_EXIT_MONITOR 0x08a
+#define SVM_EXIT_MWAIT 0x08b
+#define SVM_EXIT_MWAIT_COND 0x08c
+#define SVM_EXIT_XSETBV 0x08d
+#define SVM_EXIT_NPF 0x400
+
+#define SVM_EXIT_ERR -1
+
+#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP)
+
+#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
+#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
+#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb"
+#define SVM_CLGI ".byte 0x0f, 0x01, 0xdd"
+#define SVM_STGI ".byte 0x0f, 0x01, 0xdc"
+#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf"
+
+#endif
+
diff --git a/linux/include/asm-x86/virtext.h b/linux/include/asm-x86/virtext.h
new file mode 100644
index 0000000..e6f17fb
--- /dev/null
+++ b/linux/include/asm-x86/virtext.h
@@ -0,0 +1,172 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+/* CPU virtualization extensions handling
+ *
+ * This should carry the code for handling CPU virtualization extensions
+ * that needs to live in the kernel core.
+ *
+ * Author: Eduardo Habkost <ehabkost@redhat.com>
+ *
+ * Copyright (C) 2008, Red Hat Inc.
+ *
+ * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+#ifndef _ASM_X86_VIRTEX_H
+#define _ASM_X86_VIRTEX_H
+
+#include <asm/processor.h>
+#include <asm/system.h>
+
+#include <asm/vmx.h>
+#include <asm/svm.h>
+
+/*
+ * VMX functions:
+ */
+
+static inline int cpu_has_vmx(void)
+{
+ unsigned long ecx = cpuid_ecx(1);
+ return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
+}
+
+
+/** Disable VMX on the current CPU
+ *
+ * vmxoff causes a undefined-opcode exception if vmxon was not run
+ * on the CPU previously. Only call this function if you know VMX
+ * is enabled.
+ */
+static inline void cpu_vmxoff(void)
+{
+ asm volatile (ASM_VMX_VMXOFF : : : "cc");
+ write_cr4(read_cr4() & ~X86_CR4_VMXE);
+}
+
+static inline int cpu_vmx_enabled(void)
+{
+ return read_cr4() & X86_CR4_VMXE;
+}
+
+/** Disable VMX if it is enabled on the current CPU
+ *
+ * You shouldn't call this if cpu_has_vmx() returns 0.
+ */
+static inline void __cpu_emergency_vmxoff(void)
+{
+ if (cpu_vmx_enabled())
+ cpu_vmxoff();
+}
+
+/** Disable VMX if it is supported and enabled on the current CPU
+ */
+static inline void cpu_emergency_vmxoff(void)
+{
+ if (cpu_has_vmx())
+ __cpu_emergency_vmxoff();
+}
+
+
+
+
+/*
+ * SVM functions:
+ */
+
+/** Check if the CPU has SVM support
+ *
+ * You can use the 'msg' arg to get a message describing the problem,
+ * if the function returns zero. Simply pass NULL if you are not interested
+ * on the messages; gcc should take care of not generating code for
+ * the messages on this case.
+ */
+static inline int cpu_has_svm(const char **msg)
+{
+ uint32_t eax, ebx, ecx, edx;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
+ if (msg)
+ *msg = "not amd";
+ return 0;
+ }
+
+ cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
+ if (eax < SVM_CPUID_FUNC) {
+ if (msg)
+ *msg = "can't execute cpuid_8000000a";
+ return 0;
+ }
+
+ cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+ if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
+ if (msg)
+ *msg = "svm not available";
+ return 0;
+ }
+ return 1;
+}
+
+
+/** Disable SVM on the current CPU
+ *
+ * You should call this only if cpu_has_svm() returned true.
+ */
+static inline void cpu_svm_disable(void)
+{
+ uint64_t efer;
+
+ wrmsrl(MSR_VM_HSAVE_PA, 0);
+ rdmsrl(MSR_EFER, efer);
+ wrmsrl(MSR_EFER, efer & ~EFER_SVME);
+}
+
+/** Makes sure SVM is disabled, if it is supported on the CPU
+ */
+static inline void cpu_emergency_svm_disable(void)
+{
+ if (cpu_has_svm(NULL))
+ cpu_svm_disable();
+}
+
+#endif /* _ASM_X86_VIRTEX_H */
diff --git a/linux/include/asm-x86/vmx.h b/linux/include/asm-x86/vmx.h
new file mode 100644
index 0000000..0174fa5
--- /dev/null
+++ b/linux/include/asm-x86/vmx.h
@@ -0,0 +1,469 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+#ifndef VMX_H
+#define VMX_H
+
+/*
+ * vmx.h: VMX Architecture related definitions
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * A few random additions are:
+ * Copyright (C) 2006 Qumranet
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ *
+ */
+
+#include <linux/types.h>
+
+/*
+ * Definitions of Primary Processor-Based VM-Execution Controls.
+ */
+#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004
+#define CPU_BASED_USE_TSC_OFFSETING 0x00000008
+#define CPU_BASED_HLT_EXITING 0x00000080
+#define CPU_BASED_INVLPG_EXITING 0x00000200
+#define CPU_BASED_MWAIT_EXITING 0x00000400
+#define CPU_BASED_RDPMC_EXITING 0x00000800
+#define CPU_BASED_RDTSC_EXITING 0x00001000
+#define CPU_BASED_CR3_LOAD_EXITING 0x00008000
+#define CPU_BASED_CR3_STORE_EXITING 0x00010000
+#define CPU_BASED_CR8_LOAD_EXITING 0x00080000
+#define CPU_BASED_CR8_STORE_EXITING 0x00100000
+#define CPU_BASED_TPR_SHADOW 0x00200000
+#define CPU_BASED_VIRTUAL_NMI_PENDING 0x00400000
+#define CPU_BASED_MOV_DR_EXITING 0x00800000
+#define CPU_BASED_UNCOND_IO_EXITING 0x01000000
+#define CPU_BASED_USE_IO_BITMAPS 0x02000000
+#define CPU_BASED_USE_MSR_BITMAPS 0x10000000
+#define CPU_BASED_MONITOR_EXITING 0x20000000
+#define CPU_BASED_PAUSE_EXITING 0x40000000
+#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000
+/*
+ * Definitions of Secondary Processor-Based VM-Execution Controls.
+ */
+#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
+#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
+#define SECONDARY_EXEC_RDTSCP 0x00000008
+#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
+#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
+#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
+#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
+
+
+#define PIN_BASED_EXT_INTR_MASK 0x00000001
+#define PIN_BASED_NMI_EXITING 0x00000008
+#define PIN_BASED_VIRTUAL_NMIS 0x00000020
+
+#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002
+#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
+#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000
+#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
+#define VM_EXIT_SAVE_IA32_PAT 0x00040000
+#define VM_EXIT_LOAD_IA32_PAT 0x00080000
+#define VM_EXIT_SAVE_IA32_EFER 0x00100000
+#define VM_EXIT_LOAD_IA32_EFER 0x00200000
+#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000
+
+#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002
+#define VM_ENTRY_IA32E_MODE 0x00000200
+#define VM_ENTRY_SMM 0x00000400
+#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
+#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000
+#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
+#define VM_ENTRY_LOAD_IA32_EFER 0x00008000
+
+/* VMCS Encodings */
+enum vmcs_field {
+ VIRTUAL_PROCESSOR_ID = 0x00000000,
+ GUEST_ES_SELECTOR = 0x00000800,
+ GUEST_CS_SELECTOR = 0x00000802,
+ GUEST_SS_SELECTOR = 0x00000804,
+ GUEST_DS_SELECTOR = 0x00000806,
+ GUEST_FS_SELECTOR = 0x00000808,
+ GUEST_GS_SELECTOR = 0x0000080a,
+ GUEST_LDTR_SELECTOR = 0x0000080c,
+ GUEST_TR_SELECTOR = 0x0000080e,
+ HOST_ES_SELECTOR = 0x00000c00,
+ HOST_CS_SELECTOR = 0x00000c02,
+ HOST_SS_SELECTOR = 0x00000c04,
+ HOST_DS_SELECTOR = 0x00000c06,
+ HOST_FS_SELECTOR = 0x00000c08,
+ HOST_GS_SELECTOR = 0x00000c0a,
+ HOST_TR_SELECTOR = 0x00000c0c,
+ IO_BITMAP_A = 0x00002000,
+ IO_BITMAP_A_HIGH = 0x00002001,
+ IO_BITMAP_B = 0x00002002,
+ IO_BITMAP_B_HIGH = 0x00002003,
+ MSR_BITMAP = 0x00002004,
+ MSR_BITMAP_HIGH = 0x00002005,
+ VM_EXIT_MSR_STORE_ADDR = 0x00002006,
+ VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007,
+ VM_EXIT_MSR_LOAD_ADDR = 0x00002008,
+ VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009,
+ VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a,
+ VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b,
+ TSC_OFFSET = 0x00002010,
+ TSC_OFFSET_HIGH = 0x00002011,
+ VIRTUAL_APIC_PAGE_ADDR = 0x00002012,
+ VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
+ APIC_ACCESS_ADDR = 0x00002014,
+ APIC_ACCESS_ADDR_HIGH = 0x00002015,
+ EPT_POINTER = 0x0000201a,
+ EPT_POINTER_HIGH = 0x0000201b,
+ GUEST_PHYSICAL_ADDRESS = 0x00002400,
+ GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
+ VMCS_LINK_POINTER = 0x00002800,
+ VMCS_LINK_POINTER_HIGH = 0x00002801,
+ GUEST_IA32_DEBUGCTL = 0x00002802,
+ GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
+ GUEST_IA32_PAT = 0x00002804,
+ GUEST_IA32_PAT_HIGH = 0x00002805,
+ GUEST_IA32_EFER = 0x00002806,
+ GUEST_IA32_EFER_HIGH = 0x00002807,
+ GUEST_PDPTR0 = 0x0000280a,
+ GUEST_PDPTR0_HIGH = 0x0000280b,
+ GUEST_PDPTR1 = 0x0000280c,
+ GUEST_PDPTR1_HIGH = 0x0000280d,
+ GUEST_PDPTR2 = 0x0000280e,
+ GUEST_PDPTR2_HIGH = 0x0000280f,
+ GUEST_PDPTR3 = 0x00002810,
+ GUEST_PDPTR3_HIGH = 0x00002811,
+ HOST_IA32_PAT = 0x00002c00,
+ HOST_IA32_PAT_HIGH = 0x00002c01,
+ HOST_IA32_EFER = 0x00002c02,
+ HOST_IA32_EFER_HIGH = 0x00002c03,
+ PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
+ CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
+ EXCEPTION_BITMAP = 0x00004004,
+ PAGE_FAULT_ERROR_CODE_MASK = 0x00004006,
+ PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008,
+ CR3_TARGET_COUNT = 0x0000400a,
+ VM_EXIT_CONTROLS = 0x0000400c,
+ VM_EXIT_MSR_STORE_COUNT = 0x0000400e,
+ VM_EXIT_MSR_LOAD_COUNT = 0x00004010,
+ VM_ENTRY_CONTROLS = 0x00004012,
+ VM_ENTRY_MSR_LOAD_COUNT = 0x00004014,
+ VM_ENTRY_INTR_INFO_FIELD = 0x00004016,
+ VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018,
+ VM_ENTRY_INSTRUCTION_LEN = 0x0000401a,
+ TPR_THRESHOLD = 0x0000401c,
+ SECONDARY_VM_EXEC_CONTROL = 0x0000401e,
+ PLE_GAP = 0x00004020,
+ PLE_WINDOW = 0x00004022,
+ VM_INSTRUCTION_ERROR = 0x00004400,
+ VM_EXIT_REASON = 0x00004402,
+ VM_EXIT_INTR_INFO = 0x00004404,
+ VM_EXIT_INTR_ERROR_CODE = 0x00004406,
+ IDT_VECTORING_INFO_FIELD = 0x00004408,
+ IDT_VECTORING_ERROR_CODE = 0x0000440a,
+ VM_EXIT_INSTRUCTION_LEN = 0x0000440c,
+ VMX_INSTRUCTION_INFO = 0x0000440e,
+ GUEST_ES_LIMIT = 0x00004800,
+ GUEST_CS_LIMIT = 0x00004802,
+ GUEST_SS_LIMIT = 0x00004804,
+ GUEST_DS_LIMIT = 0x00004806,
+ GUEST_FS_LIMIT = 0x00004808,
+ GUEST_GS_LIMIT = 0x0000480a,
+ GUEST_LDTR_LIMIT = 0x0000480c,
+ GUEST_TR_LIMIT = 0x0000480e,
+ GUEST_GDTR_LIMIT = 0x00004810,
+ GUEST_IDTR_LIMIT = 0x00004812,
+ GUEST_ES_AR_BYTES = 0x00004814,
+ GUEST_CS_AR_BYTES = 0x00004816,
+ GUEST_SS_AR_BYTES = 0x00004818,
+ GUEST_DS_AR_BYTES = 0x0000481a,
+ GUEST_FS_AR_BYTES = 0x0000481c,
+ GUEST_GS_AR_BYTES = 0x0000481e,
+ GUEST_LDTR_AR_BYTES = 0x00004820,
+ GUEST_TR_AR_BYTES = 0x00004822,
+ GUEST_INTERRUPTIBILITY_INFO = 0x00004824,
+ GUEST_ACTIVITY_STATE = 0X00004826,
+ GUEST_SYSENTER_CS = 0x0000482A,
+ HOST_IA32_SYSENTER_CS = 0x00004c00,
+ CR0_GUEST_HOST_MASK = 0x00006000,
+ CR4_GUEST_HOST_MASK = 0x00006002,
+ CR0_READ_SHADOW = 0x00006004,
+ CR4_READ_SHADOW = 0x00006006,
+ CR3_TARGET_VALUE0 = 0x00006008,
+ CR3_TARGET_VALUE1 = 0x0000600a,
+ CR3_TARGET_VALUE2 = 0x0000600c,
+ CR3_TARGET_VALUE3 = 0x0000600e,
+ EXIT_QUALIFICATION = 0x00006400,
+ GUEST_LINEAR_ADDRESS = 0x0000640a,
+ GUEST_CR0 = 0x00006800,
+ GUEST_CR3 = 0x00006802,
+ GUEST_CR4 = 0x00006804,
+ GUEST_ES_BASE = 0x00006806,
+ GUEST_CS_BASE = 0x00006808,
+ GUEST_SS_BASE = 0x0000680a,
+ GUEST_DS_BASE = 0x0000680c,
+ GUEST_FS_BASE = 0x0000680e,
+ GUEST_GS_BASE = 0x00006810,
+ GUEST_LDTR_BASE = 0x00006812,
+ GUEST_TR_BASE = 0x00006814,
+ GUEST_GDTR_BASE = 0x00006816,
+ GUEST_IDTR_BASE = 0x00006818,
+ GUEST_DR7 = 0x0000681a,
+ GUEST_RSP = 0x0000681c,
+ GUEST_RIP = 0x0000681e,
+ GUEST_RFLAGS = 0x00006820,
+ GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822,
+ GUEST_SYSENTER_ESP = 0x00006824,
+ GUEST_SYSENTER_EIP = 0x00006826,
+ HOST_CR0 = 0x00006c00,
+ HOST_CR3 = 0x00006c02,
+ HOST_CR4 = 0x00006c04,
+ HOST_FS_BASE = 0x00006c06,
+ HOST_GS_BASE = 0x00006c08,
+ HOST_TR_BASE = 0x00006c0a,
+ HOST_GDTR_BASE = 0x00006c0c,
+ HOST_IDTR_BASE = 0x00006c0e,
+ HOST_IA32_SYSENTER_ESP = 0x00006c10,
+ HOST_IA32_SYSENTER_EIP = 0x00006c12,
+ HOST_RSP = 0x00006c14,
+ HOST_RIP = 0x00006c16,
+};
+
+#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
+
+#define EXIT_REASON_EXCEPTION_NMI 0
+#define EXIT_REASON_EXTERNAL_INTERRUPT 1
+#define EXIT_REASON_TRIPLE_FAULT 2
+
+#define EXIT_REASON_PENDING_INTERRUPT 7
+#define EXIT_REASON_NMI_WINDOW 8
+#define EXIT_REASON_TASK_SWITCH 9
+#define EXIT_REASON_CPUID 10
+#define EXIT_REASON_HLT 12
+#define EXIT_REASON_INVD 13
+#define EXIT_REASON_INVLPG 14
+#define EXIT_REASON_RDPMC 15
+#define EXIT_REASON_RDTSC 16
+#define EXIT_REASON_VMCALL 18
+#define EXIT_REASON_VMCLEAR 19
+#define EXIT_REASON_VMLAUNCH 20
+#define EXIT_REASON_VMPTRLD 21
+#define EXIT_REASON_VMPTRST 22
+#define EXIT_REASON_VMREAD 23
+#define EXIT_REASON_VMRESUME 24
+#define EXIT_REASON_VMWRITE 25
+#define EXIT_REASON_VMOFF 26
+#define EXIT_REASON_VMON 27
+#define EXIT_REASON_CR_ACCESS 28
+#define EXIT_REASON_DR_ACCESS 29
+#define EXIT_REASON_IO_INSTRUCTION 30
+#define EXIT_REASON_MSR_READ 31
+#define EXIT_REASON_MSR_WRITE 32
+#define EXIT_REASON_INVALID_STATE 33
+#define EXIT_REASON_MWAIT_INSTRUCTION 36
+#define EXIT_REASON_MONITOR_INSTRUCTION 39
+#define EXIT_REASON_PAUSE_INSTRUCTION 40
+#define EXIT_REASON_MCE_DURING_VMENTRY 41
+#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
+#define EXIT_REASON_APIC_ACCESS 44
+#define EXIT_REASON_EPT_VIOLATION 48
+#define EXIT_REASON_EPT_MISCONFIG 49
+#define EXIT_REASON_WBINVD 54
+#define EXIT_REASON_XSETBV 55
+
+/*
+ * Interruption-information format
+ */
+#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */
+#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */
+#define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */
+#define INTR_INFO_UNBLOCK_NMI 0x1000 /* 12 */
+#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */
+#define INTR_INFO_RESVD_BITS_MASK 0x7ffff000
+
+#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK
+#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK
+#define VECTORING_INFO_DELIVER_CODE_MASK INTR_INFO_DELIVER_CODE_MASK
+#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK
+
+#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
+#define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */
+#define INTR_TYPE_HARD_EXCEPTION (3 << 8) /* processor exception */
+#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */
+#define INTR_TYPE_SOFT_EXCEPTION (6 << 8) /* software exception */
+
+/* GUEST_INTERRUPTIBILITY_INFO flags. */
+#define GUEST_INTR_STATE_STI 0x00000001
+#define GUEST_INTR_STATE_MOV_SS 0x00000002
+#define GUEST_INTR_STATE_SMI 0x00000004
+#define GUEST_INTR_STATE_NMI 0x00000008
+
+/* GUEST_ACTIVITY_STATE flags */
+#define GUEST_ACTIVITY_ACTIVE 0
+#define GUEST_ACTIVITY_HLT 1
+#define GUEST_ACTIVITY_SHUTDOWN 2
+#define GUEST_ACTIVITY_WAIT_SIPI 3
+
+/*
+ * Exit Qualifications for MOV for Control Register Access
+ */
+#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control reg.*/
+#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */
+#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose reg. */
+#define LMSW_SOURCE_DATA_SHIFT 16
+#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */
+#define REG_EAX (0 << 8)
+#define REG_ECX (1 << 8)
+#define REG_EDX (2 << 8)
+#define REG_EBX (3 << 8)
+#define REG_ESP (4 << 8)
+#define REG_EBP (5 << 8)
+#define REG_ESI (6 << 8)
+#define REG_EDI (7 << 8)
+#define REG_R8 (8 << 8)
+#define REG_R9 (9 << 8)
+#define REG_R10 (10 << 8)
+#define REG_R11 (11 << 8)
+#define REG_R12 (12 << 8)
+#define REG_R13 (13 << 8)
+#define REG_R14 (14 << 8)
+#define REG_R15 (15 << 8)
+
+/*
+ * Exit Qualifications for MOV for Debug Register Access
+ */
+#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug reg. */
+#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */
+#define TYPE_MOV_TO_DR (0 << 4)
+#define TYPE_MOV_FROM_DR (1 << 4)
+#define DEBUG_REG_ACCESS_REG(eq) (((eq) >> 8) & 0xf) /* 11:8, general purpose reg. */
+
+
+/* segment AR */
+#define SEGMENT_AR_L_MASK (1 << 13)
+
+#define AR_TYPE_ACCESSES_MASK 1
+#define AR_TYPE_READABLE_MASK (1 << 1)
+#define AR_TYPE_WRITEABLE_MASK (1 << 2)
+#define AR_TYPE_CODE_MASK (1 << 3)
+#define AR_TYPE_MASK 0x0f
+#define AR_TYPE_BUSY_64_TSS 11
+#define AR_TYPE_BUSY_32_TSS 11
+#define AR_TYPE_BUSY_16_TSS 3
+#define AR_TYPE_LDT 2
+
+#define AR_UNUSABLE_MASK (1 << 16)
+#define AR_S_MASK (1 << 4)
+#define AR_P_MASK (1 << 7)
+#define AR_L_MASK (1 << 13)
+#define AR_DB_MASK (1 << 14)
+#define AR_G_MASK (1 << 15)
+#define AR_DPL_SHIFT 5
+#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3)
+
+#define AR_RESERVD_MASK 0xfffe0f00
+
+#define TSS_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 0)
+#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 1)
+#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 2)
+
+#define VMX_NR_VPIDS (1 << 16)
+#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1
+#define VMX_VPID_EXTENT_ALL_CONTEXT 2
+
+#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0
+#define VMX_EPT_EXTENT_CONTEXT 1
+#define VMX_EPT_EXTENT_GLOBAL 2
+
+#define VMX_EPT_EXECUTE_ONLY_BIT (1ull)
+#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6)
+#define VMX_EPTP_UC_BIT (1ull << 8)
+#define VMX_EPTP_WB_BIT (1ull << 14)
+#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
+#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
+#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
+#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
+#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
+
+#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */
+#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */
+
+#define VMX_EPT_DEFAULT_GAW 3
+#define VMX_EPT_MAX_GAW 0x4
+#define VMX_EPT_MT_EPTE_SHIFT 3
+#define VMX_EPT_GAW_EPTP_SHIFT 3
+#define VMX_EPT_DEFAULT_MT 0x6ull
+#define VMX_EPT_READABLE_MASK 0x1ull
+#define VMX_EPT_WRITABLE_MASK 0x2ull
+#define VMX_EPT_EXECUTABLE_MASK 0x4ull
+#define VMX_EPT_IPAT_BIT (1ull << 6)
+
+#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
+
+
+#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
+#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
+#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30"
+#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0"
+#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0"
+#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
+#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
+#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
+#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
+
+struct vmx_msr_entry {
+ u32 index;
+ u32 reserved;
+ u64 value;
+} __aligned(16);
+
+#endif
diff --git a/linux/include/linux/kvm.h b/linux/include/linux/kvm.h
new file mode 100644
index 0000000..88fccc0
--- /dev/null
+++ b/linux/include/linux/kvm.h
@@ -0,0 +1,838 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+#ifndef __LINUX_KVM_H
+#define __LINUX_KVM_H
+
+/*
+ * Userspace interface for /dev/kvm - kernel based virtual machine
+ *
+ * Note: you must update KVM_API_VERSION if you change this interface.
+ */
+
+#include <asm/types.h>
+
+#include <linux/ioctl.h>
+#include <asm/kvm.h>
+
+#define KVM_API_VERSION 12
+
+/* *** Deprecated interfaces *** */
+
+#define KVM_TRC_SHIFT 16
+
+#define KVM_TRC_ENTRYEXIT (1 << KVM_TRC_SHIFT)
+#define KVM_TRC_HANDLER (1 << (KVM_TRC_SHIFT + 1))
+
+#define KVM_TRC_VMENTRY (KVM_TRC_ENTRYEXIT + 0x01)
+#define KVM_TRC_VMEXIT (KVM_TRC_ENTRYEXIT + 0x02)
+#define KVM_TRC_PAGE_FAULT (KVM_TRC_HANDLER + 0x01)
+
+#define KVM_TRC_HEAD_SIZE 12
+#define KVM_TRC_CYCLE_SIZE 8
+#define KVM_TRC_EXTRA_MAX 7
+
+#define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02)
+#define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03)
+#define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04)
+#define KVM_TRC_IO_READ (KVM_TRC_HANDLER + 0x05)
+#define KVM_TRC_IO_WRITE (KVM_TRC_HANDLER + 0x06)
+#define KVM_TRC_CR_READ (KVM_TRC_HANDLER + 0x07)
+#define KVM_TRC_CR_WRITE (KVM_TRC_HANDLER + 0x08)
+#define KVM_TRC_DR_READ (KVM_TRC_HANDLER + 0x09)
+#define KVM_TRC_DR_WRITE (KVM_TRC_HANDLER + 0x0A)
+#define KVM_TRC_MSR_READ (KVM_TRC_HANDLER + 0x0B)
+#define KVM_TRC_MSR_WRITE (KVM_TRC_HANDLER + 0x0C)
+#define KVM_TRC_CPUID (KVM_TRC_HANDLER + 0x0D)
+#define KVM_TRC_INTR (KVM_TRC_HANDLER + 0x0E)
+#define KVM_TRC_NMI (KVM_TRC_HANDLER + 0x0F)
+#define KVM_TRC_VMMCALL (KVM_TRC_HANDLER + 0x10)
+#define KVM_TRC_HLT (KVM_TRC_HANDLER + 0x11)
+#define KVM_TRC_CLTS (KVM_TRC_HANDLER + 0x12)
+#define KVM_TRC_LMSW (KVM_TRC_HANDLER + 0x13)
+#define KVM_TRC_APIC_ACCESS (KVM_TRC_HANDLER + 0x14)
+#define KVM_TRC_TDP_FAULT (KVM_TRC_HANDLER + 0x15)
+#define KVM_TRC_GTLB_WRITE (KVM_TRC_HANDLER + 0x16)
+#define KVM_TRC_STLB_WRITE (KVM_TRC_HANDLER + 0x17)
+#define KVM_TRC_STLB_INVAL (KVM_TRC_HANDLER + 0x18)
+#define KVM_TRC_PPC_INSTR (KVM_TRC_HANDLER + 0x19)
+
+struct kvm_user_trace_setup {
+ __u32 buf_size;
+ __u32 buf_nr;
+};
+
+#define __KVM_DEPRECATED_MAIN_W_0x06 \
+ _IOW(KVMIO, 0x06, struct kvm_user_trace_setup)
+#define __KVM_DEPRECATED_MAIN_0x07 _IO(KVMIO, 0x07)
+#define __KVM_DEPRECATED_MAIN_0x08 _IO(KVMIO, 0x08)
+
+#define __KVM_DEPRECATED_VM_R_0x70 _IOR(KVMIO, 0x70, struct kvm_assigned_irq)
+
+struct kvm_breakpoint {
+ __u32 enabled;
+ __u32 padding;
+ __u64 address;
+};
+
+struct kvm_debug_guest {
+ __u32 enabled;
+ __u32 pad;
+ struct kvm_breakpoint breakpoints[4];
+ __u32 singlestep;
+};
+
+#define __KVM_DEPRECATED_VCPU_W_0x87 _IOW(KVMIO, 0x87, struct kvm_debug_guest)
+
+/* *** End of deprecated interfaces *** */
+
+
+/* for KVM_CREATE_MEMORY_REGION */
+struct kvm_memory_region {
+ __u32 slot;
+ __u32 flags;
+ __u64 guest_phys_addr;
+ __u64 memory_size; /* bytes */
+};
+
+/* for KVM_SET_USER_MEMORY_REGION */
+struct kvm_userspace_memory_region {
+ __u32 slot;
+ __u32 flags;
+ __u64 guest_phys_addr;
+ __u64 memory_size; /* bytes */
+ __u64 userspace_addr; /* start of the userspace allocated memory */
+};
+
+/* for kvm_memory_region::flags */
+#define KVM_MEM_LOG_DIRTY_PAGES 1UL
+#define KVM_MEMSLOT_INVALID (1UL << 1)
+
+/* for KVM_IRQ_LINE */
+struct kvm_irq_level {
+ /*
+ * ACPI gsi notion of irq.
+ * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47..
+ * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23..
+ */
+ union {
+ __u32 irq;
+ __s32 status;
+ };
+ __u32 level;
+};
+
+
+struct kvm_irqchip {
+ __u32 chip_id;
+ __u32 pad;
+ union {
+ char dummy[512]; /* reserving space */
+#ifdef __KVM_HAVE_PIT
+ struct kvm_pic_state pic;
+#endif
+#ifdef __KVM_HAVE_IOAPIC
+ struct kvm_ioapic_state ioapic;
+#endif
+ } chip;
+};
+
+/* for KVM_CREATE_PIT2 */
+struct kvm_pit_config {
+ __u32 flags;
+ __u32 pad[15];
+};
+
+#define KVM_PIT_SPEAKER_DUMMY 1
+
+#define KVM_EXIT_UNKNOWN 0
+#define KVM_EXIT_EXCEPTION 1
+#define KVM_EXIT_IO 2
+#define KVM_EXIT_HYPERCALL 3
+#define KVM_EXIT_DEBUG 4
+#define KVM_EXIT_HLT 5
+#define KVM_EXIT_MMIO 6
+#define KVM_EXIT_IRQ_WINDOW_OPEN 7
+#define KVM_EXIT_SHUTDOWN 8
+#define KVM_EXIT_FAIL_ENTRY 9
+#define KVM_EXIT_INTR 10
+#define KVM_EXIT_SET_TPR 11
+#define KVM_EXIT_TPR_ACCESS 12
+#define KVM_EXIT_S390_SIEIC 13
+#define KVM_EXIT_S390_RESET 14
+#define KVM_EXIT_DCR 15
+#define KVM_EXIT_NMI 16
+#define KVM_EXIT_INTERNAL_ERROR 17
+#define KVM_EXIT_OSI 18
+
+/* For KVM_EXIT_INTERNAL_ERROR */
+#define KVM_INTERNAL_ERROR_EMULATION 1
+#define KVM_INTERNAL_ERROR_SIMUL_EX 2
+
+/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
+struct kvm_run {
+ /* in */
+ __u8 request_interrupt_window;
+ __u8 padding1[7];
+
+ /* out */
+ __u32 exit_reason;
+ __u8 ready_for_interrupt_injection;
+ __u8 if_flag;
+ __u8 padding2[2];
+
+ /* in (pre_kvm_run), out (post_kvm_run) */
+ __u64 cr8;
+ __u64 apic_base;
+
+#ifdef __KVM_S390
+ /* the processor status word for s390 */
+ __u64 psw_mask; /* psw upper half */
+ __u64 psw_addr; /* psw lower half */
+#endif
+ union {
+ /* KVM_EXIT_UNKNOWN */
+ struct {
+ __u64 hardware_exit_reason;
+ } hw;
+ /* KVM_EXIT_FAIL_ENTRY */
+ struct {
+ __u64 hardware_entry_failure_reason;
+ } fail_entry;
+ /* KVM_EXIT_EXCEPTION */
+ struct {
+ __u32 exception;
+ __u32 error_code;
+ } ex;
+ /* KVM_EXIT_IO */
+ struct {
+#define KVM_EXIT_IO_IN 0
+#define KVM_EXIT_IO_OUT 1
+ __u8 direction;
+ __u8 size; /* bytes */
+ __u16 port;
+ __u32 count;
+ __u64 data_offset; /* relative to kvm_run start */
+ } io;
+ struct {
+ struct kvm_debug_exit_arch arch;
+ } debug;
+ /* KVM_EXIT_MMIO */
+ struct {
+ __u64 phys_addr;
+ __u8 data[8];
+ __u32 len;
+ __u8 is_write;
+ } mmio;
+ /* KVM_EXIT_HYPERCALL */
+ struct {
+ __u64 nr;
+ __u64 args[6];
+ __u64 ret;
+ __u32 longmode;
+ __u32 pad;
+ } hypercall;
+ /* KVM_EXIT_TPR_ACCESS */
+ struct {
+ __u64 rip;
+ __u32 is_write;
+ __u32 pad;
+ } tpr_access;
+ /* KVM_EXIT_S390_SIEIC */
+ struct {
+ __u8 icptcode;
+ __u16 ipa;
+ __u32 ipb;
+ } s390_sieic;
+ /* KVM_EXIT_S390_RESET */
+#define KVM_S390_RESET_POR 1
+#define KVM_S390_RESET_CLEAR 2
+#define KVM_S390_RESET_SUBSYSTEM 4
+#define KVM_S390_RESET_CPU_INIT 8
+#define KVM_S390_RESET_IPL 16
+ __u64 s390_reset_flags;
+ /* KVM_EXIT_DCR */
+ struct {
+ __u32 dcrn;
+ __u32 data;
+ __u8 is_write;
+ } dcr;
+ struct {
+ __u32 suberror;
+ /* Available with KVM_CAP_INTERNAL_ERROR_DATA: */
+ __u32 ndata;
+ __u64 data[16];
+ } internal;
+ /* KVM_EXIT_OSI */
+ struct {
+ __u64 gprs[32];
+ } osi;
+ /* Fix the size of the union. */
+ char padding[256];
+ };
+};
+
+/* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */
+
+struct kvm_coalesced_mmio_zone {
+ __u64 addr;
+ __u32 size;
+ __u32 pad;
+};
+
+struct kvm_coalesced_mmio {
+ __u64 phys_addr;
+ __u32 len;
+ __u32 pad;
+ __u8 data[8];
+};
+
+struct kvm_coalesced_mmio_ring {
+ __u32 first, last;
+ struct kvm_coalesced_mmio coalesced_mmio[0];
+};
+
+#define KVM_COALESCED_MMIO_MAX \
+ ((PAGE_SIZE - sizeof(struct kvm_coalesced_mmio_ring)) / \
+ sizeof(struct kvm_coalesced_mmio))
+
+/* for KVM_TRANSLATE */
+struct kvm_translation {
+ /* in */
+ __u64 linear_address;
+
+ /* out */
+ __u64 physical_address;
+ __u8 valid;
+ __u8 writeable;
+ __u8 usermode;
+ __u8 pad[5];
+};
+
+/* for KVM_INTERRUPT */
+struct kvm_interrupt {
+ /* in */
+ __u32 irq;
+};
+
+/* for KVM_GET_DIRTY_LOG */
+struct kvm_dirty_log {
+ __u32 slot;
+ __u32 padding1;
+ union {
+ void *dirty_bitmap; /* one bit per page */
+ __u64 padding2;
+ };
+};
+
+/* for KVM_SET_SIGNAL_MASK */
+struct kvm_signal_mask {
+ __u32 len;
+ __u8 sigset[0];
+};
+
+/* for KVM_TPR_ACCESS_REPORTING */
+struct kvm_tpr_access_ctl {
+ __u32 enabled;
+ __u32 flags;
+ __u32 reserved[8];
+};
+
+/* for KVM_SET_VAPIC_ADDR */
+struct kvm_vapic_addr {
+ __u64 vapic_addr;
+};
+
+/* for KVM_SET_MPSTATE */
+
+#define KVM_MP_STATE_RUNNABLE 0
+#define KVM_MP_STATE_UNINITIALIZED 1
+#define KVM_MP_STATE_INIT_RECEIVED 2
+#define KVM_MP_STATE_HALTED 3
+#define KVM_MP_STATE_SIPI_RECEIVED 4
+
+struct kvm_mp_state {
+ __u32 mp_state;
+};
+
+struct kvm_s390_psw {
+ __u64 mask;
+ __u64 addr;
+};
+
+/* valid values for type in kvm_s390_interrupt */
+#define KVM_S390_SIGP_STOP 0xfffe0000u
+#define KVM_S390_PROGRAM_INT 0xfffe0001u
+#define KVM_S390_SIGP_SET_PREFIX 0xfffe0002u
+#define KVM_S390_RESTART 0xfffe0003u
+#define KVM_S390_INT_VIRTIO 0xffff2603u
+#define KVM_S390_INT_SERVICE 0xffff2401u
+#define KVM_S390_INT_EMERGENCY 0xffff1201u
+
+struct kvm_s390_interrupt {
+ __u32 type;
+ __u32 parm;
+ __u64 parm64;
+};
+
+/* for KVM_SET_GUEST_DEBUG */
+
+#define KVM_GUESTDBG_ENABLE 0x00000001
+#define KVM_GUESTDBG_SINGLESTEP 0x00000002
+
+struct kvm_guest_debug {
+ __u32 control;
+ __u32 pad;
+ struct kvm_guest_debug_arch arch;
+};
+
+enum {
+ kvm_ioeventfd_flag_nr_datamatch,
+ kvm_ioeventfd_flag_nr_pio,
+ kvm_ioeventfd_flag_nr_deassign,
+ kvm_ioeventfd_flag_nr_max,
+};
+
+#define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
+#define KVM_IOEVENTFD_FLAG_PIO (1 << kvm_ioeventfd_flag_nr_pio)
+#define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign)
+
+#define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1)
+
+struct kvm_ioeventfd {
+ __u64 datamatch;
+ __u64 addr; /* legal pio/mmio address */
+ __u32 len; /* 1, 2, 4, or 8 bytes */
+ __s32 fd;
+ __u32 flags;
+ __u8 pad[36];
+};
+
+/* for KVM_ENABLE_CAP */
+struct kvm_enable_cap {
+ /* in */
+ __u32 cap;
+ __u32 flags;
+ __u64 args[4];
+ __u8 pad[64];
+};
+
+/* for KVM_PPC_GET_PVINFO */
+struct kvm_ppc_pvinfo {
+ /* out */
+ __u32 flags;
+ __u32 hcall[4];
+ __u8 pad[108];
+};
+
+#define KVMIO 0xAE
+
+/*
+ * ioctls for /dev/kvm fds:
+ */
+#define KVM_GET_API_VERSION _IO(KVMIO, 0x00)
+#define KVM_CREATE_VM _IO(KVMIO, 0x01) /* returns a VM fd */
+#define KVM_GET_MSR_INDEX_LIST _IOWR(KVMIO, 0x02, struct kvm_msr_list)
+
+#define KVM_S390_ENABLE_SIE _IO(KVMIO, 0x06)
+/*
+ * Check if a kvm extension is available. Argument is extension number,
+ * return is 1 (yes) or 0 (no, sorry).
+ */
+#define KVM_CHECK_EXTENSION _IO(KVMIO, 0x03)
+/*
+ * Get size for mmap(vcpu_fd)
+ */
+#define KVM_GET_VCPU_MMAP_SIZE _IO(KVMIO, 0x04) /* in bytes */
+#define KVM_GET_SUPPORTED_CPUID _IOWR(KVMIO, 0x05, struct kvm_cpuid2)
+#define KVM_TRACE_ENABLE __KVM_DEPRECATED_MAIN_W_0x06
+#define KVM_TRACE_PAUSE __KVM_DEPRECATED_MAIN_0x07
+#define KVM_TRACE_DISABLE __KVM_DEPRECATED_MAIN_0x08
+
+/*
+ * Extension capability list.
+ */
+#define KVM_CAP_IRQCHIP 0
+#define KVM_CAP_HLT 1
+#define KVM_CAP_MMU_SHADOW_CACHE_CONTROL 2
+#define KVM_CAP_USER_MEMORY 3
+#define KVM_CAP_SET_TSS_ADDR 4
+#define KVM_CAP_VAPIC 6
+#define KVM_CAP_EXT_CPUID 7
+#define KVM_CAP_CLOCKSOURCE 8
+#define KVM_CAP_NR_VCPUS 9 /* returns max vcpus per vm */
+#define KVM_CAP_NR_MEMSLOTS 10 /* returns max memory slots per vm */
+#define KVM_CAP_PIT 11
+#define KVM_CAP_NOP_IO_DELAY 12
+#define KVM_CAP_PV_MMU 13
+#define KVM_CAP_MP_STATE 14
+#define KVM_CAP_COALESCED_MMIO 15
+#define KVM_CAP_SYNC_MMU 16 /* Changes to host mmap are reflected in guest */
+#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT
+#define KVM_CAP_DEVICE_ASSIGNMENT 17
+#endif
+#define KVM_CAP_IOMMU 18
+#ifdef __KVM_HAVE_MSI
+#define KVM_CAP_DEVICE_MSI 20
+#endif
+/* Bug in KVM_SET_USER_MEMORY_REGION fixed: */
+#define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21
+#ifdef __KVM_HAVE_USER_NMI
+#define KVM_CAP_USER_NMI 22
+#endif
+#ifdef __KVM_HAVE_GUEST_DEBUG
+#define KVM_CAP_SET_GUEST_DEBUG 23
+#endif
+#ifdef __KVM_HAVE_PIT
+#define KVM_CAP_REINJECT_CONTROL 24
+#endif
+#ifdef __KVM_HAVE_IOAPIC
+#define KVM_CAP_IRQ_ROUTING 25
+#endif
+#define KVM_CAP_IRQ_INJECT_STATUS 26
+#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT
+#define KVM_CAP_DEVICE_DEASSIGNMENT 27
+#endif
+#ifdef __KVM_HAVE_MSIX
+#define KVM_CAP_DEVICE_MSIX 28
+#endif
+#define KVM_CAP_ASSIGN_DEV_IRQ 29
+/* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */
+#define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30
+#ifdef __KVM_HAVE_MCE
+#define KVM_CAP_MCE 31
+#endif
+#define KVM_CAP_IRQFD 32
+#ifdef __KVM_HAVE_PIT
+#define KVM_CAP_PIT2 33
+#endif
+#define KVM_CAP_SET_BOOT_CPU_ID 34
+#ifdef __KVM_HAVE_PIT_STATE2
+#define KVM_CAP_PIT_STATE2 35
+#endif
+#define KVM_CAP_IOEVENTFD 36
+#define KVM_CAP_SET_IDENTITY_MAP_ADDR 37
+#ifdef __KVM_HAVE_XEN_HVM
+#define KVM_CAP_XEN_HVM 38
+#endif
+#define KVM_CAP_ADJUST_CLOCK 39
+#define KVM_CAP_INTERNAL_ERROR_DATA 40
+#ifdef __KVM_HAVE_VCPU_EVENTS
+#define KVM_CAP_VCPU_EVENTS 41
+#endif
+#define KVM_CAP_S390_PSW 42
+#define KVM_CAP_PPC_SEGSTATE 43
+#define KVM_CAP_HYPERV 44
+#define KVM_CAP_HYPERV_VAPIC 45
+#define KVM_CAP_HYPERV_SPIN 46
+#define KVM_CAP_PCI_SEGMENT 47
+#define KVM_CAP_PPC_PAIRED_SINGLES 48
+#define KVM_CAP_INTR_SHADOW 49
+#ifdef __KVM_HAVE_DEBUGREGS
+#define KVM_CAP_DEBUGREGS 50
+#endif
+#define KVM_CAP_X86_ROBUST_SINGLESTEP 51
+#define KVM_CAP_PPC_OSI 52
+#define KVM_CAP_PPC_UNSET_IRQ 53
+#define KVM_CAP_ENABLE_CAP 54
+#ifdef __KVM_HAVE_XSAVE
+#define KVM_CAP_XSAVE 55
+#endif
+#ifdef __KVM_HAVE_XCRS
+#define KVM_CAP_XCRS 56
+#endif
+#define KVM_CAP_PPC_GET_PVINFO 57
+#define KVM_CAP_PPC_IRQ_LEVEL 58
+#define KVM_CAP_ASYNC_PF 59
+
+#ifdef KVM_CAP_IRQ_ROUTING
+
+struct kvm_irq_routing_irqchip {
+ __u32 irqchip;
+ __u32 pin;
+};
+
+struct kvm_irq_routing_msi {
+ __u32 address_lo;
+ __u32 address_hi;
+ __u32 data;
+ __u32 pad;
+};
+
+/* gsi routing entry types */
+#define KVM_IRQ_ROUTING_IRQCHIP 1
+#define KVM_IRQ_ROUTING_MSI 2
+
+struct kvm_irq_routing_entry {
+ __u32 gsi;
+ __u32 type;
+ __u32 flags;
+ __u32 pad;
+ union {
+ struct kvm_irq_routing_irqchip irqchip;
+ struct kvm_irq_routing_msi msi;
+ __u32 pad[8];
+ } u;
+};
+
+struct kvm_irq_routing {
+ __u32 nr;
+ __u32 flags;
+ struct kvm_irq_routing_entry entries[0];
+};
+
+#endif
+
+#ifdef KVM_CAP_MCE
+/* x86 MCE */
+struct kvm_x86_mce {
+ __u64 status;
+ __u64 addr;
+ __u64 misc;
+ __u64 mcg_status;
+ __u8 bank;
+ __u8 pad1[7];
+ __u64 pad2[3];
+};
+#endif
+
+#ifdef KVM_CAP_XEN_HVM
+struct kvm_xen_hvm_config {
+ __u32 flags;
+ __u32 msr;
+ __u64 blob_addr_32;
+ __u64 blob_addr_64;
+ __u8 blob_size_32;
+ __u8 blob_size_64;
+ __u8 pad2[30];
+};
+#endif
+
+#define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
+
+struct kvm_irqfd {
+ __u32 fd;
+ __u32 gsi;
+ __u32 flags;
+ __u8 pad[20];
+};
+
+struct kvm_clock_data {
+ __u64 clock;
+ __u32 flags;
+ __u32 pad[9];
+};
+
+/*
+ * ioctls for VM fds
+ */
+#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region)
+/*
+ * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns
+ * a vcpu fd.
+ */
+#define KVM_CREATE_VCPU _IO(KVMIO, 0x41)
+#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log)
+/* KVM_SET_MEMORY_ALIAS is obsolete: */
+#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias)
+#define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44)
+#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45)
+#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46, \
+ struct kvm_userspace_memory_region)
+#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47)
+#define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64)
+/* Device model IOC */
+#define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60)
+#define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level)
+#define KVM_GET_IRQCHIP _IOWR(KVMIO, 0x62, struct kvm_irqchip)
+#define KVM_SET_IRQCHIP _IOR(KVMIO, 0x63, struct kvm_irqchip)
+#define KVM_CREATE_PIT _IO(KVMIO, 0x64)
+#define KVM_GET_PIT _IOWR(KVMIO, 0x65, struct kvm_pit_state)
+#define KVM_SET_PIT _IOR(KVMIO, 0x66, struct kvm_pit_state)
+#define KVM_IRQ_LINE_STATUS _IOWR(KVMIO, 0x67, struct kvm_irq_level)
+#define KVM_REGISTER_COALESCED_MMIO \
+ _IOW(KVMIO, 0x67, struct kvm_coalesced_mmio_zone)
+#define KVM_UNREGISTER_COALESCED_MMIO \
+ _IOW(KVMIO, 0x68, struct kvm_coalesced_mmio_zone)
+#define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \
+ struct kvm_assigned_pci_dev)
+#define KVM_SET_GSI_ROUTING _IOW(KVMIO, 0x6a, struct kvm_irq_routing)
+/* deprecated, replaced by KVM_ASSIGN_DEV_IRQ */
+#define KVM_ASSIGN_IRQ __KVM_DEPRECATED_VM_R_0x70
+#define KVM_ASSIGN_DEV_IRQ _IOW(KVMIO, 0x70, struct kvm_assigned_irq)
+#define KVM_REINJECT_CONTROL _IO(KVMIO, 0x71)
+#define KVM_DEASSIGN_PCI_DEVICE _IOW(KVMIO, 0x72, \
+ struct kvm_assigned_pci_dev)
+#define KVM_ASSIGN_SET_MSIX_NR _IOW(KVMIO, 0x73, \
+ struct kvm_assigned_msix_nr)
+#define KVM_ASSIGN_SET_MSIX_ENTRY _IOW(KVMIO, 0x74, \
+ struct kvm_assigned_msix_entry)
+#define KVM_DEASSIGN_DEV_IRQ _IOW(KVMIO, 0x75, struct kvm_assigned_irq)
+#define KVM_IRQFD _IOW(KVMIO, 0x76, struct kvm_irqfd)
+#define KVM_CREATE_PIT2 _IOW(KVMIO, 0x77, struct kvm_pit_config)
+#define KVM_SET_BOOT_CPU_ID _IO(KVMIO, 0x78)
+#define KVM_IOEVENTFD _IOW(KVMIO, 0x79, struct kvm_ioeventfd)
+#define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config)
+#define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data)
+#define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data)
+/* Available with KVM_CAP_PIT_STATE2 */
+#define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2)
+#define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2)
+/* Available with KVM_CAP_PPC_GET_PVINFO */
+#define KVM_PPC_GET_PVINFO _IOW(KVMIO, 0xa1, struct kvm_ppc_pvinfo)
+
+/*
+ * ioctls for vcpu fds
+ */
+#define KVM_RUN _IO(KVMIO, 0x80)
+#define KVM_GET_REGS _IOR(KVMIO, 0x81, struct kvm_regs)
+#define KVM_SET_REGS _IOW(KVMIO, 0x82, struct kvm_regs)
+#define KVM_GET_SREGS _IOR(KVMIO, 0x83, struct kvm_sregs)
+#define KVM_SET_SREGS _IOW(KVMIO, 0x84, struct kvm_sregs)
+#define KVM_TRANSLATE _IOWR(KVMIO, 0x85, struct kvm_translation)
+#define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt)
+/* KVM_DEBUG_GUEST is no longer supported, use KVM_SET_GUEST_DEBUG instead */
+#define KVM_DEBUG_GUEST __KVM_DEPRECATED_VCPU_W_0x87
+#define KVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs)
+#define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs)
+#define KVM_SET_CPUID _IOW(KVMIO, 0x8a, struct kvm_cpuid)
+#define KVM_SET_SIGNAL_MASK _IOW(KVMIO, 0x8b, struct kvm_signal_mask)
+#define KVM_GET_FPU _IOR(KVMIO, 0x8c, struct kvm_fpu)
+#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu)
+#define KVM_GET_LAPIC _IOR(KVMIO, 0x8e, struct kvm_lapic_state)
+#define KVM_SET_LAPIC _IOW(KVMIO, 0x8f, struct kvm_lapic_state)
+#define KVM_SET_CPUID2 _IOW(KVMIO, 0x90, struct kvm_cpuid2)
+#define KVM_GET_CPUID2 _IOWR(KVMIO, 0x91, struct kvm_cpuid2)
+/* Available with KVM_CAP_VAPIC */
+#define KVM_TPR_ACCESS_REPORTING _IOWR(KVMIO, 0x92, struct kvm_tpr_access_ctl)
+/* Available with KVM_CAP_VAPIC */
+#define KVM_SET_VAPIC_ADDR _IOW(KVMIO, 0x93, struct kvm_vapic_addr)
+/* valid for virtual machine (for floating interrupt)_and_ vcpu */
+#define KVM_S390_INTERRUPT _IOW(KVMIO, 0x94, struct kvm_s390_interrupt)
+/* store status for s390 */
+#define KVM_S390_STORE_STATUS_NOADDR (-1ul)
+#define KVM_S390_STORE_STATUS_PREFIXED (-2ul)
+#define KVM_S390_STORE_STATUS _IOW(KVMIO, 0x95, unsigned long)
+/* initial ipl psw for s390 */
+#define KVM_S390_SET_INITIAL_PSW _IOW(KVMIO, 0x96, struct kvm_s390_psw)
+/* initial reset for s390 */
+#define KVM_S390_INITIAL_RESET _IO(KVMIO, 0x97)
+#define KVM_GET_MP_STATE _IOR(KVMIO, 0x98, struct kvm_mp_state)
+#define KVM_SET_MP_STATE _IOW(KVMIO, 0x99, struct kvm_mp_state)
+/* Available with KVM_CAP_NMI */
+#define KVM_NMI _IO(KVMIO, 0x9a)
+/* Available with KVM_CAP_SET_GUEST_DEBUG */
+#define KVM_SET_GUEST_DEBUG _IOW(KVMIO, 0x9b, struct kvm_guest_debug)
+/* MCE for x86 */
+#define KVM_X86_SETUP_MCE _IOW(KVMIO, 0x9c, __u64)
+#define KVM_X86_GET_MCE_CAP_SUPPORTED _IOR(KVMIO, 0x9d, __u64)
+#define KVM_X86_SET_MCE _IOW(KVMIO, 0x9e, struct kvm_x86_mce)
+/* IA64 stack access */
+#define KVM_IA64_VCPU_GET_STACK _IOR(KVMIO, 0x9a, void *)
+#define KVM_IA64_VCPU_SET_STACK _IOW(KVMIO, 0x9b, void *)
+/* Available with KVM_CAP_VCPU_EVENTS */
+#define KVM_GET_VCPU_EVENTS _IOR(KVMIO, 0x9f, struct kvm_vcpu_events)
+#define KVM_SET_VCPU_EVENTS _IOW(KVMIO, 0xa0, struct kvm_vcpu_events)
+/* Available with KVM_CAP_DEBUGREGS */
+#define KVM_GET_DEBUGREGS _IOR(KVMIO, 0xa1, struct kvm_debugregs)
+#define KVM_SET_DEBUGREGS _IOW(KVMIO, 0xa2, struct kvm_debugregs)
+#define KVM_ENABLE_CAP _IOW(KVMIO, 0xa3, struct kvm_enable_cap)
+/* Available with KVM_CAP_XSAVE */
+#define KVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave)
+#define KVM_SET_XSAVE _IOW(KVMIO, 0xa5, struct kvm_xsave)
+/* Available with KVM_CAP_XCRS */
+#define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs)
+#define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs)
+
+#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
+
+struct kvm_assigned_pci_dev {
+ __u32 assigned_dev_id;
+ __u32 busnr;
+ __u32 devfn;
+ __u32 flags;
+ __u32 segnr;
+ union {
+ __u32 reserved[11];
+ };
+};
+
+#define KVM_DEV_IRQ_HOST_INTX (1 << 0)
+#define KVM_DEV_IRQ_HOST_MSI (1 << 1)
+#define KVM_DEV_IRQ_HOST_MSIX (1 << 2)
+
+#define KVM_DEV_IRQ_GUEST_INTX (1 << 8)
+#define KVM_DEV_IRQ_GUEST_MSI (1 << 9)
+#define KVM_DEV_IRQ_GUEST_MSIX (1 << 10)
+
+#define KVM_DEV_IRQ_HOST_MASK 0x00ff
+#define KVM_DEV_IRQ_GUEST_MASK 0xff00
+
+struct kvm_assigned_irq {
+ __u32 assigned_dev_id;
+ __u32 host_irq;
+ __u32 guest_irq;
+ __u32 flags;
+ union {
+ struct {
+ __u32 addr_lo;
+ __u32 addr_hi;
+ __u32 data;
+ } guest_msi;
+ __u32 reserved[12];
+ };
+};
+
+
+struct kvm_assigned_msix_nr {
+ __u32 assigned_dev_id;
+ __u16 entry_nr;
+ __u16 padding;
+};
+
+#define KVM_MAX_MSIX_PER_DEV 256
+struct kvm_assigned_msix_entry {
+ __u32 assigned_dev_id;
+ __u32 gsi;
+ __u16 entry; /* The index of entry in the MSI-X table */
+ __u16 padding[3];
+};
+
+#endif /* __LINUX_KVM_H */
diff --git a/linux/include/linux/kvm_host.h b/linux/include/linux/kvm_host.h
new file mode 100644
index 0000000..84546bd
--- /dev/null
+++ b/linux/include/linux/kvm_host.h
@@ -0,0 +1,778 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+#ifndef __KVM_HOST_H
+#define __KVM_HOST_H
+
+/*
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/types.h>
+#include <linux/hardirq.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/preempt.h>
+#include <linux/msi.h>
+#include <linux/slab.h>
+#include <linux/rcupdate.h>
+#include <asm/signal.h>
+
+#include <linux/kvm.h>
+#include <linux/kvm_para.h>
+
+#include <linux/kvm_types.h>
+
+#include <asm/kvm_host.h>
+
+/*
+ * vcpu->requests bit members
+ */
+#define KVM_REQ_TLB_FLUSH 0
+#define KVM_REQ_MIGRATE_TIMER 1
+#define KVM_REQ_REPORT_TPR_ACCESS 2
+#define KVM_REQ_MMU_RELOAD 3
+#define KVM_REQ_TRIPLE_FAULT 4
+#define KVM_REQ_PENDING_TIMER 5
+#define KVM_REQ_UNHALT 6
+#define KVM_REQ_MMU_SYNC 7
+#define KVM_REQ_CLOCK_UPDATE 8
+#define KVM_REQ_KICK 9
+#define KVM_REQ_DEACTIVATE_FPU 10
+#define KVM_REQ_EVENT 11
+#define KVM_REQ_APF_HALT 12
+
+#define KVM_USERSPACE_IRQ_SOURCE_ID 0
+
+struct kvm;
+struct kvm_vcpu;
+extern struct kmem_cache *kvm_vcpu_cache;
+
+/*
+ * It would be nice to use something smarter than a linear search, TBD...
+ * Thankfully we dont expect many devices to register (famous last words :),
+ * so until then it will suffice. At least its abstracted so we can change
+ * in one place.
+ */
+struct kvm_io_bus {
+ int dev_count;
+#define NR_IOBUS_DEVS 200
+ struct kvm_io_device *devs[NR_IOBUS_DEVS];
+};
+
+enum kvm_bus {
+ KVM_MMIO_BUS,
+ KVM_PIO_BUS,
+ KVM_NR_BUSES
+};
+
+int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+ int len, const void *val);
+int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len,
+ void *val);
+int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+ struct kvm_io_device *dev);
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+ struct kvm_io_device *dev);
+
+#ifdef CONFIG_KVM_ASYNC_PF
+struct kvm_async_pf {
+ struct work_struct work;
+ struct list_head link;
+ struct list_head queue;
+ struct kvm_vcpu *vcpu;
+ struct mm_struct *mm;
+ gva_t gva;
+ unsigned long addr;
+ struct kvm_arch_async_pf arch;
+ struct page *page;
+ bool done;
+};
+
+void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
+void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
+int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+ struct kvm_arch_async_pf *arch);
+int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
+#endif
+
+struct kvm_vcpu {
+ struct kvm *kvm;
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+ struct preempt_notifier preempt_notifier;
+#endif
+ int vcpu_id;
+ struct mutex mutex;
+ int cpu;
+ atomic_t guest_mode;
+ struct kvm_run *run;
+ unsigned long requests;
+ unsigned long guest_debug;
+ int srcu_idx;
+
+ int fpu_active;
+ int guest_fpu_loaded, guest_xcr0_loaded;
+ wait_queue_head_t wq;
+ int sigset_active;
+ sigset_t sigset;
+ struct kvm_vcpu_stat stat;
+
+#ifdef CONFIG_HAS_IOMEM
+ int mmio_needed;
+ int mmio_read_completed;
+ int mmio_is_write;
+ int mmio_size;
+ unsigned char mmio_data[8];
+ gpa_t mmio_phys_addr;
+#endif
+
+#ifdef CONFIG_KVM_ASYNC_PF
+ struct {
+ u32 queued;
+ struct list_head queue;
+ struct list_head done;
+ spinlock_t lock;
+ } async_pf;
+#endif
+
+ struct kvm_vcpu_arch arch;
+};
+
+/*
+ * Some of the bitops functions do not support too long bitmaps.
+ * This number must be determined not to exceed such limits.
+ */
+#define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)
+
+struct kvm_lpage_info {
+ unsigned long rmap_pde;
+ int write_count;
+};
+
+struct kvm_memory_slot {
+ gfn_t base_gfn;
+ unsigned long npages;
+ unsigned long flags;
+ unsigned long *rmap;
+ unsigned long *dirty_bitmap;
+ unsigned long *dirty_bitmap_head;
+ struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
+ unsigned long userspace_addr;
+ int user_alloc;
+ int id;
+};
+
+static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot)
+{
+ return ALIGN(memslot->npages, BITS_PER_LONG) / 8;
+}
+
+struct kvm_kernel_irq_routing_entry {
+ u32 gsi;
+ u32 type;
+ int (*set)(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level);
+ union {
+ struct {
+ unsigned irqchip;
+ unsigned pin;
+ } irqchip;
+ struct msi_msg msi;
+ };
+ struct hlist_node link;
+};
+
+#ifdef __KVM_HAVE_IOAPIC
+
+struct kvm_irq_routing_table {
+ int chip[KVM_NR_IRQCHIPS][KVM_IOAPIC_NUM_PINS];
+ struct kvm_kernel_irq_routing_entry *rt_entries;
+ u32 nr_rt_entries;
+ /*
+ * Array indexed by gsi. Each entry contains list of irq chips
+ * the gsi is connected to.
+ */
+ struct hlist_head map[0];
+};
+
+#else
+
+struct kvm_irq_routing_table {};
+
+#endif
+
+struct kvm_memslots {
+ int nmemslots;
+ u64 generation;
+ struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
+ KVM_PRIVATE_MEM_SLOTS];
+};
+
+struct kvm {
+ spinlock_t mmu_lock;
+ raw_spinlock_t requests_lock;
+ struct mutex slots_lock;
+ struct mm_struct *mm; /* userspace tied to this vm */
+ struct kvm_memslots *memslots;
+ struct srcu_struct srcu;
+#ifdef CONFIG_KVM_APIC_ARCHITECTURE
+ u32 bsp_vcpu_id;
+ struct kvm_vcpu *bsp_vcpu;
+#endif
+ struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+ atomic_t online_vcpus;
+ struct list_head vm_list;
+ struct mutex lock;
+ struct kvm_io_bus *buses[KVM_NR_BUSES];
+#ifdef CONFIG_HAVE_KVM_EVENTFD
+ struct {
+ spinlock_t lock;
+ struct list_head items;
+ } irqfds;
+ struct list_head ioeventfds;
+#endif
+ struct kvm_vm_stat stat;
+ struct kvm_arch arch;
+ atomic_t users_count;
+#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+ struct kvm_coalesced_mmio_dev *coalesced_mmio_dev;
+ struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
+#endif
+
+ struct mutex irq_lock;
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
+ /*
+ * Update side is protected by irq_lock and,
+ * if configured, irqfds.lock.
+ */
+ struct kvm_irq_routing_table __rcu *irq_routing;
+ struct hlist_head mask_notifier_list;
+ struct hlist_head irq_ack_notifier_list;
+#endif
+
+#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
+ struct mmu_notifier mmu_notifier;
+ unsigned long mmu_notifier_seq;
+ long mmu_notifier_count;
+#endif
+ long tlbs_dirty;
+};
+
+/* The guest did something we don't support. */
+#define pr_unimpl(vcpu, fmt, ...) \
+ do { \
+ if (printk_ratelimit()) \
+ printk(KERN_ERR "kvm: %i: cpu%i " fmt, \
+ current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
+ } while (0)
+
+#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
+#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
+
+static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
+{
+ smp_rmb();
+ return kvm->vcpus[i];
+}
+
+#define kvm_for_each_vcpu(idx, vcpup, kvm) \
+ for (idx = 0, vcpup = kvm_get_vcpu(kvm, idx); \
+ idx < atomic_read(&kvm->online_vcpus) && vcpup; \
+ vcpup = kvm_get_vcpu(kvm, ++idx))
+
+int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
+void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
+
+void vcpu_load(struct kvm_vcpu *vcpu);
+void vcpu_put(struct kvm_vcpu *vcpu);
+
+int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
+ struct module *module);
+void kvm_exit(void);
+
+void kvm_get_kvm(struct kvm *kvm);
+void kvm_put_kvm(struct kvm *kvm);
+
+static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
+{
+ return rcu_dereference_check(kvm->memslots,
+ srcu_read_lock_held(&kvm->srcu)
+ || lockdep_is_held(&kvm->slots_lock));
+}
+
+#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
+#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
+static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
+
+extern struct page *bad_page;
+extern pfn_t bad_pfn;
+
+int is_error_page(struct page *page);
+int is_error_pfn(pfn_t pfn);
+int is_hwpoison_pfn(pfn_t pfn);
+int is_fault_pfn(pfn_t pfn);
+int kvm_is_error_hva(unsigned long addr);
+int kvm_set_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ int user_alloc);
+int __kvm_set_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ int user_alloc);
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
+ struct kvm_memory_slot old,
+ struct kvm_userspace_memory_region *mem,
+ int user_alloc);
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ struct kvm_memory_slot old,
+ int user_alloc);
+void kvm_disable_largepages(void);
+void kvm_arch_flush_shadow(struct kvm *kvm);
+
+int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
+ int nr_pages);
+
+struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
+void kvm_release_page_clean(struct page *page);
+void kvm_release_page_dirty(struct page *page);
+void kvm_set_page_dirty(struct page *page);
+void kvm_set_page_accessed(struct page *page);
+
+pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr);
+pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
+pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
+ bool write_fault, bool *writable);
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
+pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
+ bool *writable);
+pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn);
+int memslot_id(struct kvm *kvm, gfn_t gfn);
+void kvm_release_pfn_dirty(pfn_t);
+void kvm_release_pfn_clean(pfn_t pfn);
+void kvm_set_pfn_dirty(pfn_t pfn);
+void kvm_set_pfn_accessed(pfn_t pfn);
+void kvm_get_pfn(pfn_t pfn);
+
+int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
+ int len);
+int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
+ unsigned long len);
+int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
+int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
+ int offset, int len);
+int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
+ unsigned long len);
+int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+ void *data, unsigned long len);
+int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+ gpa_t gpa);
+int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
+int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
+struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
+int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
+unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn);
+void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
+void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ gfn_t gfn);
+
+void kvm_vcpu_block(struct kvm_vcpu *vcpu);
+void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
+void kvm_resched(struct kvm_vcpu *vcpu);
+void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
+void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
+
+void kvm_flush_remote_tlbs(struct kvm *kvm);
+void kvm_reload_remote_mmus(struct kvm *kvm);
+
+long kvm_arch_dev_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg);
+long kvm_arch_vcpu_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg);
+
+int kvm_dev_ioctl_check_extension(long ext);
+
+int kvm_get_dirty_log(struct kvm *kvm,
+ struct kvm_dirty_log *log, int *is_dirty);
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+ struct kvm_dirty_log *log);
+
+int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
+ struct
+ kvm_userspace_memory_region *mem,
+ int user_alloc);
+long kvm_arch_vm_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg);
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
+
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+ struct kvm_translation *tr);
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs);
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs);
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state);
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state);
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug *dbg);
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
+
+int kvm_arch_init(void *opaque);
+void kvm_arch_exit(void);
+
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
+
+void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
+
+int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
+int kvm_arch_hardware_enable(void *garbage);
+void kvm_arch_hardware_disable(void *garbage);
+int kvm_arch_hardware_setup(void);
+void kvm_arch_hardware_unsetup(void);
+void kvm_arch_check_processor_compat(void *rtn);
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
+
+void kvm_free_physmem(struct kvm *kvm);
+
+#ifndef __KVM_HAVE_ARCH_VM_ALLOC
+static inline struct kvm *kvm_arch_alloc_vm(void)
+{
+ return kzalloc(sizeof(struct kvm), GFP_KERNEL);
+}
+
+static inline void kvm_arch_free_vm(struct kvm *kvm)
+{
+ kfree(kvm);
+}
+#endif
+
+int kvm_arch_init_vm(struct kvm *kvm);
+void kvm_arch_destroy_vm(struct kvm *kvm);
+void kvm_free_all_assigned_devices(struct kvm *kvm);
+void kvm_arch_sync_events(struct kvm *kvm);
+
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
+
+int kvm_is_mmio_pfn(pfn_t pfn);
+
+struct kvm_irq_ack_notifier {
+ struct hlist_node link;
+ unsigned gsi;
+ void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
+};
+
+struct kvm_assigned_dev_kernel {
+ struct kvm_irq_ack_notifier ack_notifier;
+ struct list_head list;
+ int assigned_dev_id;
+ int host_segnr;
+ int host_busnr;
+ int host_devfn;
+ unsigned int entries_nr;
+ int host_irq;
+ bool host_irq_disabled;
+ struct msix_entry *host_msix_entries;
+ int guest_irq;
+ struct msix_entry *guest_msix_entries;
+ unsigned long irq_requested_type;
+ int irq_source_id;
+ int flags;
+ struct pci_dev *dev;
+ struct kvm *kvm;
+ spinlock_t intx_lock;
+ char irq_name[32];
+};
+
+struct kvm_irq_mask_notifier {
+ void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
+ int irq;
+ struct hlist_node link;
+};
+
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+ bool mask);
+
+#ifdef __KVM_HAVE_IOAPIC
+void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
+ union kvm_ioapic_redirect_entry *entry,
+ unsigned long *deliver_bitmask);
+#endif
+int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
+ int irq_source_id, int level);
+void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
+void kvm_register_irq_ack_notifier(struct kvm *kvm,
+ struct kvm_irq_ack_notifier *kian);
+void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
+ struct kvm_irq_ack_notifier *kian);
+int kvm_request_irq_source_id(struct kvm *kvm);
+void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
+
+/* For vcpu->arch.iommu_flags */
+#define KVM_IOMMU_CACHE_COHERENCY 0x1
+
+#ifdef CONFIG_IOMMU_API
+int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot);
+int kvm_iommu_map_guest(struct kvm *kvm);
+int kvm_iommu_unmap_guest(struct kvm *kvm);
+int kvm_assign_device(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev);
+int kvm_deassign_device(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev);
+#else /* CONFIG_IOMMU_API */
+static inline int kvm_iommu_map_pages(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ return 0;
+}
+
+static inline int kvm_iommu_map_guest(struct kvm *kvm)
+{
+ return -ENODEV;
+}
+
+static inline int kvm_iommu_unmap_guest(struct kvm *kvm)
+{
+ return 0;
+}
+
+static inline int kvm_assign_device(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev)
+{
+ return 0;
+}
+
+static inline int kvm_deassign_device(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev)
+{
+ return 0;
+}
+#endif /* CONFIG_IOMMU_API */
+
+static inline void kvm_guest_enter(void)
+{
+ account_system_vtime(current);
+ current->flags |= PF_VCPU;
+}
+
+static inline void kvm_guest_exit(void)
+{
+ account_system_vtime(current);
+ current->flags &= ~PF_VCPU;
+}
+
+static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
+ gfn_t gfn)
+{
+ return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
+}
+
+static inline gpa_t gfn_to_gpa(gfn_t gfn)
+{
+ return (gpa_t)gfn << PAGE_SHIFT;
+}
+
+static inline gfn_t gpa_to_gfn(gpa_t gpa)
+{
+ return (gfn_t)(gpa >> PAGE_SHIFT);
+}
+
+static inline hpa_t pfn_to_hpa(pfn_t pfn)
+{
+ return (hpa_t)pfn << PAGE_SHIFT;
+}
+
+static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu)
+{
+ set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests);
+}
+
+enum kvm_stat_kind {
+ KVM_STAT_VM,
+ KVM_STAT_VCPU,
+};
+
+struct kvm_stats_debugfs_item {
+ const char *name;
+ int offset;
+ enum kvm_stat_kind kind;
+ struct dentry *dentry;
+};
+extern struct kvm_stats_debugfs_item debugfs_entries[];
+extern struct dentry *kvm_debugfs_dir;
+
+#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
+static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq)
+{
+ if (unlikely(vcpu->kvm->mmu_notifier_count))
+ return 1;
+ /*
+ * Both reads happen under the mmu_lock and both values are
+ * modified under mmu_lock, so there's no need of smb_rmb()
+ * here in between, otherwise mmu_notifier_count should be
+ * read before mmu_notifier_seq, see
+ * mmu_notifier_invalidate_range_end write side.
+ */
+ if (vcpu->kvm->mmu_notifier_seq != mmu_seq)
+ return 1;
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
+
+#define KVM_MAX_IRQ_ROUTES 1024
+
+int kvm_setup_default_irq_routing(struct kvm *kvm);
+int kvm_set_irq_routing(struct kvm *kvm,
+ const struct kvm_irq_routing_entry *entries,
+ unsigned nr,
+ unsigned flags);
+void kvm_free_irq_routing(struct kvm *kvm);
+
+#else
+
+static inline void kvm_free_irq_routing(struct kvm *kvm) {}
+
+#endif
+
+#ifdef CONFIG_HAVE_KVM_EVENTFD
+
+void kvm_eventfd_init(struct kvm *kvm);
+int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags);
+void kvm_irqfd_release(struct kvm *kvm);
+void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *);
+int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
+
+#else
+
+static inline void kvm_eventfd_init(struct kvm *kvm) {}
+
+static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
+{
+ return -EINVAL;
+}
+
+static inline void kvm_irqfd_release(struct kvm *kvm) {}
+
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
+static inline void kvm_irq_routing_update(struct kvm *kvm,
+ struct kvm_irq_routing_table *irq_rt)
+{
+ rcu_assign_pointer(kvm->irq_routing, irq_rt);
+}
+#endif
+
+static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
+{
+ return -ENOSYS;
+}
+
+#endif /* CONFIG_HAVE_KVM_EVENTFD */
+
+#ifdef CONFIG_KVM_APIC_ARCHITECTURE
+static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
+{
+ return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id;
+}
+#endif
+
+#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT
+
+long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
+ unsigned long arg);
+
+#else
+
+static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
+ unsigned long arg)
+{
+ return -ENOTTY;
+}
+
+#endif
+
+static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
+{
+ set_bit(req, &vcpu->requests);
+}
+
+static inline bool kvm_make_check_request(int req, struct kvm_vcpu *vcpu)
+{
+ return test_and_set_bit(req, &vcpu->requests);
+}
+
+static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
+{
+ if (test_bit(req, &vcpu->requests)) {
+ clear_bit(req, &vcpu->requests);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+#endif
+
diff --git a/linux/include/linux/kvm_para.h b/linux/include/linux/kvm_para.h
new file mode 100644
index 0000000..e13e53d
--- /dev/null
+++ b/linux/include/linux/kvm_para.h
@@ -0,0 +1,78 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+#ifndef __LINUX_KVM_PARA_H
+#define __LINUX_KVM_PARA_H
+
+/*
+ * This header file provides a method for making a hypercall to the host
+ * Architectures should define:
+ * - kvm_hypercall0, kvm_hypercall1...
+ * - kvm_arch_para_features
+ * - kvm_para_available
+ */
+
+/* Return values for hypercalls */
+#define KVM_ENOSYS 1000
+#define KVM_EFAULT EFAULT
+#define KVM_E2BIG E2BIG
+#define KVM_EPERM EPERM
+
+#define KVM_HC_VAPIC_POLL_IRQ 1
+#define KVM_HC_MMU_OP 2
+#define KVM_HC_FEATURES 3
+#define KVM_HC_PPC_MAP_MAGIC_PAGE 4
+
+/*
+ * hypercalls use architecture specific
+ */
+#include <asm/kvm_para.h>
+
+#ifdef __KERNEL__
+
+static inline int kvm_para_has_feature(unsigned int feature)
+{
+ if (kvm_arch_para_features() & (1UL << feature))
+ return 1;
+ return 0;
+}
+#endif /* __KERNEL__ */
+#endif /* __LINUX_KVM_PARA_H */
+
diff --git a/linux/include/linux/kvm_types.h b/linux/include/linux/kvm_types.h
new file mode 100644
index 0000000..e35297a
--- /dev/null
+++ b/linux/include/linux/kvm_types.h
@@ -0,0 +1,117 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#ifndef __KVM_TYPES_H__
+#define __KVM_TYPES_H__
+
+#include <asm/types.h>
+
+/*
+ * Address types:
+ *
+ * gva - guest virtual address
+ * gpa - guest physical address
+ * gfn - guest frame number
+ * hva - host virtual address
+ * hpa - host physical address
+ * hfn - host frame number
+ */
+
+typedef unsigned long gva_t;
+typedef u64 gpa_t;
+typedef u64 gfn_t;
+
+typedef unsigned long hva_t;
+typedef u64 hpa_t;
+typedef u64 hfn_t;
+
+typedef hfn_t pfn_t;
+
+union kvm_ioapic_redirect_entry {
+ u64 bits;
+ struct {
+ u8 vector;
+ u8 delivery_mode:3;
+ u8 dest_mode:1;
+ u8 delivery_status:1;
+ u8 polarity:1;
+ u8 remote_irr:1;
+ u8 trig_mode:1;
+ u8 mask:1;
+ u8 reserve:7;
+ u8 reserved[4];
+ u8 dest_id;
+ } fields;
+};
+
+struct kvm_lapic_irq {
+ u32 vector;
+ u32 delivery_mode;
+ u32 dest_mode;
+ u32 level;
+ u32 trig_mode;
+ u32 shorthand;
+ u32 dest_id;
+};
+
+struct gfn_to_hva_cache {
+ u64 generation;
+ gpa_t gpa;
+ unsigned long hva;
+ struct kvm_memory_slot *memslot;
+};
+
+#endif /* __KVM_TYPES_H__ */
diff --git a/linux/include/trace/events/kvm.h b/linux/include/trace/events/kvm.h
new file mode 100644
index 0000000..1c1c1f7
--- /dev/null
+++ b/linux/include/trace/events/kvm.h
@@ -0,0 +1,352 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+#if !defined(_TRACE_KVM_MAIN_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_MAIN_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+
+#define ERSN(x) { KVM_EXIT_##x, "KVM_EXIT_" #x }
+
+#define kvm_trace_exit_reason \
+ ERSN(UNKNOWN), ERSN(EXCEPTION), ERSN(IO), ERSN(HYPERCALL), \
+ ERSN(DEBUG), ERSN(HLT), ERSN(MMIO), ERSN(IRQ_WINDOW_OPEN), \
+ ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR), \
+ ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\
+ ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI)
+
+TRACE_EVENT(kvm_userspace_exit,
+ TP_PROTO(__u32 reason, int errno),
+ TP_ARGS(reason, errno),
+
+ TP_STRUCT__entry(
+ __field( __u32, reason )
+ __field( int, errno )
+ ),
+
+ TP_fast_assign(
+ __entry->reason = reason;
+ __entry->errno = errno;
+ ),
+
+ TP_printk("reason %s (%d)",
+ __entry->errno < 0 ?
+ (__entry->errno == -EINTR ? "restart" : "error") :
+ __print_symbolic(__entry->reason, kvm_trace_exit_reason),
+ __entry->errno < 0 ? -__entry->errno : __entry->reason)
+);
+
+#if defined(__KVM_HAVE_IOAPIC)
+TRACE_EVENT(kvm_set_irq,
+ TP_PROTO(unsigned int gsi, int level, int irq_source_id),
+ TP_ARGS(gsi, level, irq_source_id),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, gsi )
+ __field( int, level )
+ __field( int, irq_source_id )
+ ),
+
+ TP_fast_assign(
+ __entry->gsi = gsi;
+ __entry->level = level;
+ __entry->irq_source_id = irq_source_id;
+ ),
+
+ TP_printk("gsi %u level %d source %d",
+ __entry->gsi, __entry->level, __entry->irq_source_id)
+);
+
+#define kvm_deliver_mode \
+ {0x0, "Fixed"}, \
+ {0x1, "LowPrio"}, \
+ {0x2, "SMI"}, \
+ {0x3, "Res3"}, \
+ {0x4, "NMI"}, \
+ {0x5, "INIT"}, \
+ {0x6, "SIPI"}, \
+ {0x7, "ExtINT"}
+
+TRACE_EVENT(kvm_ioapic_set_irq,
+ TP_PROTO(__u64 e, int pin, bool coalesced),
+ TP_ARGS(e, pin, coalesced),
+
+ TP_STRUCT__entry(
+ __field( __u64, e )
+ __field( int, pin )
+ __field( bool, coalesced )
+ ),
+
+ TP_fast_assign(
+ __entry->e = e;
+ __entry->pin = pin;
+ __entry->coalesced = coalesced;
+ ),
+
+ TP_printk("pin %u dst %x vec=%u (%s|%s|%s%s)%s",
+ __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e,
+ __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
+ (__entry->e & (1<<11)) ? "logical" : "physical",
+ (__entry->e & (1<<15)) ? "level" : "edge",
+ (__entry->e & (1<<16)) ? "|masked" : "",
+ __entry->coalesced ? " (coalesced)" : "")
+);
+
+TRACE_EVENT(kvm_msi_set_irq,
+ TP_PROTO(__u64 address, __u64 data),
+ TP_ARGS(address, data),
+
+ TP_STRUCT__entry(
+ __field( __u64, address )
+ __field( __u64, data )
+ ),
+
+ TP_fast_assign(
+ __entry->address = address;
+ __entry->data = data;
+ ),
+
+ TP_printk("dst %u vec %x (%s|%s|%s%s)",
+ (u8)(__entry->address >> 12), (u8)__entry->data,
+ __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode),
+ (__entry->address & (1<<2)) ? "logical" : "physical",
+ (__entry->data & (1<<15)) ? "level" : "edge",
+ (__entry->address & (1<<3)) ? "|rh" : "")
+);
+
+#define kvm_irqchips \
+ {KVM_IRQCHIP_PIC_MASTER, "PIC master"}, \
+ {KVM_IRQCHIP_PIC_SLAVE, "PIC slave"}, \
+ {KVM_IRQCHIP_IOAPIC, "IOAPIC"}
+
+TRACE_EVENT(kvm_ack_irq,
+ TP_PROTO(unsigned int irqchip, unsigned int pin),
+ TP_ARGS(irqchip, pin),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irqchip )
+ __field( unsigned int, pin )
+ ),
+
+ TP_fast_assign(
+ __entry->irqchip = irqchip;
+ __entry->pin = pin;
+ ),
+
+ TP_printk("irqchip %s pin %u",
+ __print_symbolic(__entry->irqchip, kvm_irqchips),
+ __entry->pin)
+);
+
+
+
+#endif /* defined(__KVM_HAVE_IOAPIC) */
+
+#define KVM_TRACE_MMIO_READ_UNSATISFIED 0
+#define KVM_TRACE_MMIO_READ 1
+#define KVM_TRACE_MMIO_WRITE 2
+
+#define kvm_trace_symbol_mmio \
+ { KVM_TRACE_MMIO_READ_UNSATISFIED, "unsatisfied-read" }, \
+ { KVM_TRACE_MMIO_READ, "read" }, \
+ { KVM_TRACE_MMIO_WRITE, "write" }
+
+TRACE_EVENT(kvm_mmio,
+ TP_PROTO(int type, int len, u64 gpa, u64 val),
+ TP_ARGS(type, len, gpa, val),
+
+ TP_STRUCT__entry(
+ __field( u32, type )
+ __field( u32, len )
+ __field( u64, gpa )
+ __field( u64, val )
+ ),
+
+ TP_fast_assign(
+ __entry->type = type;
+ __entry->len = len;
+ __entry->gpa = gpa;
+ __entry->val = val;
+ ),
+
+ TP_printk("mmio %s len %u gpa 0x%llx val 0x%llx",
+ __print_symbolic(__entry->type, kvm_trace_symbol_mmio),
+ __entry->len, __entry->gpa, __entry->val)
+);
+
+#define kvm_fpu_load_symbol \
+ {0, "unload"}, \
+ {1, "load"}
+
+TRACE_EVENT(kvm_fpu,
+ TP_PROTO(int load),
+ TP_ARGS(load),
+
+ TP_STRUCT__entry(
+ __field( u32, load )
+ ),
+
+ TP_fast_assign(
+ __entry->load = load;
+ ),
+
+ TP_printk("%s", __print_symbolic(__entry->load, kvm_fpu_load_symbol))
+);
+
+TRACE_EVENT(kvm_age_page,
+ TP_PROTO(ulong hva, struct kvm_memory_slot *slot, int ref),
+ TP_ARGS(hva, slot, ref),
+
+ TP_STRUCT__entry(
+ __field( u64, hva )
+ __field( u64, gfn )
+ __field( u8, referenced )
+ ),
+
+ TP_fast_assign(
+ __entry->hva = hva;
+ __entry->gfn =
+ slot->base_gfn + ((hva - slot->userspace_addr) >> PAGE_SHIFT);
+ __entry->referenced = ref;
+ ),
+
+ TP_printk("hva %llx gfn %llx %s",
+ __entry->hva, __entry->gfn,
+ __entry->referenced ? "YOUNG" : "OLD")
+);
+
+#ifdef CONFIG_KVM_ASYNC_PF
+DECLARE_EVENT_CLASS(kvm_async_get_page_class,
+
+ TP_PROTO(u64 gva, u64 gfn),
+
+ TP_ARGS(gva, gfn),
+
+ TP_STRUCT__entry(
+ __field(__u64, gva)
+ __field(u64, gfn)
+ ),
+
+ TP_fast_assign(
+ __entry->gva = gva;
+ __entry->gfn = gfn;
+ ),
+
+ TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn)
+);
+
+DEFINE_EVENT(kvm_async_get_page_class, kvm_try_async_get_page,
+
+ TP_PROTO(u64 gva, u64 gfn),
+
+ TP_ARGS(gva, gfn)
+);
+
+DEFINE_EVENT(kvm_async_get_page_class, kvm_async_pf_doublefault,
+
+ TP_PROTO(u64 gva, u64 gfn),
+
+ TP_ARGS(gva, gfn)
+);
+
+DECLARE_EVENT_CLASS(kvm_async_pf_nopresent_ready,
+
+ TP_PROTO(u64 token, u64 gva),
+
+ TP_ARGS(token, gva),
+
+ TP_STRUCT__entry(
+ __field(__u64, token)
+ __field(__u64, gva)
+ ),
+
+ TP_fast_assign(
+ __entry->token = token;
+ __entry->gva = gva;
+ ),
+
+ TP_printk("token %#llx gva %#llx", __entry->token, __entry->gva)
+
+);
+
+DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_not_present,
+
+ TP_PROTO(u64 token, u64 gva),
+
+ TP_ARGS(token, gva)
+);
+
+DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_ready,
+
+ TP_PROTO(u64 token, u64 gva),
+
+ TP_ARGS(token, gva)
+);
+
+TRACE_EVENT(
+ kvm_async_pf_completed,
+ TP_PROTO(unsigned long address, struct page *page, u64 gva),
+ TP_ARGS(address, page, gva),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, address)
+ __field(pfn_t, pfn)
+ __field(u64, gva)
+ ),
+
+ TP_fast_assign(
+ __entry->address = address;
+ __entry->pfn = page ? page_to_pfn(page) : 0;
+ __entry->gva = gva;
+ ),
+
+ TP_printk("gva %#llx address %#lx pfn %#llx", __entry->gva,
+ __entry->address, __entry->pfn)
+);
+
+#endif
+
+#endif /* _TRACE_KVM_MAIN_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/linux/usr/include/asm-ia64/kvm.h b/linux/usr/include/asm-ia64/kvm.h
new file mode 100644
index 0000000..bc90c75
--- /dev/null
+++ b/linux/usr/include/asm-ia64/kvm.h
@@ -0,0 +1,264 @@
+#ifndef __ASM_IA64_KVM_H
+#define __ASM_IA64_KVM_H
+
+/*
+ * kvm structure definitions for ia64
+ *
+ * Copyright (C) 2007 Xiantao Zhang <xiantao.zhang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+/* Select x86 specific features in <linux/kvm.h> */
+#define __KVM_HAVE_IOAPIC
+#define __KVM_HAVE_DEVICE_ASSIGNMENT
+
+/* Architectural interrupt line count. */
+#define KVM_NR_INTERRUPTS 256
+
+#define KVM_IOAPIC_NUM_PINS 48
+
+struct kvm_ioapic_state {
+ __u64 base_address;
+ __u32 ioregsel;
+ __u32 id;
+ __u32 irr;
+ __u32 pad;
+ union {
+ __u64 bits;
+ struct {
+ __u8 vector;
+ __u8 delivery_mode:3;
+ __u8 dest_mode:1;
+ __u8 delivery_status:1;
+ __u8 polarity:1;
+ __u8 remote_irr:1;
+ __u8 trig_mode:1;
+ __u8 mask:1;
+ __u8 reserve:7;
+ __u8 reserved[4];
+ __u8 dest_id;
+ } fields;
+ } redirtbl[KVM_IOAPIC_NUM_PINS];
+};
+
+#define KVM_IRQCHIP_PIC_MASTER 0
+#define KVM_IRQCHIP_PIC_SLAVE 1
+#define KVM_IRQCHIP_IOAPIC 2
+#define KVM_NR_IRQCHIPS 3
+
+#define KVM_CONTEXT_SIZE 8*1024
+
+struct kvm_fpreg {
+ union {
+ unsigned long bits[2];
+ long double __dummy; /* force 16-byte alignment */
+ } u;
+};
+
+union context {
+ /* 8K size */
+ char dummy[KVM_CONTEXT_SIZE];
+ struct {
+ unsigned long psr;
+ unsigned long pr;
+ unsigned long caller_unat;
+ unsigned long pad;
+ unsigned long gr[32];
+ unsigned long ar[128];
+ unsigned long br[8];
+ unsigned long cr[128];
+ unsigned long rr[8];
+ unsigned long ibr[8];
+ unsigned long dbr[8];
+ unsigned long pkr[8];
+ struct kvm_fpreg fr[128];
+ };
+};
+
+struct thash_data {
+ union {
+ struct {
+ unsigned long p : 1; /* 0 */
+ unsigned long rv1 : 1; /* 1 */
+ unsigned long ma : 3; /* 2-4 */
+ unsigned long a : 1; /* 5 */
+ unsigned long d : 1; /* 6 */
+ unsigned long pl : 2; /* 7-8 */
+ unsigned long ar : 3; /* 9-11 */
+ unsigned long ppn : 38; /* 12-49 */
+ unsigned long rv2 : 2; /* 50-51 */
+ unsigned long ed : 1; /* 52 */
+ unsigned long ig1 : 11; /* 53-63 */
+ };
+ struct {
+ unsigned long __rv1 : 53; /* 0-52 */
+ unsigned long contiguous : 1; /*53 */
+ unsigned long tc : 1; /* 54 TR or TC */
+ unsigned long cl : 1;
+ /* 55 I side or D side cache line */
+ unsigned long len : 4; /* 56-59 */
+ unsigned long io : 1; /* 60 entry is for io or not */
+ unsigned long nomap : 1;
+ /* 61 entry cann't be inserted into machine TLB.*/
+ unsigned long checked : 1;
+ /* 62 for VTLB/VHPT sanity check */
+ unsigned long invalid : 1;
+ /* 63 invalid entry */
+ };
+ unsigned long page_flags;
+ }; /* same for VHPT and TLB */
+
+ union {
+ struct {
+ unsigned long rv3 : 2;
+ unsigned long ps : 6;
+ unsigned long key : 24;
+ unsigned long rv4 : 32;
+ };
+ unsigned long itir;
+ };
+ union {
+ struct {
+ unsigned long ig2 : 12;
+ unsigned long vpn : 49;
+ unsigned long vrn : 3;
+ };
+ unsigned long ifa;
+ unsigned long vadr;
+ struct {
+ unsigned long tag : 63;
+ unsigned long ti : 1;
+ };
+ unsigned long etag;
+ };
+ union {
+ struct thash_data *next;
+ unsigned long rid;
+ unsigned long gpaddr;
+ };
+};
+
+#define NITRS 8
+#define NDTRS 8
+
+struct saved_vpd {
+ unsigned long vhpi;
+ unsigned long vgr[16];
+ unsigned long vbgr[16];
+ unsigned long vnat;
+ unsigned long vbnat;
+ unsigned long vcpuid[5];
+ unsigned long vpsr;
+ unsigned long vpr;
+ union {
+ unsigned long vcr[128];
+ struct {
+ unsigned long dcr;
+ unsigned long itm;
+ unsigned long iva;
+ unsigned long rsv1[5];
+ unsigned long pta;
+ unsigned long rsv2[7];
+ unsigned long ipsr;
+ unsigned long isr;
+ unsigned long rsv3;
+ unsigned long iip;
+ unsigned long ifa;
+ unsigned long itir;
+ unsigned long iipa;
+ unsigned long ifs;
+ unsigned long iim;
+ unsigned long iha;
+ unsigned long rsv4[38];
+ unsigned long lid;
+ unsigned long ivr;
+ unsigned long tpr;
+ unsigned long eoi;
+ unsigned long irr[4];
+ unsigned long itv;
+ unsigned long pmv;
+ unsigned long cmcv;
+ unsigned long rsv5[5];
+ unsigned long lrr0;
+ unsigned long lrr1;
+ unsigned long rsv6[46];
+ };
+ };
+};
+
+struct kvm_regs {
+ struct saved_vpd vpd;
+ /*Arch-regs*/
+ int mp_state;
+ unsigned long vmm_rr;
+ /* TR and TC. */
+ struct thash_data itrs[NITRS];
+ struct thash_data dtrs[NDTRS];
+ /* Bit is set if there is a tr/tc for the region. */
+ unsigned char itr_regions;
+ unsigned char dtr_regions;
+ unsigned char tc_regions;
+
+ char irq_check;
+ unsigned long saved_itc;
+ unsigned long itc_check;
+ unsigned long timer_check;
+ unsigned long timer_pending;
+ unsigned long last_itc;
+
+ unsigned long vrr[8];
+ unsigned long ibr[8];
+ unsigned long dbr[8];
+ unsigned long insvc[4]; /* Interrupt in service. */
+ unsigned long xtp;
+
+ unsigned long metaphysical_rr0; /* from kvm_arch (so is pinned) */
+ unsigned long metaphysical_rr4; /* from kvm_arch (so is pinned) */
+ unsigned long metaphysical_saved_rr0; /* from kvm_arch */
+ unsigned long metaphysical_saved_rr4; /* from kvm_arch */
+ unsigned long fp_psr; /*used for lazy float register */
+ unsigned long saved_gp;
+ /*for phycial emulation */
+
+ union context saved_guest;
+
+ unsigned long reserved[64]; /* for future use */
+};
+
+struct kvm_sregs {
+};
+
+struct kvm_fpu {
+};
+
+#define KVM_IA64_VCPU_STACK_SHIFT 16
+#define KVM_IA64_VCPU_STACK_SIZE (1UL << KVM_IA64_VCPU_STACK_SHIFT)
+
+struct kvm_ia64_vcpu_stack {
+ unsigned char stack[KVM_IA64_VCPU_STACK_SIZE];
+};
+
+struct kvm_debug_exit_arch {
+};
+
+/* for KVM_SET_GUEST_DEBUG */
+struct kvm_guest_debug_arch {
+};
+
+#endif
diff --git a/linux/usr/include/asm-ia64/kvm_para.h b/linux/usr/include/asm-ia64/kvm_para.h
new file mode 100644
index 0000000..0310eb4
--- /dev/null
+++ b/linux/usr/include/asm-ia64/kvm_para.h
@@ -0,0 +1,23 @@
+#ifndef __IA64_KVM_PARA_H
+#define __IA64_KVM_PARA_H
+
+/*
+ * Copyright (C) 2007 Xiantao Zhang <xiantao.zhang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+
+#endif
diff --git a/linux/usr/include/asm-x86/hyperv.h b/linux/usr/include/asm-x86/hyperv.h
new file mode 100644
index 0000000..5df477a
--- /dev/null
+++ b/linux/usr/include/asm-x86/hyperv.h
@@ -0,0 +1,193 @@
+#ifndef _ASM_X86_HYPERV_H
+#define _ASM_X86_HYPERV_H
+
+#include <linux/types.h>
+
+/*
+ * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
+ * is set by CPUID(HvCpuIdFunctionVersionAndFeatures).
+ */
+#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000
+#define HYPERV_CPUID_INTERFACE 0x40000001
+#define HYPERV_CPUID_VERSION 0x40000002
+#define HYPERV_CPUID_FEATURES 0x40000003
+#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
+#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
+
+#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000
+#define HYPERV_CPUID_MIN 0x40000005
+#define HYPERV_CPUID_MAX 0x4000ffff
+
+/*
+ * Feature identification. EAX indicates which features are available
+ * to the partition based upon the current partition privileges.
+ */
+
+/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */
+#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0)
+/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/
+#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1)
+/*
+ * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
+ * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available
+ */
+#define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2)
+/*
+ * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through
+ * HV_X64_MSR_STIMER3_COUNT) available
+ */
+#define HV_X64_MSR_SYNTIMER_AVAILABLE (1 << 3)
+/*
+ * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR)
+ * are available
+ */
+#define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4)
+/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/
+#define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5)
+/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/
+#define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6)
+/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/
+#define HV_X64_MSR_RESET_AVAILABLE (1 << 7)
+ /*
+ * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE,
+ * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE,
+ * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available
+ */
+#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8)
+
+/*
+ * Feature identification: EBX indicates which flags were specified at
+ * partition creation. The format is the same as the partition creation
+ * flag structure defined in section Partition Creation Flags.
+ */
+#define HV_X64_CREATE_PARTITIONS (1 << 0)
+#define HV_X64_ACCESS_PARTITION_ID (1 << 1)
+#define HV_X64_ACCESS_MEMORY_POOL (1 << 2)
+#define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3)
+#define HV_X64_POST_MESSAGES (1 << 4)
+#define HV_X64_SIGNAL_EVENTS (1 << 5)
+#define HV_X64_CREATE_PORT (1 << 6)
+#define HV_X64_CONNECT_PORT (1 << 7)
+#define HV_X64_ACCESS_STATS (1 << 8)
+#define HV_X64_DEBUGGING (1 << 11)
+#define HV_X64_CPU_POWER_MANAGEMENT (1 << 12)
+#define HV_X64_CONFIGURE_PROFILER (1 << 13)
+
+/*
+ * Feature identification. EDX indicates which miscellaneous features
+ * are available to the partition.
+ */
+/* The MWAIT instruction is available (per section MONITOR / MWAIT) */
+#define HV_X64_MWAIT_AVAILABLE (1 << 0)
+/* Guest debugging support is available */
+#define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1)
+/* Performance Monitor support is available*/
+#define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2)
+/* Support for physical CPU dynamic partitioning events is available*/
+#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3)
+/*
+ * Support for passing hypercall input parameter block via XMM
+ * registers is available
+ */
+#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4)
+/* Support for a virtual guest idle state is available */
+#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5)
+
+/*
+ * Implementation recommendations. Indicates which behaviors the hypervisor
+ * recommends the OS implement for optimal performance.
+ */
+ /*
+ * Recommend using hypercall for address space switches rather
+ * than MOV to CR3 instruction
+ */
+#define HV_X64_MWAIT_RECOMMENDED (1 << 0)
+/* Recommend using hypercall for local TLB flushes rather
+ * than INVLPG or MOV to CR3 instructions */
+#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1)
+/*
+ * Recommend using hypercall for remote TLB flushes rather
+ * than inter-processor interrupts
+ */
+#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2)
+/*
+ * Recommend using MSRs for accessing APIC registers
+ * EOI, ICR and TPR rather than their memory-mapped counterparts
+ */
+#define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3)
+/* Recommend using the hypervisor-provided MSR to initiate a system RESET */
+#define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4)
+/*
+ * Recommend using relaxed timing for this partition. If used,
+ * the VM should disable any watchdog timeouts that rely on the
+ * timely delivery of external interrupts
+ */
+#define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5)
+
+/* MSR used to identify the guest OS. */
+#define HV_X64_MSR_GUEST_OS_ID 0x40000000
+
+/* MSR used to setup pages used to communicate with the hypervisor. */
+#define HV_X64_MSR_HYPERCALL 0x40000001
+
+/* MSR used to provide vcpu index */
+#define HV_X64_MSR_VP_INDEX 0x40000002
+
+/* MSR used to read the per-partition time reference counter */
+#define HV_X64_MSR_TIME_REF_COUNT 0x40000020
+
+/* Define the virtual APIC registers */
+#define HV_X64_MSR_EOI 0x40000070
+#define HV_X64_MSR_ICR 0x40000071
+#define HV_X64_MSR_TPR 0x40000072
+#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073
+
+/* Define synthetic interrupt controller model specific registers. */
+#define HV_X64_MSR_SCONTROL 0x40000080
+#define HV_X64_MSR_SVERSION 0x40000081
+#define HV_X64_MSR_SIEFP 0x40000082
+#define HV_X64_MSR_SIMP 0x40000083
+#define HV_X64_MSR_EOM 0x40000084
+#define HV_X64_MSR_SINT0 0x40000090
+#define HV_X64_MSR_SINT1 0x40000091
+#define HV_X64_MSR_SINT2 0x40000092
+#define HV_X64_MSR_SINT3 0x40000093
+#define HV_X64_MSR_SINT4 0x40000094
+#define HV_X64_MSR_SINT5 0x40000095
+#define HV_X64_MSR_SINT6 0x40000096
+#define HV_X64_MSR_SINT7 0x40000097
+#define HV_X64_MSR_SINT8 0x40000098
+#define HV_X64_MSR_SINT9 0x40000099
+#define HV_X64_MSR_SINT10 0x4000009A
+#define HV_X64_MSR_SINT11 0x4000009B
+#define HV_X64_MSR_SINT12 0x4000009C
+#define HV_X64_MSR_SINT13 0x4000009D
+#define HV_X64_MSR_SINT14 0x4000009E
+#define HV_X64_MSR_SINT15 0x4000009F
+
+
+#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001
+#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12
+#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \
+ (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
+
+/* Declare the various hypercall operations. */
+#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT 0x0008
+
+#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001
+#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12
+#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \
+ (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
+
+#define HV_PROCESSOR_POWER_STATE_C0 0
+#define HV_PROCESSOR_POWER_STATE_C1 1
+#define HV_PROCESSOR_POWER_STATE_C2 2
+#define HV_PROCESSOR_POWER_STATE_C3 3
+
+/* hypercall status code */
+#define HV_STATUS_SUCCESS 0
+#define HV_STATUS_INVALID_HYPERCALL_CODE 2
+#define HV_STATUS_INVALID_HYPERCALL_INPUT 3
+#define HV_STATUS_INVALID_ALIGNMENT 4
+
+#endif
diff --git a/linux/usr/include/asm-x86/kvm.h b/linux/usr/include/asm-x86/kvm.h
new file mode 100644
index 0000000..4d8dcbd
--- /dev/null
+++ b/linux/usr/include/asm-x86/kvm.h
@@ -0,0 +1,324 @@
+#ifndef _ASM_X86_KVM_H
+#define _ASM_X86_KVM_H
+
+/*
+ * KVM x86 specific structures and definitions
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+/* Select x86 specific features in <linux/kvm.h> */
+#define __KVM_HAVE_PIT
+#define __KVM_HAVE_IOAPIC
+#define __KVM_HAVE_DEVICE_ASSIGNMENT
+#define __KVM_HAVE_MSI
+#define __KVM_HAVE_USER_NMI
+#define __KVM_HAVE_GUEST_DEBUG
+#define __KVM_HAVE_MSIX
+#define __KVM_HAVE_MCE
+#define __KVM_HAVE_PIT_STATE2
+#define __KVM_HAVE_XEN_HVM
+#define __KVM_HAVE_VCPU_EVENTS
+#define __KVM_HAVE_DEBUGREGS
+#define __KVM_HAVE_XSAVE
+#define __KVM_HAVE_XCRS
+
+/* Architectural interrupt line count. */
+#define KVM_NR_INTERRUPTS 256
+
+struct kvm_memory_alias {
+ __u32 slot; /* this has a different namespace than memory slots */
+ __u32 flags;
+ __u64 guest_phys_addr;
+ __u64 memory_size;
+ __u64 target_phys_addr;
+};
+
+/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
+struct kvm_pic_state {
+ __u8 last_irr; /* edge detection */
+ __u8 irr; /* interrupt request register */
+ __u8 imr; /* interrupt mask register */
+ __u8 isr; /* interrupt service register */
+ __u8 priority_add; /* highest irq priority */
+ __u8 irq_base;
+ __u8 read_reg_select;
+ __u8 poll;
+ __u8 special_mask;
+ __u8 init_state;
+ __u8 auto_eoi;
+ __u8 rotate_on_auto_eoi;
+ __u8 special_fully_nested_mode;
+ __u8 init4; /* true if 4 byte init */
+ __u8 elcr; /* PIIX edge/trigger selection */
+ __u8 elcr_mask;
+};
+
+#define KVM_IOAPIC_NUM_PINS 24
+struct kvm_ioapic_state {
+ __u64 base_address;
+ __u32 ioregsel;
+ __u32 id;
+ __u32 irr;
+ __u32 pad;
+ union {
+ __u64 bits;
+ struct {
+ __u8 vector;
+ __u8 delivery_mode:3;
+ __u8 dest_mode:1;
+ __u8 delivery_status:1;
+ __u8 polarity:1;
+ __u8 remote_irr:1;
+ __u8 trig_mode:1;
+ __u8 mask:1;
+ __u8 reserve:7;
+ __u8 reserved[4];
+ __u8 dest_id;
+ } fields;
+ } redirtbl[KVM_IOAPIC_NUM_PINS];
+};
+
+#define KVM_IRQCHIP_PIC_MASTER 0
+#define KVM_IRQCHIP_PIC_SLAVE 1
+#define KVM_IRQCHIP_IOAPIC 2
+#define KVM_NR_IRQCHIPS 3
+
+/* for KVM_GET_REGS and KVM_SET_REGS */
+struct kvm_regs {
+ /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
+ __u64 rax, rbx, rcx, rdx;
+ __u64 rsi, rdi, rsp, rbp;
+ __u64 r8, r9, r10, r11;
+ __u64 r12, r13, r14, r15;
+ __u64 rip, rflags;
+};
+
+/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
+#define KVM_APIC_REG_SIZE 0x400
+struct kvm_lapic_state {
+ char regs[KVM_APIC_REG_SIZE];
+};
+
+struct kvm_segment {
+ __u64 base;
+ __u32 limit;
+ __u16 selector;
+ __u8 type;
+ __u8 present, dpl, db, s, l, g, avl;
+ __u8 unusable;
+ __u8 padding;
+};
+
+struct kvm_dtable {
+ __u64 base;
+ __u16 limit;
+ __u16 padding[3];
+};
+
+
+/* for KVM_GET_SREGS and KVM_SET_SREGS */
+struct kvm_sregs {
+ /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
+ struct kvm_segment cs, ds, es, fs, gs, ss;
+ struct kvm_segment tr, ldt;
+ struct kvm_dtable gdt, idt;
+ __u64 cr0, cr2, cr3, cr4, cr8;
+ __u64 efer;
+ __u64 apic_base;
+ __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
+};
+
+/* for KVM_GET_FPU and KVM_SET_FPU */
+struct kvm_fpu {
+ __u8 fpr[8][16];
+ __u16 fcw;
+ __u16 fsw;
+ __u8 ftwx; /* in fxsave format */
+ __u8 pad1;
+ __u16 last_opcode;
+ __u64 last_ip;
+ __u64 last_dp;
+ __u8 xmm[16][16];
+ __u32 mxcsr;
+ __u32 pad2;
+};
+
+struct kvm_msr_entry {
+ __u32 index;
+ __u32 reserved;
+ __u64 data;
+};
+
+/* for KVM_GET_MSRS and KVM_SET_MSRS */
+struct kvm_msrs {
+ __u32 nmsrs; /* number of msrs in entries */
+ __u32 pad;
+
+ struct kvm_msr_entry entries[0];
+};
+
+/* for KVM_GET_MSR_INDEX_LIST */
+struct kvm_msr_list {
+ __u32 nmsrs; /* number of msrs in entries */
+ __u32 indices[0];
+};
+
+
+struct kvm_cpuid_entry {
+ __u32 function;
+ __u32 eax;
+ __u32 ebx;
+ __u32 ecx;
+ __u32 edx;
+ __u32 padding;
+};
+
+/* for KVM_SET_CPUID */
+struct kvm_cpuid {
+ __u32 nent;
+ __u32 padding;
+ struct kvm_cpuid_entry entries[0];
+};
+
+struct kvm_cpuid_entry2 {
+ __u32 function;
+ __u32 index;
+ __u32 flags;
+ __u32 eax;
+ __u32 ebx;
+ __u32 ecx;
+ __u32 edx;
+ __u32 padding[3];
+};
+
+#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
+#define KVM_CPUID_FLAG_STATEFUL_FUNC 2
+#define KVM_CPUID_FLAG_STATE_READ_NEXT 4
+
+/* for KVM_SET_CPUID2 */
+struct kvm_cpuid2 {
+ __u32 nent;
+ __u32 padding;
+ struct kvm_cpuid_entry2 entries[0];
+};
+
+/* for KVM_GET_PIT and KVM_SET_PIT */
+struct kvm_pit_channel_state {
+ __u32 count; /* can be 65536 */
+ __u16 latched_count;
+ __u8 count_latched;
+ __u8 status_latched;
+ __u8 status;
+ __u8 read_state;
+ __u8 write_state;
+ __u8 write_latch;
+ __u8 rw_mode;
+ __u8 mode;
+ __u8 bcd;
+ __u8 gate;
+ __s64 count_load_time;
+};
+
+struct kvm_debug_exit_arch {
+ __u32 exception;
+ __u32 pad;
+ __u64 pc;
+ __u64 dr6;
+ __u64 dr7;
+};
+
+#define KVM_GUESTDBG_USE_SW_BP 0x00010000
+#define KVM_GUESTDBG_USE_HW_BP 0x00020000
+#define KVM_GUESTDBG_INJECT_DB 0x00040000
+#define KVM_GUESTDBG_INJECT_BP 0x00080000
+
+/* for KVM_SET_GUEST_DEBUG */
+struct kvm_guest_debug_arch {
+ __u64 debugreg[8];
+};
+
+struct kvm_pit_state {
+ struct kvm_pit_channel_state channels[3];
+};
+
+#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001
+
+struct kvm_pit_state2 {
+ struct kvm_pit_channel_state channels[3];
+ __u32 flags;
+ __u32 reserved[9];
+};
+
+struct kvm_reinject_control {
+ __u8 pit_reinject;
+ __u8 reserved[31];
+};
+
+/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */
+#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001
+#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002
+#define KVM_VCPUEVENT_VALID_SHADOW 0x00000004
+
+/* Interrupt shadow states */
+#define KVM_X86_SHADOW_INT_MOV_SS 0x01
+#define KVM_X86_SHADOW_INT_STI 0x02
+
+/* for KVM_GET/SET_VCPU_EVENTS */
+struct kvm_vcpu_events {
+ struct {
+ __u8 injected;
+ __u8 nr;
+ __u8 has_error_code;
+ __u8 pad;
+ __u32 error_code;
+ } exception;
+ struct {
+ __u8 injected;
+ __u8 nr;
+ __u8 soft;
+ __u8 shadow;
+ } interrupt;
+ struct {
+ __u8 injected;
+ __u8 pending;
+ __u8 masked;
+ __u8 pad;
+ } nmi;
+ __u32 sipi_vector;
+ __u32 flags;
+ __u32 reserved[10];
+};
+
+/* for KVM_GET/SET_DEBUGREGS */
+struct kvm_debugregs {
+ __u64 db[4];
+ __u64 dr6;
+ __u64 dr7;
+ __u64 flags;
+ __u64 reserved[9];
+};
+
+/* for KVM_CAP_XSAVE */
+struct kvm_xsave {
+ __u32 region[1024];
+};
+
+#define KVM_MAX_XCRS 16
+
+struct kvm_xcr {
+ __u32 xcr;
+ __u32 reserved;
+ __u64 value;
+};
+
+struct kvm_xcrs {
+ __u32 nr_xcrs;
+ __u32 flags;
+ struct kvm_xcr xcrs[KVM_MAX_XCRS];
+ __u64 padding[16];
+};
+
+#endif /* _ASM_X86_KVM_H */
diff --git a/linux/usr/include/asm-x86/kvm_para.h b/linux/usr/include/asm-x86/kvm_para.h
new file mode 100644
index 0000000..834d71e
--- /dev/null
+++ b/linux/usr/include/asm-x86/kvm_para.h
@@ -0,0 +1,79 @@
+#ifndef _ASM_X86_KVM_PARA_H
+#define _ASM_X86_KVM_PARA_H
+
+#include <linux/types.h>
+#include <asm/hyperv.h>
+
+/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
+ * should be used to determine that a VM is running under KVM.
+ */
+#define KVM_CPUID_SIGNATURE 0x40000000
+
+/* This CPUID returns a feature bitmap in eax. Before enabling a particular
+ * paravirtualization, the appropriate feature bit should be checked.
+ */
+#define KVM_CPUID_FEATURES 0x40000001
+#define KVM_FEATURE_CLOCKSOURCE 0
+#define KVM_FEATURE_NOP_IO_DELAY 1
+#define KVM_FEATURE_MMU_OP 2
+/* This indicates that the new set of kvmclock msrs
+ * are available. The use of 0x11 and 0x12 is deprecated
+ */
+#define KVM_FEATURE_CLOCKSOURCE2 3
+#define KVM_FEATURE_ASYNC_PF 4
+
+/* The last 8 bits are used to indicate how to interpret the flags field
+ * in pvclock structure. If no bits are set, all flags are ignored.
+ */
+#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24
+
+#define MSR_KVM_WALL_CLOCK 0x11
+#define MSR_KVM_SYSTEM_TIME 0x12
+
+/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
+#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
+#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
+#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
+
+#define KVM_MAX_MMU_OP_BATCH 32
+
+#define KVM_ASYNC_PF_ENABLED (1 << 0)
+#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1)
+
+/* Operations for KVM_HC_MMU_OP */
+#define KVM_MMU_OP_WRITE_PTE 1
+#define KVM_MMU_OP_FLUSH_TLB 2
+#define KVM_MMU_OP_RELEASE_PT 3
+
+/* Payload for KVM_HC_MMU_OP */
+struct kvm_mmu_op_header {
+ __u32 op;
+ __u32 pad;
+};
+
+struct kvm_mmu_op_write_pte {
+ struct kvm_mmu_op_header header;
+ __u64 pte_phys;
+ __u64 pte_val;
+};
+
+struct kvm_mmu_op_flush_tlb {
+ struct kvm_mmu_op_header header;
+};
+
+struct kvm_mmu_op_release_pt {
+ struct kvm_mmu_op_header header;
+ __u64 pt_phys;
+};
+
+#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
+#define KVM_PV_REASON_PAGE_READY 2
+
+struct kvm_vcpu_pv_apf_data {
+ __u32 reason;
+ __u8 pad[60];
+ __u32 enabled;
+};
+
+
+#endif /* _ASM_X86_KVM_PARA_H */
diff --git a/linux/usr/include/linux/kvm.h b/linux/usr/include/linux/kvm.h
new file mode 100644
index 0000000..27efcfb
--- /dev/null
+++ b/linux/usr/include/linux/kvm.h
@@ -0,0 +1,798 @@
+#ifndef __LINUX_KVM_H
+#define __LINUX_KVM_H
+
+/*
+ * Userspace interface for /dev/kvm - kernel based virtual machine
+ *
+ * Note: you must update KVM_API_VERSION if you change this interface.
+ */
+
+#include <linux/types.h>
+
+#include <linux/ioctl.h>
+#include <asm/kvm.h>
+
+#define KVM_API_VERSION 12
+
+/* *** Deprecated interfaces *** */
+
+#define KVM_TRC_SHIFT 16
+
+#define KVM_TRC_ENTRYEXIT (1 << KVM_TRC_SHIFT)
+#define KVM_TRC_HANDLER (1 << (KVM_TRC_SHIFT + 1))
+
+#define KVM_TRC_VMENTRY (KVM_TRC_ENTRYEXIT + 0x01)
+#define KVM_TRC_VMEXIT (KVM_TRC_ENTRYEXIT + 0x02)
+#define KVM_TRC_PAGE_FAULT (KVM_TRC_HANDLER + 0x01)
+
+#define KVM_TRC_HEAD_SIZE 12
+#define KVM_TRC_CYCLE_SIZE 8
+#define KVM_TRC_EXTRA_MAX 7
+
+#define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02)
+#define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03)
+#define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04)
+#define KVM_TRC_IO_READ (KVM_TRC_HANDLER + 0x05)
+#define KVM_TRC_IO_WRITE (KVM_TRC_HANDLER + 0x06)
+#define KVM_TRC_CR_READ (KVM_TRC_HANDLER + 0x07)
+#define KVM_TRC_CR_WRITE (KVM_TRC_HANDLER + 0x08)
+#define KVM_TRC_DR_READ (KVM_TRC_HANDLER + 0x09)
+#define KVM_TRC_DR_WRITE (KVM_TRC_HANDLER + 0x0A)
+#define KVM_TRC_MSR_READ (KVM_TRC_HANDLER + 0x0B)
+#define KVM_TRC_MSR_WRITE (KVM_TRC_HANDLER + 0x0C)
+#define KVM_TRC_CPUID (KVM_TRC_HANDLER + 0x0D)
+#define KVM_TRC_INTR (KVM_TRC_HANDLER + 0x0E)
+#define KVM_TRC_NMI (KVM_TRC_HANDLER + 0x0F)
+#define KVM_TRC_VMMCALL (KVM_TRC_HANDLER + 0x10)
+#define KVM_TRC_HLT (KVM_TRC_HANDLER + 0x11)
+#define KVM_TRC_CLTS (KVM_TRC_HANDLER + 0x12)
+#define KVM_TRC_LMSW (KVM_TRC_HANDLER + 0x13)
+#define KVM_TRC_APIC_ACCESS (KVM_TRC_HANDLER + 0x14)
+#define KVM_TRC_TDP_FAULT (KVM_TRC_HANDLER + 0x15)
+#define KVM_TRC_GTLB_WRITE (KVM_TRC_HANDLER + 0x16)
+#define KVM_TRC_STLB_WRITE (KVM_TRC_HANDLER + 0x17)
+#define KVM_TRC_STLB_INVAL (KVM_TRC_HANDLER + 0x18)
+#define KVM_TRC_PPC_INSTR (KVM_TRC_HANDLER + 0x19)
+
+struct kvm_user_trace_setup {
+ __u32 buf_size;
+ __u32 buf_nr;
+};
+
+#define __KVM_DEPRECATED_MAIN_W_0x06 \
+ _IOW(KVMIO, 0x06, struct kvm_user_trace_setup)
+#define __KVM_DEPRECATED_MAIN_0x07 _IO(KVMIO, 0x07)
+#define __KVM_DEPRECATED_MAIN_0x08 _IO(KVMIO, 0x08)
+
+#define __KVM_DEPRECATED_VM_R_0x70 _IOR(KVMIO, 0x70, struct kvm_assigned_irq)
+
+struct kvm_breakpoint {
+ __u32 enabled;
+ __u32 padding;
+ __u64 address;
+};
+
+struct kvm_debug_guest {
+ __u32 enabled;
+ __u32 pad;
+ struct kvm_breakpoint breakpoints[4];
+ __u32 singlestep;
+};
+
+#define __KVM_DEPRECATED_VCPU_W_0x87 _IOW(KVMIO, 0x87, struct kvm_debug_guest)
+
+/* *** End of deprecated interfaces *** */
+
+
+/* for KVM_CREATE_MEMORY_REGION */
+struct kvm_memory_region {
+ __u32 slot;
+ __u32 flags;
+ __u64 guest_phys_addr;
+ __u64 memory_size; /* bytes */
+};
+
+/* for KVM_SET_USER_MEMORY_REGION */
+struct kvm_userspace_memory_region {
+ __u32 slot;
+ __u32 flags;
+ __u64 guest_phys_addr;
+ __u64 memory_size; /* bytes */
+ __u64 userspace_addr; /* start of the userspace allocated memory */
+};
+
+/* for kvm_memory_region::flags */
+#define KVM_MEM_LOG_DIRTY_PAGES 1UL
+#define KVM_MEMSLOT_INVALID (1UL << 1)
+
+/* for KVM_IRQ_LINE */
+struct kvm_irq_level {
+ /*
+ * ACPI gsi notion of irq.
+ * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47..
+ * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23..
+ */
+ union {
+ __u32 irq;
+ __s32 status;
+ };
+ __u32 level;
+};
+
+
+struct kvm_irqchip {
+ __u32 chip_id;
+ __u32 pad;
+ union {
+ char dummy[512]; /* reserving space */
+#ifdef __KVM_HAVE_PIT
+ struct kvm_pic_state pic;
+#endif
+#ifdef __KVM_HAVE_IOAPIC
+ struct kvm_ioapic_state ioapic;
+#endif
+ } chip;
+};
+
+/* for KVM_CREATE_PIT2 */
+struct kvm_pit_config {
+ __u32 flags;
+ __u32 pad[15];
+};
+
+#define KVM_PIT_SPEAKER_DUMMY 1
+
+#define KVM_EXIT_UNKNOWN 0
+#define KVM_EXIT_EXCEPTION 1
+#define KVM_EXIT_IO 2
+#define KVM_EXIT_HYPERCALL 3
+#define KVM_EXIT_DEBUG 4
+#define KVM_EXIT_HLT 5
+#define KVM_EXIT_MMIO 6
+#define KVM_EXIT_IRQ_WINDOW_OPEN 7
+#define KVM_EXIT_SHUTDOWN 8
+#define KVM_EXIT_FAIL_ENTRY 9
+#define KVM_EXIT_INTR 10
+#define KVM_EXIT_SET_TPR 11
+#define KVM_EXIT_TPR_ACCESS 12
+#define KVM_EXIT_S390_SIEIC 13
+#define KVM_EXIT_S390_RESET 14
+#define KVM_EXIT_DCR 15
+#define KVM_EXIT_NMI 16
+#define KVM_EXIT_INTERNAL_ERROR 17
+#define KVM_EXIT_OSI 18
+
+/* For KVM_EXIT_INTERNAL_ERROR */
+#define KVM_INTERNAL_ERROR_EMULATION 1
+#define KVM_INTERNAL_ERROR_SIMUL_EX 2
+
+/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
+struct kvm_run {
+ /* in */
+ __u8 request_interrupt_window;
+ __u8 padding1[7];
+
+ /* out */
+ __u32 exit_reason;
+ __u8 ready_for_interrupt_injection;
+ __u8 if_flag;
+ __u8 padding2[2];
+
+ /* in (pre_kvm_run), out (post_kvm_run) */
+ __u64 cr8;
+ __u64 apic_base;
+
+#ifdef __KVM_S390
+ /* the processor status word for s390 */
+ __u64 psw_mask; /* psw upper half */
+ __u64 psw_addr; /* psw lower half */
+#endif
+ union {
+ /* KVM_EXIT_UNKNOWN */
+ struct {
+ __u64 hardware_exit_reason;
+ } hw;
+ /* KVM_EXIT_FAIL_ENTRY */
+ struct {
+ __u64 hardware_entry_failure_reason;
+ } fail_entry;
+ /* KVM_EXIT_EXCEPTION */
+ struct {
+ __u32 exception;
+ __u32 error_code;
+ } ex;
+ /* KVM_EXIT_IO */
+ struct {
+#define KVM_EXIT_IO_IN 0
+#define KVM_EXIT_IO_OUT 1
+ __u8 direction;
+ __u8 size; /* bytes */
+ __u16 port;
+ __u32 count;
+ __u64 data_offset; /* relative to kvm_run start */
+ } io;
+ struct {
+ struct kvm_debug_exit_arch arch;
+ } debug;
+ /* KVM_EXIT_MMIO */
+ struct {
+ __u64 phys_addr;
+ __u8 data[8];
+ __u32 len;
+ __u8 is_write;
+ } mmio;
+ /* KVM_EXIT_HYPERCALL */
+ struct {
+ __u64 nr;
+ __u64 args[6];
+ __u64 ret;
+ __u32 longmode;
+ __u32 pad;
+ } hypercall;
+ /* KVM_EXIT_TPR_ACCESS */
+ struct {
+ __u64 rip;
+ __u32 is_write;
+ __u32 pad;
+ } tpr_access;
+ /* KVM_EXIT_S390_SIEIC */
+ struct {
+ __u8 icptcode;
+ __u16 ipa;
+ __u32 ipb;
+ } s390_sieic;
+ /* KVM_EXIT_S390_RESET */
+#define KVM_S390_RESET_POR 1
+#define KVM_S390_RESET_CLEAR 2
+#define KVM_S390_RESET_SUBSYSTEM 4
+#define KVM_S390_RESET_CPU_INIT 8
+#define KVM_S390_RESET_IPL 16
+ __u64 s390_reset_flags;
+ /* KVM_EXIT_DCR */
+ struct {
+ __u32 dcrn;
+ __u32 data;
+ __u8 is_write;
+ } dcr;
+ struct {
+ __u32 suberror;
+ /* Available with KVM_CAP_INTERNAL_ERROR_DATA: */
+ __u32 ndata;
+ __u64 data[16];
+ } internal;
+ /* KVM_EXIT_OSI */
+ struct {
+ __u64 gprs[32];
+ } osi;
+ /* Fix the size of the union. */
+ char padding[256];
+ };
+};
+
+/* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */
+
+struct kvm_coalesced_mmio_zone {
+ __u64 addr;
+ __u32 size;
+ __u32 pad;
+};
+
+struct kvm_coalesced_mmio {
+ __u64 phys_addr;
+ __u32 len;
+ __u32 pad;
+ __u8 data[8];
+};
+
+struct kvm_coalesced_mmio_ring {
+ __u32 first, last;
+ struct kvm_coalesced_mmio coalesced_mmio[0];
+};
+
+#define KVM_COALESCED_MMIO_MAX \
+ ((PAGE_SIZE - sizeof(struct kvm_coalesced_mmio_ring)) / \
+ sizeof(struct kvm_coalesced_mmio))
+
+/* for KVM_TRANSLATE */
+struct kvm_translation {
+ /* in */
+ __u64 linear_address;
+
+ /* out */
+ __u64 physical_address;
+ __u8 valid;
+ __u8 writeable;
+ __u8 usermode;
+ __u8 pad[5];
+};
+
+/* for KVM_INTERRUPT */
+struct kvm_interrupt {
+ /* in */
+ __u32 irq;
+};
+
+/* for KVM_GET_DIRTY_LOG */
+struct kvm_dirty_log {
+ __u32 slot;
+ __u32 padding1;
+ union {
+ void *dirty_bitmap; /* one bit per page */
+ __u64 padding2;
+ };
+};
+
+/* for KVM_SET_SIGNAL_MASK */
+struct kvm_signal_mask {
+ __u32 len;
+ __u8 sigset[0];
+};
+
+/* for KVM_TPR_ACCESS_REPORTING */
+struct kvm_tpr_access_ctl {
+ __u32 enabled;
+ __u32 flags;
+ __u32 reserved[8];
+};
+
+/* for KVM_SET_VAPIC_ADDR */
+struct kvm_vapic_addr {
+ __u64 vapic_addr;
+};
+
+/* for KVM_SET_MPSTATE */
+
+#define KVM_MP_STATE_RUNNABLE 0
+#define KVM_MP_STATE_UNINITIALIZED 1
+#define KVM_MP_STATE_INIT_RECEIVED 2
+#define KVM_MP_STATE_HALTED 3
+#define KVM_MP_STATE_SIPI_RECEIVED 4
+
+struct kvm_mp_state {
+ __u32 mp_state;
+};
+
+struct kvm_s390_psw {
+ __u64 mask;
+ __u64 addr;
+};
+
+/* valid values for type in kvm_s390_interrupt */
+#define KVM_S390_SIGP_STOP 0xfffe0000u
+#define KVM_S390_PROGRAM_INT 0xfffe0001u
+#define KVM_S390_SIGP_SET_PREFIX 0xfffe0002u
+#define KVM_S390_RESTART 0xfffe0003u
+#define KVM_S390_INT_VIRTIO 0xffff2603u
+#define KVM_S390_INT_SERVICE 0xffff2401u
+#define KVM_S390_INT_EMERGENCY 0xffff1201u
+
+struct kvm_s390_interrupt {
+ __u32 type;
+ __u32 parm;
+ __u64 parm64;
+};
+
+/* for KVM_SET_GUEST_DEBUG */
+
+#define KVM_GUESTDBG_ENABLE 0x00000001
+#define KVM_GUESTDBG_SINGLESTEP 0x00000002
+
+struct kvm_guest_debug {
+ __u32 control;
+ __u32 pad;
+ struct kvm_guest_debug_arch arch;
+};
+
+enum {
+ kvm_ioeventfd_flag_nr_datamatch,
+ kvm_ioeventfd_flag_nr_pio,
+ kvm_ioeventfd_flag_nr_deassign,
+ kvm_ioeventfd_flag_nr_max,
+};
+
+#define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
+#define KVM_IOEVENTFD_FLAG_PIO (1 << kvm_ioeventfd_flag_nr_pio)
+#define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign)
+
+#define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1)
+
+struct kvm_ioeventfd {
+ __u64 datamatch;
+ __u64 addr; /* legal pio/mmio address */
+ __u32 len; /* 1, 2, 4, or 8 bytes */
+ __s32 fd;
+ __u32 flags;
+ __u8 pad[36];
+};
+
+/* for KVM_ENABLE_CAP */
+struct kvm_enable_cap {
+ /* in */
+ __u32 cap;
+ __u32 flags;
+ __u64 args[4];
+ __u8 pad[64];
+};
+
+/* for KVM_PPC_GET_PVINFO */
+struct kvm_ppc_pvinfo {
+ /* out */
+ __u32 flags;
+ __u32 hcall[4];
+ __u8 pad[108];
+};
+
+#define KVMIO 0xAE
+
+/*
+ * ioctls for /dev/kvm fds:
+ */
+#define KVM_GET_API_VERSION _IO(KVMIO, 0x00)
+#define KVM_CREATE_VM _IO(KVMIO, 0x01) /* returns a VM fd */
+#define KVM_GET_MSR_INDEX_LIST _IOWR(KVMIO, 0x02, struct kvm_msr_list)
+
+#define KVM_S390_ENABLE_SIE _IO(KVMIO, 0x06)
+/*
+ * Check if a kvm extension is available. Argument is extension number,
+ * return is 1 (yes) or 0 (no, sorry).
+ */
+#define KVM_CHECK_EXTENSION _IO(KVMIO, 0x03)
+/*
+ * Get size for mmap(vcpu_fd)
+ */
+#define KVM_GET_VCPU_MMAP_SIZE _IO(KVMIO, 0x04) /* in bytes */
+#define KVM_GET_SUPPORTED_CPUID _IOWR(KVMIO, 0x05, struct kvm_cpuid2)
+#define KVM_TRACE_ENABLE __KVM_DEPRECATED_MAIN_W_0x06
+#define KVM_TRACE_PAUSE __KVM_DEPRECATED_MAIN_0x07
+#define KVM_TRACE_DISABLE __KVM_DEPRECATED_MAIN_0x08
+
+/*
+ * Extension capability list.
+ */
+#define KVM_CAP_IRQCHIP 0
+#define KVM_CAP_HLT 1
+#define KVM_CAP_MMU_SHADOW_CACHE_CONTROL 2
+#define KVM_CAP_USER_MEMORY 3
+#define KVM_CAP_SET_TSS_ADDR 4
+#define KVM_CAP_VAPIC 6
+#define KVM_CAP_EXT_CPUID 7
+#define KVM_CAP_CLOCKSOURCE 8
+#define KVM_CAP_NR_VCPUS 9 /* returns max vcpus per vm */
+#define KVM_CAP_NR_MEMSLOTS 10 /* returns max memory slots per vm */
+#define KVM_CAP_PIT 11
+#define KVM_CAP_NOP_IO_DELAY 12
+#define KVM_CAP_PV_MMU 13
+#define KVM_CAP_MP_STATE 14
+#define KVM_CAP_COALESCED_MMIO 15
+#define KVM_CAP_SYNC_MMU 16 /* Changes to host mmap are reflected in guest */
+#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT
+#define KVM_CAP_DEVICE_ASSIGNMENT 17
+#endif
+#define KVM_CAP_IOMMU 18
+#ifdef __KVM_HAVE_MSI
+#define KVM_CAP_DEVICE_MSI 20
+#endif
+/* Bug in KVM_SET_USER_MEMORY_REGION fixed: */
+#define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21
+#ifdef __KVM_HAVE_USER_NMI
+#define KVM_CAP_USER_NMI 22
+#endif
+#ifdef __KVM_HAVE_GUEST_DEBUG
+#define KVM_CAP_SET_GUEST_DEBUG 23
+#endif
+#ifdef __KVM_HAVE_PIT
+#define KVM_CAP_REINJECT_CONTROL 24
+#endif
+#ifdef __KVM_HAVE_IOAPIC
+#define KVM_CAP_IRQ_ROUTING 25
+#endif
+#define KVM_CAP_IRQ_INJECT_STATUS 26
+#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT
+#define KVM_CAP_DEVICE_DEASSIGNMENT 27
+#endif
+#ifdef __KVM_HAVE_MSIX
+#define KVM_CAP_DEVICE_MSIX 28
+#endif
+#define KVM_CAP_ASSIGN_DEV_IRQ 29
+/* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */
+#define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30
+#ifdef __KVM_HAVE_MCE
+#define KVM_CAP_MCE 31
+#endif
+#define KVM_CAP_IRQFD 32
+#ifdef __KVM_HAVE_PIT
+#define KVM_CAP_PIT2 33
+#endif
+#define KVM_CAP_SET_BOOT_CPU_ID 34
+#ifdef __KVM_HAVE_PIT_STATE2
+#define KVM_CAP_PIT_STATE2 35
+#endif
+#define KVM_CAP_IOEVENTFD 36
+#define KVM_CAP_SET_IDENTITY_MAP_ADDR 37
+#ifdef __KVM_HAVE_XEN_HVM
+#define KVM_CAP_XEN_HVM 38
+#endif
+#define KVM_CAP_ADJUST_CLOCK 39
+#define KVM_CAP_INTERNAL_ERROR_DATA 40
+#ifdef __KVM_HAVE_VCPU_EVENTS
+#define KVM_CAP_VCPU_EVENTS 41
+#endif
+#define KVM_CAP_S390_PSW 42
+#define KVM_CAP_PPC_SEGSTATE 43
+#define KVM_CAP_HYPERV 44
+#define KVM_CAP_HYPERV_VAPIC 45
+#define KVM_CAP_HYPERV_SPIN 46
+#define KVM_CAP_PCI_SEGMENT 47
+#define KVM_CAP_PPC_PAIRED_SINGLES 48
+#define KVM_CAP_INTR_SHADOW 49
+#ifdef __KVM_HAVE_DEBUGREGS
+#define KVM_CAP_DEBUGREGS 50
+#endif
+#define KVM_CAP_X86_ROBUST_SINGLESTEP 51
+#define KVM_CAP_PPC_OSI 52
+#define KVM_CAP_PPC_UNSET_IRQ 53
+#define KVM_CAP_ENABLE_CAP 54
+#ifdef __KVM_HAVE_XSAVE
+#define KVM_CAP_XSAVE 55
+#endif
+#ifdef __KVM_HAVE_XCRS
+#define KVM_CAP_XCRS 56
+#endif
+#define KVM_CAP_PPC_GET_PVINFO 57
+#define KVM_CAP_PPC_IRQ_LEVEL 58
+#define KVM_CAP_ASYNC_PF 59
+
+#ifdef KVM_CAP_IRQ_ROUTING
+
+struct kvm_irq_routing_irqchip {
+ __u32 irqchip;
+ __u32 pin;
+};
+
+struct kvm_irq_routing_msi {
+ __u32 address_lo;
+ __u32 address_hi;
+ __u32 data;
+ __u32 pad;
+};
+
+/* gsi routing entry types */
+#define KVM_IRQ_ROUTING_IRQCHIP 1
+#define KVM_IRQ_ROUTING_MSI 2
+
+struct kvm_irq_routing_entry {
+ __u32 gsi;
+ __u32 type;
+ __u32 flags;
+ __u32 pad;
+ union {
+ struct kvm_irq_routing_irqchip irqchip;
+ struct kvm_irq_routing_msi msi;
+ __u32 pad[8];
+ } u;
+};
+
+struct kvm_irq_routing {
+ __u32 nr;
+ __u32 flags;
+ struct kvm_irq_routing_entry entries[0];
+};
+
+#endif
+
+#ifdef KVM_CAP_MCE
+/* x86 MCE */
+struct kvm_x86_mce {
+ __u64 status;
+ __u64 addr;
+ __u64 misc;
+ __u64 mcg_status;
+ __u8 bank;
+ __u8 pad1[7];
+ __u64 pad2[3];
+};
+#endif
+
+#ifdef KVM_CAP_XEN_HVM
+struct kvm_xen_hvm_config {
+ __u32 flags;
+ __u32 msr;
+ __u64 blob_addr_32;
+ __u64 blob_addr_64;
+ __u8 blob_size_32;
+ __u8 blob_size_64;
+ __u8 pad2[30];
+};
+#endif
+
+#define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
+
+struct kvm_irqfd {
+ __u32 fd;
+ __u32 gsi;
+ __u32 flags;
+ __u8 pad[20];
+};
+
+struct kvm_clock_data {
+ __u64 clock;
+ __u32 flags;
+ __u32 pad[9];
+};
+
+/*
+ * ioctls for VM fds
+ */
+#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region)
+/*
+ * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns
+ * a vcpu fd.
+ */
+#define KVM_CREATE_VCPU _IO(KVMIO, 0x41)
+#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log)
+/* KVM_SET_MEMORY_ALIAS is obsolete: */
+#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias)
+#define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44)
+#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45)
+#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46, \
+ struct kvm_userspace_memory_region)
+#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47)
+#define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64)
+/* Device model IOC */
+#define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60)
+#define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level)
+#define KVM_GET_IRQCHIP _IOWR(KVMIO, 0x62, struct kvm_irqchip)
+#define KVM_SET_IRQCHIP _IOR(KVMIO, 0x63, struct kvm_irqchip)
+#define KVM_CREATE_PIT _IO(KVMIO, 0x64)
+#define KVM_GET_PIT _IOWR(KVMIO, 0x65, struct kvm_pit_state)
+#define KVM_SET_PIT _IOR(KVMIO, 0x66, struct kvm_pit_state)
+#define KVM_IRQ_LINE_STATUS _IOWR(KVMIO, 0x67, struct kvm_irq_level)
+#define KVM_REGISTER_COALESCED_MMIO \
+ _IOW(KVMIO, 0x67, struct kvm_coalesced_mmio_zone)
+#define KVM_UNREGISTER_COALESCED_MMIO \
+ _IOW(KVMIO, 0x68, struct kvm_coalesced_mmio_zone)
+#define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \
+ struct kvm_assigned_pci_dev)
+#define KVM_SET_GSI_ROUTING _IOW(KVMIO, 0x6a, struct kvm_irq_routing)
+/* deprecated, replaced by KVM_ASSIGN_DEV_IRQ */
+#define KVM_ASSIGN_IRQ __KVM_DEPRECATED_VM_R_0x70
+#define KVM_ASSIGN_DEV_IRQ _IOW(KVMIO, 0x70, struct kvm_assigned_irq)
+#define KVM_REINJECT_CONTROL _IO(KVMIO, 0x71)
+#define KVM_DEASSIGN_PCI_DEVICE _IOW(KVMIO, 0x72, \
+ struct kvm_assigned_pci_dev)
+#define KVM_ASSIGN_SET_MSIX_NR _IOW(KVMIO, 0x73, \
+ struct kvm_assigned_msix_nr)
+#define KVM_ASSIGN_SET_MSIX_ENTRY _IOW(KVMIO, 0x74, \
+ struct kvm_assigned_msix_entry)
+#define KVM_DEASSIGN_DEV_IRQ _IOW(KVMIO, 0x75, struct kvm_assigned_irq)
+#define KVM_IRQFD _IOW(KVMIO, 0x76, struct kvm_irqfd)
+#define KVM_CREATE_PIT2 _IOW(KVMIO, 0x77, struct kvm_pit_config)
+#define KVM_SET_BOOT_CPU_ID _IO(KVMIO, 0x78)
+#define KVM_IOEVENTFD _IOW(KVMIO, 0x79, struct kvm_ioeventfd)
+#define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config)
+#define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data)
+#define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data)
+/* Available with KVM_CAP_PIT_STATE2 */
+#define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2)
+#define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2)
+/* Available with KVM_CAP_PPC_GET_PVINFO */
+#define KVM_PPC_GET_PVINFO _IOW(KVMIO, 0xa1, struct kvm_ppc_pvinfo)
+
+/*
+ * ioctls for vcpu fds
+ */
+#define KVM_RUN _IO(KVMIO, 0x80)
+#define KVM_GET_REGS _IOR(KVMIO, 0x81, struct kvm_regs)
+#define KVM_SET_REGS _IOW(KVMIO, 0x82, struct kvm_regs)
+#define KVM_GET_SREGS _IOR(KVMIO, 0x83, struct kvm_sregs)
+#define KVM_SET_SREGS _IOW(KVMIO, 0x84, struct kvm_sregs)
+#define KVM_TRANSLATE _IOWR(KVMIO, 0x85, struct kvm_translation)
+#define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt)
+/* KVM_DEBUG_GUEST is no longer supported, use KVM_SET_GUEST_DEBUG instead */
+#define KVM_DEBUG_GUEST __KVM_DEPRECATED_VCPU_W_0x87
+#define KVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs)
+#define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs)
+#define KVM_SET_CPUID _IOW(KVMIO, 0x8a, struct kvm_cpuid)
+#define KVM_SET_SIGNAL_MASK _IOW(KVMIO, 0x8b, struct kvm_signal_mask)
+#define KVM_GET_FPU _IOR(KVMIO, 0x8c, struct kvm_fpu)
+#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu)
+#define KVM_GET_LAPIC _IOR(KVMIO, 0x8e, struct kvm_lapic_state)
+#define KVM_SET_LAPIC _IOW(KVMIO, 0x8f, struct kvm_lapic_state)
+#define KVM_SET_CPUID2 _IOW(KVMIO, 0x90, struct kvm_cpuid2)
+#define KVM_GET_CPUID2 _IOWR(KVMIO, 0x91, struct kvm_cpuid2)
+/* Available with KVM_CAP_VAPIC */
+#define KVM_TPR_ACCESS_REPORTING _IOWR(KVMIO, 0x92, struct kvm_tpr_access_ctl)
+/* Available with KVM_CAP_VAPIC */
+#define KVM_SET_VAPIC_ADDR _IOW(KVMIO, 0x93, struct kvm_vapic_addr)
+/* valid for virtual machine (for floating interrupt)_and_ vcpu */
+#define KVM_S390_INTERRUPT _IOW(KVMIO, 0x94, struct kvm_s390_interrupt)
+/* store status for s390 */
+#define KVM_S390_STORE_STATUS_NOADDR (-1ul)
+#define KVM_S390_STORE_STATUS_PREFIXED (-2ul)
+#define KVM_S390_STORE_STATUS _IOW(KVMIO, 0x95, unsigned long)
+/* initial ipl psw for s390 */
+#define KVM_S390_SET_INITIAL_PSW _IOW(KVMIO, 0x96, struct kvm_s390_psw)
+/* initial reset for s390 */
+#define KVM_S390_INITIAL_RESET _IO(KVMIO, 0x97)
+#define KVM_GET_MP_STATE _IOR(KVMIO, 0x98, struct kvm_mp_state)
+#define KVM_SET_MP_STATE _IOW(KVMIO, 0x99, struct kvm_mp_state)
+/* Available with KVM_CAP_NMI */
+#define KVM_NMI _IO(KVMIO, 0x9a)
+/* Available with KVM_CAP_SET_GUEST_DEBUG */
+#define KVM_SET_GUEST_DEBUG _IOW(KVMIO, 0x9b, struct kvm_guest_debug)
+/* MCE for x86 */
+#define KVM_X86_SETUP_MCE _IOW(KVMIO, 0x9c, __u64)
+#define KVM_X86_GET_MCE_CAP_SUPPORTED _IOR(KVMIO, 0x9d, __u64)
+#define KVM_X86_SET_MCE _IOW(KVMIO, 0x9e, struct kvm_x86_mce)
+/* IA64 stack access */
+#define KVM_IA64_VCPU_GET_STACK _IOR(KVMIO, 0x9a, void *)
+#define KVM_IA64_VCPU_SET_STACK _IOW(KVMIO, 0x9b, void *)
+/* Available with KVM_CAP_VCPU_EVENTS */
+#define KVM_GET_VCPU_EVENTS _IOR(KVMIO, 0x9f, struct kvm_vcpu_events)
+#define KVM_SET_VCPU_EVENTS _IOW(KVMIO, 0xa0, struct kvm_vcpu_events)
+/* Available with KVM_CAP_DEBUGREGS */
+#define KVM_GET_DEBUGREGS _IOR(KVMIO, 0xa1, struct kvm_debugregs)
+#define KVM_SET_DEBUGREGS _IOW(KVMIO, 0xa2, struct kvm_debugregs)
+#define KVM_ENABLE_CAP _IOW(KVMIO, 0xa3, struct kvm_enable_cap)
+/* Available with KVM_CAP_XSAVE */
+#define KVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave)
+#define KVM_SET_XSAVE _IOW(KVMIO, 0xa5, struct kvm_xsave)
+/* Available with KVM_CAP_XCRS */
+#define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs)
+#define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs)
+
+#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
+
+struct kvm_assigned_pci_dev {
+ __u32 assigned_dev_id;
+ __u32 busnr;
+ __u32 devfn;
+ __u32 flags;
+ __u32 segnr;
+ union {
+ __u32 reserved[11];
+ };
+};
+
+#define KVM_DEV_IRQ_HOST_INTX (1 << 0)
+#define KVM_DEV_IRQ_HOST_MSI (1 << 1)
+#define KVM_DEV_IRQ_HOST_MSIX (1 << 2)
+
+#define KVM_DEV_IRQ_GUEST_INTX (1 << 8)
+#define KVM_DEV_IRQ_GUEST_MSI (1 << 9)
+#define KVM_DEV_IRQ_GUEST_MSIX (1 << 10)
+
+#define KVM_DEV_IRQ_HOST_MASK 0x00ff
+#define KVM_DEV_IRQ_GUEST_MASK 0xff00
+
+struct kvm_assigned_irq {
+ __u32 assigned_dev_id;
+ __u32 host_irq;
+ __u32 guest_irq;
+ __u32 flags;
+ union {
+ struct {
+ __u32 addr_lo;
+ __u32 addr_hi;
+ __u32 data;
+ } guest_msi;
+ __u32 reserved[12];
+ };
+};
+
+
+struct kvm_assigned_msix_nr {
+ __u32 assigned_dev_id;
+ __u16 entry_nr;
+ __u16 padding;
+};
+
+#define KVM_MAX_MSIX_PER_DEV 256
+struct kvm_assigned_msix_entry {
+ __u32 assigned_dev_id;
+ __u32 gsi;
+ __u16 entry; /* The index of entry in the MSI-X table */
+ __u16 padding[3];
+};
+
+#endif /* __LINUX_KVM_H */
diff --git a/linux/usr/include/linux/kvm_para.h b/linux/usr/include/linux/kvm_para.h
new file mode 100644
index 0000000..b315e27
--- /dev/null
+++ b/linux/usr/include/linux/kvm_para.h
@@ -0,0 +1,29 @@
+#ifndef __LINUX_KVM_PARA_H
+#define __LINUX_KVM_PARA_H
+
+/*
+ * This header file provides a method for making a hypercall to the host
+ * Architectures should define:
+ * - kvm_hypercall0, kvm_hypercall1...
+ * - kvm_arch_para_features
+ * - kvm_para_available
+ */
+
+/* Return values for hypercalls */
+#define KVM_ENOSYS 1000
+#define KVM_EFAULT EFAULT
+#define KVM_E2BIG E2BIG
+#define KVM_EPERM EPERM
+
+#define KVM_HC_VAPIC_POLL_IRQ 1
+#define KVM_HC_MMU_OP 2
+#define KVM_HC_FEATURES 3
+#define KVM_HC_PPC_MAP_MAGIC_PAGE 4
+
+/*
+ * hypercalls use architecture specific
+ */
+#include <asm/kvm_para.h>
+
+#endif /* __LINUX_KVM_PARA_H */
+
diff --git a/linux/x86/assigned-dev.c b/linux/x86/assigned-dev.c
new file mode 100644
index 0000000..86dac05
--- /dev/null
+++ b/linux/x86/assigned-dev.c
@@ -0,0 +1,827 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+/*
+ * Kernel-based Virtual Machine - device assignment support
+ *
+ * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include "irq.h"
+
+static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
+ int assigned_dev_id)
+{
+ struct list_head *ptr;
+ struct kvm_assigned_dev_kernel *match;
+
+ list_for_each(ptr, head) {
+ match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
+ if (match->assigned_dev_id == assigned_dev_id)
+ return match;
+ }
+ return NULL;
+}
+
+static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
+ *assigned_dev, int irq)
+{
+ int i, index;
+ struct msix_entry *host_msix_entries;
+
+ host_msix_entries = assigned_dev->host_msix_entries;
+
+ index = -1;
+ for (i = 0; i < assigned_dev->entries_nr; i++)
+ if (irq == host_msix_entries[i].vector) {
+ index = i;
+ break;
+ }
+ if (index < 0) {
+ printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
+ return 0;
+ }
+
+ return index;
+}
+
+static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
+{
+ struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+ u32 vector;
+ int index;
+
+ if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
+ spin_lock(&assigned_dev->intx_lock);
+ disable_irq_nosync(irq);
+ assigned_dev->host_irq_disabled = true;
+ spin_unlock(&assigned_dev->intx_lock);
+ }
+
+ if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
+ index = find_index_from_host_irq(assigned_dev, irq);
+ if (index >= 0) {
+ vector = assigned_dev->
+ guest_msix_entries[index].vector;
+ kvm_set_irq(assigned_dev->kvm,
+ assigned_dev->irq_source_id, vector, 1);
+ }
+ } else
+ kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+ assigned_dev->guest_irq, 1);
+
+ return IRQ_HANDLED;
+}
+
+/* Ack the irq line for an assigned device */
+static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+ struct kvm_assigned_dev_kernel *dev;
+
+ if (kian->gsi == -1)
+ return;
+
+ dev = container_of(kian, struct kvm_assigned_dev_kernel,
+ ack_notifier);
+
+ kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
+
+ /* The guest irq may be shared so this ack may be
+ * from another device.
+ */
+ spin_lock(&dev->intx_lock);
+ if (dev->host_irq_disabled) {
+ enable_irq(dev->host_irq);
+ dev->host_irq_disabled = false;
+ }
+ spin_unlock(&dev->intx_lock);
+}
+
+static void deassign_guest_irq(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev)
+{
+ kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
+ assigned_dev->ack_notifier.gsi = -1;
+
+ kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+ assigned_dev->guest_irq, 0);
+
+ if (assigned_dev->irq_source_id != -1)
+ kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
+ assigned_dev->irq_source_id = -1;
+ assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
+}
+
+/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */
+static void deassign_host_irq(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev)
+{
+ /*
+ * We disable irq here to prevent further events.
+ *
+ * Notice this maybe result in nested disable if the interrupt type is
+ * INTx, but it's OK for we are going to free it.
+ *
+ * If this function is a part of VM destroy, please ensure that till
+ * now, the kvm state is still legal for probably we also have to wait
+ * on a currently running IRQ handler.
+ */
+ if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
+ int i;
+ for (i = 0; i < assigned_dev->entries_nr; i++)
+ disable_irq(assigned_dev->host_msix_entries[i].vector);
+
+ for (i = 0; i < assigned_dev->entries_nr; i++)
+ free_irq(assigned_dev->host_msix_entries[i].vector,
+ (void *)assigned_dev);
+
+ assigned_dev->entries_nr = 0;
+ kfree(assigned_dev->host_msix_entries);
+ kfree(assigned_dev->guest_msix_entries);
+ pci_disable_msix(assigned_dev->dev);
+ } else {
+ /* Deal with MSI and INTx */
+ disable_irq(assigned_dev->host_irq);
+
+ free_irq(assigned_dev->host_irq, (void *)assigned_dev);
+
+ if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
+ pci_disable_msi(assigned_dev->dev);
+ }
+
+ assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
+}
+
+static int kvm_deassign_irq(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev,
+ unsigned long irq_requested_type)
+{
+ unsigned long guest_irq_type, host_irq_type;
+
+ if (!irqchip_in_kernel(kvm))
+ return -EINVAL;
+ /* no irq assignment to deassign */
+ if (!assigned_dev->irq_requested_type)
+ return -ENXIO;
+
+ host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
+ guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;
+
+ if (host_irq_type)
+ deassign_host_irq(kvm, assigned_dev);
+ if (guest_irq_type)
+ deassign_guest_irq(kvm, assigned_dev);
+
+ return 0;
+}
+
+static void kvm_free_assigned_irq(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev)
+{
+ kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
+}
+
+static void kvm_free_assigned_device(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel
+ *assigned_dev)
+{
+ kvm_free_assigned_irq(kvm, assigned_dev);
+
+ __pci_reset_function(assigned_dev->dev);
+ pci_restore_state(assigned_dev->dev);
+
+ pci_release_regions(assigned_dev->dev);
+ pci_disable_device(assigned_dev->dev);
+ pci_dev_put(assigned_dev->dev);
+
+ list_del(&assigned_dev->list);
+ kfree(assigned_dev);
+}
+
+void kvm_free_all_assigned_devices(struct kvm *kvm)
+{
+ struct list_head *ptr, *ptr2;
+ struct kvm_assigned_dev_kernel *assigned_dev;
+
+ list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
+ assigned_dev = list_entry(ptr,
+ struct kvm_assigned_dev_kernel,
+ list);
+
+ kvm_free_assigned_device(kvm, assigned_dev);
+ }
+}
+
+static int assigned_device_enable_host_intx(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev)
+{
+ dev->host_irq = dev->dev->irq;
+ /* Even though this is PCI, we don't want to use shared
+ * interrupts. Sharing host devices with guest-assigned devices
+ * on the same interrupt line is not a happy situation: there
+ * are going to be long delays in accepting, acking, etc.
+ */
+ if (kvm_request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
+ IRQF_ONESHOT, dev->irq_name, (void *)dev))
+ return -EIO;
+ return 0;
+}
+
+#ifdef __KVM_HAVE_MSI
+static int assigned_device_enable_host_msi(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev)
+{
+ int r;
+
+ if (!dev->dev->msi_enabled) {
+ r = pci_enable_msi(dev->dev);
+ if (r)
+ return r;
+ }
+
+ dev->host_irq = dev->dev->irq;
+ if (kvm_request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
+ 0, dev->irq_name, (void *)dev)) {
+ pci_disable_msi(dev->dev);
+ return -EIO;
+ }
+
+ return 0;
+}
+#endif
+
+#ifdef __KVM_HAVE_MSIX
+static int assigned_device_enable_host_msix(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev)
+{
+ int i, r = -EINVAL;
+
+ /* host_msix_entries and guest_msix_entries should have been
+ * initialized */
+ if (dev->entries_nr == 0)
+ return r;
+
+ r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr);
+ if (r)
+ return r;
+
+ for (i = 0; i < dev->entries_nr; i++) {
+ r = kvm_request_threaded_irq(dev->host_msix_entries[i].vector,
+ NULL, kvm_assigned_dev_thread,
+ 0, dev->irq_name, (void *)dev);
+ if (r)
+ goto err;
+ }
+
+ return 0;
+err:
+ for (i -= 1; i >= 0; i--)
+ free_irq(dev->host_msix_entries[i].vector, (void *)dev);
+ pci_disable_msix(dev->dev);
+ return r;
+}
+
+#endif
+
+static int assigned_device_enable_guest_intx(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev,
+ struct kvm_assigned_irq *irq)
+{
+ dev->guest_irq = irq->guest_irq;
+ dev->ack_notifier.gsi = irq->guest_irq;
+ return 0;
+}
+
+#ifdef __KVM_HAVE_MSI
+static int assigned_device_enable_guest_msi(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev,
+ struct kvm_assigned_irq *irq)
+{
+ dev->guest_irq = irq->guest_irq;
+ dev->ack_notifier.gsi = -1;
+ dev->host_irq_disabled = false;
+ return 0;
+}
+#endif
+
+#ifdef __KVM_HAVE_MSIX
+static int assigned_device_enable_guest_msix(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev,
+ struct kvm_assigned_irq *irq)
+{
+ dev->guest_irq = irq->guest_irq;
+ dev->ack_notifier.gsi = -1;
+ dev->host_irq_disabled = false;
+ return 0;
+}
+#endif
+
+static int assign_host_irq(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev,
+ __u32 host_irq_type)
+{
+ int r = -EEXIST;
+
+ if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
+ return r;
+
+ snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s",
+ pci_name(dev->dev));
+
+ switch (host_irq_type) {
+ case KVM_DEV_IRQ_HOST_INTX:
+ r = assigned_device_enable_host_intx(kvm, dev);
+ break;
+#ifdef __KVM_HAVE_MSI
+ case KVM_DEV_IRQ_HOST_MSI:
+ r = assigned_device_enable_host_msi(kvm, dev);
+ break;
+#endif
+#ifdef __KVM_HAVE_MSIX
+ case KVM_DEV_IRQ_HOST_MSIX:
+ r = assigned_device_enable_host_msix(kvm, dev);
+ break;
+#endif
+ default:
+ r = -EINVAL;
+ }
+
+ if (!r)
+ dev->irq_requested_type |= host_irq_type;
+
+ return r;
+}
+
+static int assign_guest_irq(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev,
+ struct kvm_assigned_irq *irq,
+ unsigned long guest_irq_type)
+{
+ int id;
+ int r = -EEXIST;
+
+ if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
+ return r;
+
+ id = kvm_request_irq_source_id(kvm);
+ if (id < 0)
+ return id;
+
+ dev->irq_source_id = id;
+
+ switch (guest_irq_type) {
+ case KVM_DEV_IRQ_GUEST_INTX:
+ r = assigned_device_enable_guest_intx(kvm, dev, irq);
+ break;
+#ifdef __KVM_HAVE_MSI
+ case KVM_DEV_IRQ_GUEST_MSI:
+ r = assigned_device_enable_guest_msi(kvm, dev, irq);
+ break;
+#endif
+#ifdef __KVM_HAVE_MSIX
+ case KVM_DEV_IRQ_GUEST_MSIX:
+ r = assigned_device_enable_guest_msix(kvm, dev, irq);
+ break;
+#endif
+ default:
+ r = -EINVAL;
+ }
+
+ if (!r) {
+ dev->irq_requested_type |= guest_irq_type;
+ kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
+ } else
+ kvm_free_irq_source_id(kvm, dev->irq_source_id);
+
+ return r;
+}
+
+/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
+static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
+ struct kvm_assigned_irq *assigned_irq)
+{
+ int r = -EINVAL;
+ struct kvm_assigned_dev_kernel *match;
+ unsigned long host_irq_type, guest_irq_type;
+
+ if (!irqchip_in_kernel(kvm))
+ return r;
+
+ mutex_lock(&kvm->lock);
+ r = -ENODEV;
+ match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ assigned_irq->assigned_dev_id);
+ if (!match)
+ goto out;
+
+ host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
+ guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);
+
+ r = -EINVAL;
+ /* can only assign one type at a time */
+ if (hweight_long(host_irq_type) > 1)
+ goto out;
+ if (hweight_long(guest_irq_type) > 1)
+ goto out;
+ if (host_irq_type == 0 && guest_irq_type == 0)
+ goto out;
+
+ r = 0;
+ if (host_irq_type)
+ r = assign_host_irq(kvm, match, host_irq_type);
+ if (r)
+ goto out;
+
+ if (guest_irq_type)
+ r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
+ struct kvm_assigned_irq
+ *assigned_irq)
+{
+ int r = -ENODEV;
+ struct kvm_assigned_dev_kernel *match;
+
+ mutex_lock(&kvm->lock);
+
+ match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ assigned_irq->assigned_dev_id);
+ if (!match)
+ goto out;
+
+ r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
+ struct kvm_assigned_pci_dev *assigned_dev)
+{
+ int r = 0, idx;
+ struct kvm_assigned_dev_kernel *match;
+ struct pci_dev *dev;
+
+ mutex_lock(&kvm->lock);
+ idx = srcu_read_lock(&kvm->srcu);
+
+ match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ assigned_dev->assigned_dev_id);
+ if (match) {
+ /* device already assigned */
+ r = -EEXIST;
+ goto out;
+ }
+
+ match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
+ if (match == NULL) {
+ printk(KERN_INFO "%s: Couldn't allocate memory\n",
+ __func__);
+ r = -ENOMEM;
+ goto out;
+ }
+ dev = pci_get_domain_bus_and_slot(assigned_dev->segnr,
+ assigned_dev->busnr,
+ assigned_dev->devfn);
+ if (!dev) {
+ printk(KERN_INFO "%s: host device not found\n", __func__);
+ r = -EINVAL;
+ goto out_free;
+ }
+ if (pci_enable_device(dev)) {
+ printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
+ r = -EBUSY;
+ goto out_put;
+ }
+ r = pci_request_regions(dev, "kvm_assigned_device");
+ if (r) {
+ printk(KERN_INFO "%s: Could not get access to device regions\n",
+ __func__);
+ goto out_disable;
+ }
+
+ pci_reset_function(dev);
+ pci_save_state(dev);
+
+ match->assigned_dev_id = assigned_dev->assigned_dev_id;
+ match->host_segnr = assigned_dev->segnr;
+ match->host_busnr = assigned_dev->busnr;
+ match->host_devfn = assigned_dev->devfn;
+ match->flags = assigned_dev->flags;
+ match->dev = dev;
+ spin_lock_init(&match->intx_lock);
+ match->irq_source_id = -1;
+ match->kvm = kvm;
+ match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
+
+ list_add(&match->list, &kvm->arch.assigned_dev_head);
+
+ if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
+ if (!kvm->arch.iommu_domain) {
+ r = kvm_iommu_map_guest(kvm);
+ if (r)
+ goto out_list_del;
+ }
+ r = kvm_assign_device(kvm, match);
+ if (r)
+ goto out_list_del;
+ }
+
+out:
+ srcu_read_unlock(&kvm->srcu, idx);
+ mutex_unlock(&kvm->lock);
+ return r;
+out_list_del:
+ pci_restore_state(dev);
+ list_del(&match->list);
+ pci_release_regions(dev);
+out_disable:
+ pci_disable_device(dev);
+out_put:
+ pci_dev_put(dev);
+out_free:
+ kfree(match);
+ srcu_read_unlock(&kvm->srcu, idx);
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
+ struct kvm_assigned_pci_dev *assigned_dev)
+{
+ int r = 0;
+ struct kvm_assigned_dev_kernel *match;
+
+ mutex_lock(&kvm->lock);
+
+ match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ assigned_dev->assigned_dev_id);
+ if (!match) {
+ printk(KERN_INFO "%s: device hasn't been assigned before, "
+ "so cannot be deassigned\n", __func__);
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)
+ kvm_deassign_device(kvm, match);
+
+ kvm_free_assigned_device(kvm, match);
+
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+
+#ifdef __KVM_HAVE_MSIX
+static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
+ struct kvm_assigned_msix_nr *entry_nr)
+{
+ int r = 0;
+ struct kvm_assigned_dev_kernel *adev;
+
+ mutex_lock(&kvm->lock);
+
+ adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ entry_nr->assigned_dev_id);
+ if (!adev) {
+ r = -EINVAL;
+ goto msix_nr_out;
+ }
+
+ if (adev->entries_nr == 0) {
+ adev->entries_nr = entry_nr->entry_nr;
+ if (adev->entries_nr == 0 ||
+ adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) {
+ r = -EINVAL;
+ goto msix_nr_out;
+ }
+
+ adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
+ entry_nr->entry_nr,
+ GFP_KERNEL);
+ if (!adev->host_msix_entries) {
+ r = -ENOMEM;
+ goto msix_nr_out;
+ }
+ adev->guest_msix_entries =
+ kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr,
+ GFP_KERNEL);
+ if (!adev->guest_msix_entries) {
+ kfree(adev->host_msix_entries);
+ r = -ENOMEM;
+ goto msix_nr_out;
+ }
+ } else /* Not allowed set MSI-X number twice */
+ r = -EINVAL;
+msix_nr_out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
+ struct kvm_assigned_msix_entry *entry)
+{
+ int r = 0, i;
+ struct kvm_assigned_dev_kernel *adev;
+
+ mutex_lock(&kvm->lock);
+
+ adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ entry->assigned_dev_id);
+
+ if (!adev) {
+ r = -EINVAL;
+ goto msix_entry_out;
+ }
+
+ for (i = 0; i < adev->entries_nr; i++)
+ if (adev->guest_msix_entries[i].vector == 0 ||
+ adev->guest_msix_entries[i].entry == entry->entry) {
+ adev->guest_msix_entries[i].entry = entry->entry;
+ adev->guest_msix_entries[i].vector = entry->gsi;
+ adev->host_msix_entries[i].entry = entry->entry;
+ break;
+ }
+ if (i == adev->entries_nr) {
+ r = -ENOSPC;
+ goto msix_entry_out;
+ }
+
+msix_entry_out:
+ mutex_unlock(&kvm->lock);
+
+ return r;
+}
+#endif
+
+long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
+ unsigned long arg)
+{
+ void *argp = (void *)arg;
+ int r;
+
+ switch (ioctl) {
+ case KVM_ASSIGN_PCI_DEVICE: {
+ struct kvm_assigned_pci_dev assigned_dev;
+
+ r = -EFAULT;
+ if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+ goto out;
+ r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_ASSIGN_IRQ: {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ case KVM_ASSIGN_DEV_IRQ: {
+ struct kvm_assigned_irq assigned_irq;
+
+ r = -EFAULT;
+ if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
+ goto out;
+ r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_DEASSIGN_DEV_IRQ: {
+ struct kvm_assigned_irq assigned_irq;
+
+ r = -EFAULT;
+ if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
+ goto out;
+ r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_DEASSIGN_PCI_DEVICE: {
+ struct kvm_assigned_pci_dev assigned_dev;
+
+ r = -EFAULT;
+ if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+ goto out;
+ r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
+ if (r)
+ goto out;
+ break;
+ }
+#ifdef KVM_CAP_IRQ_ROUTING
+ case KVM_SET_GSI_ROUTING: {
+ struct kvm_irq_routing routing;
+ struct kvm_irq_routing *urouting;
+ struct kvm_irq_routing_entry *entries;
+
+ r = -EFAULT;
+ if (copy_from_user(&routing, argp, sizeof(routing)))
+ goto out;
+ r = -EINVAL;
+ if (routing.nr >= KVM_MAX_IRQ_ROUTES)
+ goto out;
+ if (routing.flags)
+ goto out;
+ r = -ENOMEM;
+ entries = vmalloc(routing.nr * sizeof(*entries));
+ if (!entries)
+ goto out;
+ r = -EFAULT;
+ urouting = argp;
+ if (copy_from_user(entries, urouting->entries,
+ routing.nr * sizeof(*entries)))
+ goto out_free_irq_routing;
+ r = kvm_set_irq_routing(kvm, entries, routing.nr,
+ routing.flags);
+ out_free_irq_routing:
+ vfree(entries);
+ break;
+ }
+#endif /* KVM_CAP_IRQ_ROUTING */
+#ifdef __KVM_HAVE_MSIX
+ case KVM_ASSIGN_SET_MSIX_NR: {
+ struct kvm_assigned_msix_nr entry_nr;
+ r = -EFAULT;
+ if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
+ goto out;
+ r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_ASSIGN_SET_MSIX_ENTRY: {
+ struct kvm_assigned_msix_entry entry;
+ r = -EFAULT;
+ if (copy_from_user(&entry, argp, sizeof entry))
+ goto out;
+ r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
+ if (r)
+ goto out;
+ break;
+ }
+#endif
+ default:
+ r = -ENOTTY;
+ break;
+ }
+out:
+ return r;
+}
+
diff --git a/linux/x86/coalesced_mmio.c b/linux/x86/coalesced_mmio.c
new file mode 100644
index 0000000..850773e
--- /dev/null
+++ b/linux/x86/coalesced_mmio.c
@@ -0,0 +1,231 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+/*
+ * KVM coalesced MMIO
+ *
+ * Copyright (c) 2008 Bull S.A.S.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
+ *
+ * Author: Laurent Vivier <Laurent.Vivier@bull.net>
+ *
+ */
+
+#include "iodev.h"
+
+#include <linux/kvm_host.h>
+#include <linux/slab.h>
+#include <linux/kvm.h>
+
+#include "coalesced_mmio.h"
+
+static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_coalesced_mmio_dev, dev);
+}
+
+static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,
+ gpa_t addr, int len)
+{
+ struct kvm_coalesced_mmio_zone *zone;
+ struct kvm_coalesced_mmio_ring *ring;
+ unsigned avail;
+ int i;
+
+ /* Are we able to batch it ? */
+
+ /* last is the first free entry
+ * check if we don't meet the first used entry
+ * there is always one unused entry in the buffer
+ */
+ ring = dev->kvm->coalesced_mmio_ring;
+ avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX;
+ if (avail < KVM_MAX_VCPUS) {
+ /* full */
+ return 0;
+ }
+
+ /* is it in a batchable area ? */
+
+ for (i = 0; i < dev->nb_zones; i++) {
+ zone = &dev->zone[i];
+
+ /* (addr,len) is fully included in
+ * (zone->addr, zone->size)
+ */
+
+ if (zone->addr <= addr &&
+ addr + len <= zone->addr + zone->size)
+ return 1;
+ }
+ return 0;
+}
+
+static int coalesced_mmio_write(struct kvm_io_device *this,
+ gpa_t addr, int len, const void *val)
+{
+ struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
+ struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
+ if (!coalesced_mmio_in_range(dev, addr, len))
+ return -EOPNOTSUPP;
+
+ spin_lock(&dev->lock);
+
+ /* copy data in first free entry of the ring */
+
+ ring->coalesced_mmio[ring->last].phys_addr = addr;
+ ring->coalesced_mmio[ring->last].len = len;
+ memcpy(ring->coalesced_mmio[ring->last].data, val, len);
+ smp_wmb();
+ ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX;
+ spin_unlock(&dev->lock);
+ return 0;
+}
+
+static void coalesced_mmio_destructor(struct kvm_io_device *this)
+{
+ struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
+
+ kfree(dev);
+}
+
+static const struct kvm_io_device_ops coalesced_mmio_ops = {
+ .write = coalesced_mmio_write,
+ .destructor = coalesced_mmio_destructor,
+};
+
+int kvm_coalesced_mmio_init(struct kvm *kvm)
+{
+ struct kvm_coalesced_mmio_dev *dev;
+ struct page *page;
+ int ret;
+
+ ret = -ENOMEM;
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ goto out_err;
+ kvm->coalesced_mmio_ring = page_address(page);
+
+ ret = -ENOMEM;
+ dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
+ if (!dev)
+ goto out_free_page;
+ spin_lock_init(&dev->lock);
+ kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
+ dev->kvm = kvm;
+ kvm->coalesced_mmio_dev = dev;
+
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev);
+ mutex_unlock(&kvm->slots_lock);
+ if (ret < 0)
+ goto out_free_dev;
+
+ return ret;
+
+out_free_dev:
+ kvm->coalesced_mmio_dev = NULL;
+ kfree(dev);
+out_free_page:
+ kvm->coalesced_mmio_ring = NULL;
+ __free_page(page);
+out_err:
+ return ret;
+}
+
+void kvm_coalesced_mmio_free(struct kvm *kvm)
+{
+ if (kvm->coalesced_mmio_ring)
+ free_page((unsigned long)kvm->coalesced_mmio_ring);
+}
+
+int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
+ struct kvm_coalesced_mmio_zone *zone)
+{
+ struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
+
+ if (dev == NULL)
+ return -ENXIO;
+
+ mutex_lock(&kvm->slots_lock);
+ if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) {
+ mutex_unlock(&kvm->slots_lock);
+ return -ENOBUFS;
+ }
+
+ dev->zone[dev->nb_zones] = *zone;
+ dev->nb_zones++;
+
+ mutex_unlock(&kvm->slots_lock);
+ return 0;
+}
+
+int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
+ struct kvm_coalesced_mmio_zone *zone)
+{
+ int i;
+ struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
+ struct kvm_coalesced_mmio_zone *z;
+
+ if (dev == NULL)
+ return -ENXIO;
+
+ mutex_lock(&kvm->slots_lock);
+
+ i = dev->nb_zones;
+ while (i) {
+ z = &dev->zone[i - 1];
+
+ /* unregister all zones
+ * included in (zone->addr, zone->size)
+ */
+
+ if (zone->addr <= z->addr &&
+ z->addr + z->size <= zone->addr + zone->size) {
+ dev->nb_zones--;
+ *z = dev->zone[dev->nb_zones];
+ }
+ i--;
+ }
+
+ mutex_unlock(&kvm->slots_lock);
+
+ return 0;
+}
diff --git a/linux/x86/coalesced_mmio.h b/linux/x86/coalesced_mmio.h
new file mode 100644
index 0000000..8a5959e
--- /dev/null
+++ b/linux/x86/coalesced_mmio.h
@@ -0,0 +1,39 @@
+#ifndef __KVM_COALESCED_MMIO_H__
+#define __KVM_COALESCED_MMIO_H__
+
+/*
+ * KVM coalesced MMIO
+ *
+ * Copyright (c) 2008 Bull S.A.S.
+ *
+ * Author: Laurent Vivier <Laurent.Vivier@bull.net>
+ *
+ */
+
+#ifdef CONFIG_KVM_MMIO
+
+#define KVM_COALESCED_MMIO_ZONE_MAX 100
+
+struct kvm_coalesced_mmio_dev {
+ struct kvm_io_device dev;
+ struct kvm *kvm;
+ spinlock_t lock;
+ int nb_zones;
+ struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX];
+};
+
+int kvm_coalesced_mmio_init(struct kvm *kvm);
+void kvm_coalesced_mmio_free(struct kvm *kvm);
+int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
+ struct kvm_coalesced_mmio_zone *zone);
+int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
+ struct kvm_coalesced_mmio_zone *zone);
+
+#else
+
+static inline int kvm_coalesced_mmio_init(struct kvm *kvm) { return 0; }
+static inline void kvm_coalesced_mmio_free(struct kvm *kvm) { }
+
+#endif
+
+#endif
diff --git a/linux/x86/emulate.c b/linux/x86/emulate.c
new file mode 100644
index 0000000..cffdf3c
--- /dev/null
+++ b/linux/x86/emulate.c
@@ -0,0 +1,3771 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+/******************************************************************************
+ * emulate.c
+ *
+ * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
+ *
+ * Copyright (c) 2005 Keir Fraser
+ *
+ * Linux coding style, mod r/m decoder, segment base fixes, real-mode
+ * privileged instructions:
+ *
+ * Copyright (C) 2006 Qumranet
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
+ */
+
+#include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
+#include <linux/module.h>
+#include <asm/kvm_emulate.h>
+
+#include "x86.h"
+#include "tss.h"
+
+/*
+ * Opcode effective-address decode tables.
+ * Note that we only emulate instructions that have at least one memory
+ * operand (excluding implicit stack references). We assume that stack
+ * references and instruction fetches will never occur in special memory
+ * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
+ * not be handled.
+ */
+
+/* Operand sizes: 8-bit operands or specified/overridden size. */
+#define ByteOp (1<<0) /* 8-bit operands. */
+/* Destination operand type. */
+#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
+#define DstReg (2<<1) /* Register operand. */
+#define DstMem (3<<1) /* Memory operand. */
+#define DstAcc (4<<1) /* Destination Accumulator */
+#define DstDI (5<<1) /* Destination is in ES:(E)DI */
+#define DstMem64 (6<<1) /* 64bit memory operand */
+#define DstImmUByte (7<<1) /* 8-bit unsigned immediate operand */
+#define DstMask (7<<1)
+/* Source operand type. */
+#define SrcNone (0<<4) /* No source operand. */
+#define SrcReg (1<<4) /* Register operand. */
+#define SrcMem (2<<4) /* Memory operand. */
+#define SrcMem16 (3<<4) /* Memory operand (16-bit). */
+#define SrcMem32 (4<<4) /* Memory operand (32-bit). */
+#define SrcImm (5<<4) /* Immediate operand. */
+#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
+#define SrcOne (7<<4) /* Implied '1' */
+#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
+#define SrcImmU (9<<4) /* Immediate operand, unsigned */
+#define SrcSI (0xa<<4) /* Source is in the DS:RSI */
+#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */
+#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */
+#define SrcAcc (0xd<<4) /* Source Accumulator */
+#define SrcImmU16 (0xe<<4) /* Immediate operand, unsigned, 16 bits */
+#define SrcMask (0xf<<4)
+/* Generic ModRM decode. */
+#define ModRM (1<<8)
+/* Destination is only written; never read. */
+#define Mov (1<<9)
+#define BitOp (1<<10)
+#define MemAbs (1<<11) /* Memory operand is absolute displacement */
+#define String (1<<12) /* String instruction (rep capable) */
+#define Stack (1<<13) /* Stack instruction (push/pop) */
+#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
+#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
+/* Misc flags */
+#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
+#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
+#define Undefined (1<<25) /* No Such Instruction */
+#define Lock (1<<26) /* lock prefix is allowed for the instruction */
+#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
+#define No64 (1<<28)
+/* Source 2 operand type */
+#define Src2None (0<<29)
+#define Src2CL (1<<29)
+#define Src2ImmByte (2<<29)
+#define Src2One (3<<29)
+#define Src2Imm (4<<29)
+#define Src2Mask (7<<29)
+
+#define X2(x...) x, x
+#define X3(x...) X2(x), x
+#define X4(x...) X2(x), X2(x)
+#define X5(x...) X4(x), x
+#define X6(x...) X4(x), X2(x)
+#define X7(x...) X4(x), X3(x)
+#define X8(x...) X4(x), X4(x)
+#define X16(x...) X8(x), X8(x)
+
+struct opcode {
+ u32 flags;
+ union {
+ int (*execute)(struct x86_emulate_ctxt *ctxt);
+ struct opcode *group;
+ struct group_dual *gdual;
+ } u;
+};
+
+struct group_dual {
+ struct opcode mod012[8];
+ struct opcode mod3[8];
+};
+
+/* EFLAGS bit definitions. */
+#define EFLG_ID (1<<21)
+#define EFLG_VIP (1<<20)
+#define EFLG_VIF (1<<19)
+#define EFLG_AC (1<<18)
+#define EFLG_VM (1<<17)
+#define EFLG_RF (1<<16)
+#define EFLG_IOPL (3<<12)
+#define EFLG_NT (1<<14)
+#define EFLG_OF (1<<11)
+#define EFLG_DF (1<<10)
+#define EFLG_IF (1<<9)
+#define EFLG_TF (1<<8)
+#define EFLG_SF (1<<7)
+#define EFLG_ZF (1<<6)
+#define EFLG_AF (1<<4)
+#define EFLG_PF (1<<2)
+#define EFLG_CF (1<<0)
+
+#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
+#define EFLG_RESERVED_ONE_MASK 2
+
+/*
+ * Instruction emulation:
+ * Most instructions are emulated directly via a fragment of inline assembly
+ * code. This allows us to save/restore EFLAGS and thus very easily pick up
+ * any modified flags.
+ */
+
+#if defined(CONFIG_X86_64)
+#define _LO32 "k" /* force 32-bit operand */
+#define _STK "%%rsp" /* stack pointer */
+#elif defined(__i386__)
+#define _LO32 "" /* force 32-bit operand */
+#define _STK "%%esp" /* stack pointer */
+#endif
+
+/*
+ * These EFLAGS bits are restored from saved value during emulation, and
+ * any changes are written back to the saved value after emulation.
+ */
+#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
+
+/* Before executing instruction: restore necessary bits in EFLAGS. */
+#define _PRE_EFLAGS(_sav, _msk, _tmp) \
+ /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
+ "movl %"_sav",%"_LO32 _tmp"; " \
+ "push %"_tmp"; " \
+ "push %"_tmp"; " \
+ "movl %"_msk",%"_LO32 _tmp"; " \
+ "andl %"_LO32 _tmp",("_STK"); " \
+ "pushf; " \
+ "notl %"_LO32 _tmp"; " \
+ "andl %"_LO32 _tmp",("_STK"); " \
+ "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \
+ "pop %"_tmp"; " \
+ "orl %"_LO32 _tmp",("_STK"); " \
+ "popf; " \
+ "pop %"_sav"; "
+
+/* After executing instruction: write-back necessary bits in EFLAGS. */
+#define _POST_EFLAGS(_sav, _msk, _tmp) \
+ /* _sav |= EFLAGS & _msk; */ \
+ "pushf; " \
+ "pop %"_tmp"; " \
+ "andl %"_msk",%"_LO32 _tmp"; " \
+ "orl %"_LO32 _tmp",%"_sav"; "
+
+#ifdef CONFIG_X86_64
+#define ON64(x) x
+#else
+#define ON64(x)
+#endif
+
+#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \
+ do { \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "4", "2") \
+ _op _suffix " %"_x"3,%1; " \
+ _POST_EFLAGS("0", "4", "2") \
+ : "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\
+ "=&r" (_tmp) \
+ : _y ((_src).val), "i" (EFLAGS_MASK)); \
+ } while (0)
+
+
+/* Raw emulation: instruction has two explicit operands. */
+#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
+ do { \
+ unsigned long _tmp; \
+ \
+ switch ((_dst).bytes) { \
+ case 2: \
+ ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\
+ break; \
+ case 4: \
+ ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\
+ break; \
+ case 8: \
+ ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \
+ break; \
+ } \
+ } while (0)
+
+#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
+ do { \
+ unsigned long _tmp; \
+ switch ((_dst).bytes) { \
+ case 1: \
+ ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \
+ break; \
+ default: \
+ __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
+ _wx, _wy, _lx, _ly, _qx, _qy); \
+ break; \
+ } \
+ } while (0)
+
+/* Source operand is byte-sized and may be restricted to just %cl. */
+#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \
+ __emulate_2op(_op, _src, _dst, _eflags, \
+ "b", "c", "b", "c", "b", "c", "b", "c")
+
+/* Source operand is byte, word, long or quad sized. */
+#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \
+ __emulate_2op(_op, _src, _dst, _eflags, \
+ "b", "q", "w", "r", _LO32, "r", "", "r")
+
+/* Source operand is word, long or quad sized. */
+#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \
+ __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
+ "w", "r", _LO32, "r", "", "r")
+
+/* Instruction has three operands and one operand is stored in ECX register */
+#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \
+ do { \
+ unsigned long _tmp; \
+ _type _clv = (_cl).val; \
+ _type _srcv = (_src).val; \
+ _type _dstv = (_dst).val; \
+ \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "5", "2") \
+ _op _suffix " %4,%1 \n" \
+ _POST_EFLAGS("0", "5", "2") \
+ : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \
+ : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
+ ); \
+ \
+ (_cl).val = (unsigned long) _clv; \
+ (_src).val = (unsigned long) _srcv; \
+ (_dst).val = (unsigned long) _dstv; \
+ } while (0)
+
+#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \
+ do { \
+ switch ((_dst).bytes) { \
+ case 2: \
+ __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
+ "w", unsigned short); \
+ break; \
+ case 4: \
+ __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
+ "l", unsigned int); \
+ break; \
+ case 8: \
+ ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
+ "q", unsigned long)); \
+ break; \
+ } \
+ } while (0)
+
+#define __emulate_1op(_op, _dst, _eflags, _suffix) \
+ do { \
+ unsigned long _tmp; \
+ \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "3", "2") \
+ _op _suffix " %1; " \
+ _POST_EFLAGS("0", "3", "2") \
+ : "=m" (_eflags), "+m" ((_dst).val), \
+ "=&r" (_tmp) \
+ : "i" (EFLAGS_MASK)); \
+ } while (0)
+
+/* Instruction has only one explicit operand (no source operand). */
+#define emulate_1op(_op, _dst, _eflags) \
+ do { \
+ switch ((_dst).bytes) { \
+ case 1: __emulate_1op(_op, _dst, _eflags, "b"); break; \
+ case 2: __emulate_1op(_op, _dst, _eflags, "w"); break; \
+ case 4: __emulate_1op(_op, _dst, _eflags, "l"); break; \
+ case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \
+ } \
+ } while (0)
+
+#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix) \
+ do { \
+ unsigned long _tmp; \
+ \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "4", "1") \
+ _op _suffix " %5; " \
+ _POST_EFLAGS("0", "4", "1") \
+ : "=m" (_eflags), "=&r" (_tmp), \
+ "+a" (_rax), "+d" (_rdx) \
+ : "i" (EFLAGS_MASK), "m" ((_src).val), \
+ "a" (_rax), "d" (_rdx)); \
+ } while (0)
+
+#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \
+ do { \
+ unsigned long _tmp; \
+ \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "5", "1") \
+ "1: \n\t" \
+ _op _suffix " %6; " \
+ "2: \n\t" \
+ _POST_EFLAGS("0", "5", "1") \
+ ".pushsection .fixup,\"ax\" \n\t" \
+ "3: movb $1, %4 \n\t" \
+ "jmp 2b \n\t" \
+ ".popsection \n\t" \
+ _ASM_EXTABLE(1b, 3b) \
+ : "=m" (_eflags), "=&r" (_tmp), \
+ "+a" (_rax), "+d" (_rdx), "+qm"(_ex) \
+ : "i" (EFLAGS_MASK), "m" ((_src).val), \
+ "a" (_rax), "d" (_rdx)); \
+ } while (0)
+
+/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
+#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \
+ do { \
+ switch((_src).bytes) { \
+ case 1: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "b"); break; \
+ case 2: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "w"); break; \
+ case 4: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "l"); break; \
+ case 8: ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "q")); break; \
+ } \
+ } while (0)
+
+#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex) \
+ do { \
+ switch((_src).bytes) { \
+ case 1: \
+ __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
+ _eflags, "b", _ex); \
+ break; \
+ case 2: \
+ __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
+ _eflags, "w", _ex); \
+ break; \
+ case 4: \
+ __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
+ _eflags, "l", _ex); \
+ break; \
+ case 8: ON64( \
+ __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
+ _eflags, "q", _ex)); \
+ break; \
+ } \
+ } while (0)
+
+/* Fetch next part of the instruction being emulated. */
+#define insn_fetch(_type, _size, _eip) \
+({ unsigned long _x; \
+ rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
+ if (rc != X86EMUL_CONTINUE) \
+ goto done; \
+ (_eip) += (_size); \
+ (_type)_x; \
+})
+
+#define insn_fetch_arr(_arr, _size, _eip) \
+({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \
+ if (rc != X86EMUL_CONTINUE) \
+ goto done; \
+ (_eip) += (_size); \
+})
+
+static inline unsigned long ad_mask(struct decode_cache *c)
+{
+ return (1UL << (c->ad_bytes << 3)) - 1;
+}
+
+/* Access/update address held in a register, based on addressing mode. */
+static inline unsigned long
+address_mask(struct decode_cache *c, unsigned long reg)
+{
+ if (c->ad_bytes == sizeof(unsigned long))
+ return reg;
+ else
+ return reg & ad_mask(c);
+}
+
+static inline unsigned long
+register_address(struct decode_cache *c, unsigned long reg)
+{
+ return address_mask(c, reg);
+}
+
+static inline void
+register_address_increment(struct decode_cache *c, unsigned long *reg, int inc)
+{
+ if (c->ad_bytes == sizeof(unsigned long))
+ *reg += inc;
+ else
+ *reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c));
+}
+
+static inline void jmp_rel(struct decode_cache *c, int rel)
+{
+ register_address_increment(c, &c->eip, rel);
+}
+
+static void set_seg_override(struct decode_cache *c, int seg)
+{
+ c->has_seg_override = true;
+ c->seg_override = seg;
+}
+
+static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, int seg)
+{
+ if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
+ return 0;
+
+ return ops->get_cached_segment_base(seg, ctxt->vcpu);
+}
+
+static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ struct decode_cache *c)
+{
+ if (!c->has_seg_override)
+ return 0;
+
+ return c->seg_override;
+}
+
+static ulong linear(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr)
+{
+ struct decode_cache *c = &ctxt->decode;
+ ulong la;
+
+ la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
+ if (c->ad_bytes != 8)
+ la &= (u32)-1;
+ return la;
+}
+
+static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
+ u32 error, bool valid)
+{
+ ctxt->exception.vector = vec;
+ ctxt->exception.error_code = error;
+ ctxt->exception.error_code_valid = valid;
+ return X86EMUL_PROPAGATE_FAULT;
+}
+
+static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
+{
+ return emulate_exception(ctxt, GP_VECTOR, err, true);
+}
+
+static int emulate_ud(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, UD_VECTOR, 0, false);
+}
+
+static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
+{
+ return emulate_exception(ctxt, TS_VECTOR, err, true);
+}
+
+static int emulate_de(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, DE_VECTOR, 0, false);
+}
+
+static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ unsigned long eip, u8 *dest)
+{
+ struct fetch_cache *fc = &ctxt->decode.fetch;
+ int rc;
+ int size, cur_size;
+
+ if (eip == fc->end) {
+ cur_size = fc->end - fc->start;
+ size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
+ rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
+ size, ctxt->vcpu, &ctxt->exception);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ fc->end += size;
+ }
+ *dest = fc->data[eip - fc->start];
+ return X86EMUL_CONTINUE;
+}
+
+static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ unsigned long eip, void *dest, unsigned size)
+{
+ int rc;
+
+ /* x86 instructions are limited to 15 bytes. */
+ if (eip + size - ctxt->eip > 15)
+ return X86EMUL_UNHANDLEABLE;
+ while (size--) {
+ rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+/*
+ * Given the 'reg' portion of a ModRM byte, and a register block, return a
+ * pointer into the block that addresses the relevant register.
+ * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
+ */
+static void *decode_register(u8 modrm_reg, unsigned long *regs,
+ int highbyte_regs)
+{
+ void *p;
+
+ p = &regs[modrm_reg];
+ if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
+ p = (unsigned char *)&regs[modrm_reg & 3] + 1;
+ return p;
+}
+
+static int read_descriptor(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ struct segmented_address addr,
+ u16 *size, unsigned long *address, int op_bytes)
+{
+ int rc;
+
+ if (op_bytes == 2)
+ op_bytes = 3;
+ *address = 0;
+ rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2,
+ ctxt->vcpu, &ctxt->exception);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ addr.ea += 2;
+ rc = ops->read_std(linear(ctxt, addr), address, op_bytes,
+ ctxt->vcpu, &ctxt->exception);
+ return rc;
+}
+
+static int test_cc(unsigned int condition, unsigned int flags)
+{
+ int rc = 0;
+
+ switch ((condition & 15) >> 1) {
+ case 0: /* o */
+ rc |= (flags & EFLG_OF);
+ break;
+ case 1: /* b/c/nae */
+ rc |= (flags & EFLG_CF);
+ break;
+ case 2: /* z/e */
+ rc |= (flags & EFLG_ZF);
+ break;
+ case 3: /* be/na */
+ rc |= (flags & (EFLG_CF|EFLG_ZF));
+ break;
+ case 4: /* s */
+ rc |= (flags & EFLG_SF);
+ break;
+ case 5: /* p/pe */
+ rc |= (flags & EFLG_PF);
+ break;
+ case 7: /* le/ng */
+ rc |= (flags & EFLG_ZF);
+ /* fall through */
+ case 6: /* l/nge */
+ rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
+ break;
+ }
+
+ /* Odd condition identifiers (lsb == 1) have inverted sense. */
+ return (!!rc ^ (condition & 1));
+}
+
+static void fetch_register_operand(struct operand *op)
+{
+ switch (op->bytes) {
+ case 1:
+ op->val = *(u8 *)op->addr.reg;
+ break;
+ case 2:
+ op->val = *(u16 *)op->addr.reg;
+ break;
+ case 4:
+ op->val = *(u32 *)op->addr.reg;
+ break;
+ case 8:
+ op->val = *(u64 *)op->addr.reg;
+ break;
+ }
+}
+
+static void decode_register_operand(struct operand *op,
+ struct decode_cache *c,
+ int inhibit_bytereg)
+{
+ unsigned reg = c->modrm_reg;
+ int highbyte_regs = c->rex_prefix == 0;
+
+ if (!(c->d & ModRM))
+ reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
+ op->type = OP_REG;
+ if ((c->d & ByteOp) && !inhibit_bytereg) {
+ op->addr.reg = decode_register(reg, c->regs, highbyte_regs);
+ op->bytes = 1;
+ } else {
+ op->addr.reg = decode_register(reg, c->regs, 0);
+ op->bytes = c->op_bytes;
+ }
+ fetch_register_operand(op);
+ op->orig_val = op->val;
+}
+
+static int decode_modrm(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ struct operand *op)
+{
+ struct decode_cache *c = &ctxt->decode;
+ u8 sib;
+ int index_reg = 0, base_reg = 0, scale;
+ int rc = X86EMUL_CONTINUE;
+ ulong modrm_ea = 0;
+
+ if (c->rex_prefix) {
+ c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
+ index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
+ c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */
+ }
+
+ c->modrm = insn_fetch(u8, 1, c->eip);
+ c->modrm_mod |= (c->modrm & 0xc0) >> 6;
+ c->modrm_reg |= (c->modrm & 0x38) >> 3;
+ c->modrm_rm |= (c->modrm & 0x07);
+ c->modrm_seg = VCPU_SREG_DS;
+
+ if (c->modrm_mod == 3) {
+ op->type = OP_REG;
+ op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ op->addr.reg = decode_register(c->modrm_rm,
+ c->regs, c->d & ByteOp);
+ fetch_register_operand(op);
+ return rc;
+ }
+
+ op->type = OP_MEM;
+
+ if (c->ad_bytes == 2) {
+ unsigned bx = c->regs[VCPU_REGS_RBX];
+ unsigned bp = c->regs[VCPU_REGS_RBP];
+ unsigned si = c->regs[VCPU_REGS_RSI];
+ unsigned di = c->regs[VCPU_REGS_RDI];
+
+ /* 16-bit ModR/M decode. */
+ switch (c->modrm_mod) {
+ case 0:
+ if (c->modrm_rm == 6)
+ modrm_ea += insn_fetch(u16, 2, c->eip);
+ break;
+ case 1:
+ modrm_ea += insn_fetch(s8, 1, c->eip);
+ break;
+ case 2:
+ modrm_ea += insn_fetch(u16, 2, c->eip);
+ break;
+ }
+ switch (c->modrm_rm) {
+ case 0:
+ modrm_ea += bx + si;
+ break;
+ case 1:
+ modrm_ea += bx + di;
+ break;
+ case 2:
+ modrm_ea += bp + si;
+ break;
+ case 3:
+ modrm_ea += bp + di;
+ break;
+ case 4:
+ modrm_ea += si;
+ break;
+ case 5:
+ modrm_ea += di;
+ break;
+ case 6:
+ if (c->modrm_mod != 0)
+ modrm_ea += bp;
+ break;
+ case 7:
+ modrm_ea += bx;
+ break;
+ }
+ if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
+ (c->modrm_rm == 6 && c->modrm_mod != 0))
+ c->modrm_seg = VCPU_SREG_SS;
+ modrm_ea = (u16)modrm_ea;
+ } else {
+ /* 32/64-bit ModR/M decode. */
+ if ((c->modrm_rm & 7) == 4) {
+ sib = insn_fetch(u8, 1, c->eip);
+ index_reg |= (sib >> 3) & 7;
+ base_reg |= sib & 7;
+ scale = sib >> 6;
+
+ if ((base_reg & 7) == 5 && c->modrm_mod == 0)
+ modrm_ea += insn_fetch(s32, 4, c->eip);
+ else
+ modrm_ea += c->regs[base_reg];
+ if (index_reg != 4)
+ modrm_ea += c->regs[index_reg] << scale;
+ } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ c->rip_relative = 1;
+ } else
+ modrm_ea += c->regs[c->modrm_rm];
+ switch (c->modrm_mod) {
+ case 0:
+ if (c->modrm_rm == 5)
+ modrm_ea += insn_fetch(s32, 4, c->eip);
+ break;
+ case 1:
+ modrm_ea += insn_fetch(s8, 1, c->eip);
+ break;
+ case 2:
+ modrm_ea += insn_fetch(s32, 4, c->eip);
+ break;
+ }
+ }
+ op->addr.mem.ea = modrm_ea;
+done:
+ return rc;
+}
+
+static int decode_abs(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ struct operand *op)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc = X86EMUL_CONTINUE;
+
+ op->type = OP_MEM;
+ switch (c->ad_bytes) {
+ case 2:
+ op->addr.mem.ea = insn_fetch(u16, 2, c->eip);
+ break;
+ case 4:
+ op->addr.mem.ea = insn_fetch(u32, 4, c->eip);
+ break;
+ case 8:
+ op->addr.mem.ea = insn_fetch(u64, 8, c->eip);
+ break;
+ }
+done:
+ return rc;
+}
+
+static void fetch_bit_operand(struct decode_cache *c)
+{
+ long sv = 0, mask;
+
+ if (c->dst.type == OP_MEM && c->src.type == OP_REG) {
+ mask = ~(c->dst.bytes * 8 - 1);
+
+ if (c->src.bytes == 2)
+ sv = (s16)c->src.val & (s16)mask;
+ else if (c->src.bytes == 4)
+ sv = (s32)c->src.val & (s32)mask;
+
+ c->dst.addr.mem.ea += (sv >> 3);
+ }
+
+ /* only subword offset */
+ c->src.val &= (c->dst.bytes << 3) - 1;
+}
+
+static int read_emulated(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ unsigned long addr, void *dest, unsigned size)
+{
+ int rc;
+ struct read_cache *mc = &ctxt->decode.mem_read;
+
+ while (size) {
+ int n = min(size, 8u);
+ size -= n;
+ if (mc->pos < mc->end)
+ goto read_cached;
+
+ rc = ops->read_emulated(addr, mc->data + mc->end, n,
+ &ctxt->exception, ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ mc->end += n;
+
+ read_cached:
+ memcpy(dest, mc->data + mc->pos, n);
+ mc->pos += n;
+ dest += n;
+ addr += n;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ unsigned int size, unsigned short port,
+ void *dest)
+{
+ struct read_cache *rc = &ctxt->decode.io_read;
+
+ if (rc->pos == rc->end) { /* refill pio read ahead */
+ struct decode_cache *c = &ctxt->decode;
+ unsigned int in_page, n;
+ unsigned int count = c->rep_prefix ?
+ address_mask(c, c->regs[VCPU_REGS_RCX]) : 1;
+ in_page = (ctxt->eflags & EFLG_DF) ?
+ offset_in_page(c->regs[VCPU_REGS_RDI]) :
+ PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]);
+ n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
+ count);
+ if (n == 0)
+ n = 1;
+ rc->pos = rc->end = 0;
+ if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu))
+ return 0;
+ rc->end = n * size;
+ }
+
+ memcpy(dest, rc->data + rc->pos, size);
+ rc->pos += size;
+ return 1;
+}
+
+static u32 desc_limit_scaled(struct kvm_desc_struct *desc)
+{
+ u32 limit = kvm_get_desc_limit(desc);
+
+ return desc->g ? (limit << 12) | 0xfff : limit;
+}
+
+static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 selector, struct kvm_desc_ptr *dt)
+{
+ if (selector & 1 << 2) {
+ struct kvm_desc_struct desc;
+ memset (dt, 0, sizeof *dt);
+ if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu))
+ return;
+
+ dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
+ dt->address = kvm_get_desc_base(&desc);
+ } else
+ ops->get_gdt(dt, ctxt->vcpu);
+}
+
+/* allowed just for 8 bytes segments */
+static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 selector, struct kvm_desc_struct *desc)
+{
+ struct kvm_desc_ptr dt;
+ u16 index = selector >> 3;
+ int ret;
+ ulong addr;
+
+ get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+
+ if (dt.size < index * 8 + 7)
+ return emulate_gp(ctxt, selector & 0xfffc);
+ addr = dt.address + index * 8;
+ ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,
+ &ctxt->exception);
+
+ return ret;
+}
+
+/* allowed just for 8 bytes segments */
+static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 selector, struct kvm_desc_struct *desc)
+{
+ struct kvm_desc_ptr dt;
+ u16 index = selector >> 3;
+ ulong addr;
+ int ret;
+
+ get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+
+ if (dt.size < index * 8 + 7)
+ return emulate_gp(ctxt, selector & 0xfffc);
+
+ addr = dt.address + index * 8;
+ ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu,
+ &ctxt->exception);
+
+ return ret;
+}
+
+static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 selector, int seg)
+{
+ struct kvm_desc_struct seg_desc;
+ u8 dpl, rpl, cpl;
+ unsigned err_vec = GP_VECTOR;
+ u32 err_code = 0;
+ bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
+ int ret;
+
+ memset(&seg_desc, 0, sizeof seg_desc);
+
+ if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86)
+ || ctxt->mode == X86EMUL_MODE_REAL) {
+ /* set real mode segment descriptor */
+ kvm_set_desc_base(&seg_desc, selector << 4);
+ kvm_set_desc_limit(&seg_desc, 0xffff);
+ seg_desc.type = 3;
+ seg_desc.p = 1;
+ seg_desc.s = 1;
+ goto load;
+ }
+
+ /* NULL selector is not valid for TR, CS and SS */
+ if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
+ && null_selector)
+ goto exception;
+
+ /* TR should be in GDT only */
+ if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
+ goto exception;
+
+ if (null_selector) /* for NULL selector skip all following checks */
+ goto load;
+
+ ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ err_code = selector & 0xfffc;
+ err_vec = GP_VECTOR;
+
+ /* can't load system descriptor into segment selecor */
+ if (seg <= VCPU_SREG_GS && !seg_desc.s)
+ goto exception;
+
+ if (!seg_desc.p) {
+ err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+ goto exception;
+ }
+
+ rpl = selector & 3;
+ dpl = seg_desc.dpl;
+ cpl = ops->cpl(ctxt->vcpu);
+
+ switch (seg) {
+ case VCPU_SREG_SS:
+ /*
+ * segment is not a writable data segment or segment
+ * selector's RPL != CPL or segment selector's RPL != CPL
+ */
+ if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl)
+ goto exception;
+ break;
+ case VCPU_SREG_CS:
+ if (!(seg_desc.type & 8))
+ goto exception;
+
+ if (seg_desc.type & 4) {
+ /* conforming */
+ if (dpl > cpl)
+ goto exception;
+ } else {
+ /* nonconforming */
+ if (rpl > cpl || dpl != cpl)
+ goto exception;
+ }
+ /* CS(RPL) <- CPL */
+ selector = (selector & 0xfffc) | cpl;
+ break;
+ case VCPU_SREG_TR:
+ if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
+ goto exception;
+ break;
+ case VCPU_SREG_LDTR:
+ if (seg_desc.s || seg_desc.type != 2)
+ goto exception;
+ break;
+ default: /* DS, ES, FS, or GS */
+ /*
+ * segment is not a data or readable code segment or
+ * ((segment is a data or nonconforming code segment)
+ * and (both RPL and CPL > DPL))
+ */
+ if ((seg_desc.type & 0xa) == 0x8 ||
+ (((seg_desc.type & 0xc) != 0xc) &&
+ (rpl > dpl && cpl > dpl)))
+ goto exception;
+ break;
+ }
+
+ if (seg_desc.s) {
+ /* mark segment as accessed */
+ seg_desc.type |= 1;
+ ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ }
+load:
+ ops->set_segment_selector(selector, seg, ctxt->vcpu);
+ ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
+ return X86EMUL_CONTINUE;
+exception:
+ emulate_exception(ctxt, err_vec, err_code, true);
+ return X86EMUL_PROPAGATE_FAULT;
+}
+
+static void write_register_operand(struct operand *op)
+{
+ /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
+ switch (op->bytes) {
+ case 1:
+ *(u8 *)op->addr.reg = (u8)op->val;
+ break;
+ case 2:
+ *(u16 *)op->addr.reg = (u16)op->val;
+ break;
+ case 4:
+ *op->addr.reg = (u32)op->val;
+ break; /* 64b: zero-extend */
+ case 8:
+ *op->addr.reg = op->val;
+ break;
+ }
+}
+
+static inline int writeback(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ int rc;
+ struct decode_cache *c = &ctxt->decode;
+
+ switch (c->dst.type) {
+ case OP_REG:
+ write_register_operand(&c->dst);
+ break;
+ case OP_MEM:
+ if (c->lock_prefix)
+ rc = ops->cmpxchg_emulated(
+ linear(ctxt, c->dst.addr.mem),
+ &c->dst.orig_val,
+ &c->dst.val,
+ c->dst.bytes,
+ &ctxt->exception,
+ ctxt->vcpu);
+ else
+ rc = ops->write_emulated(
+ linear(ctxt, c->dst.addr.mem),
+ &c->dst.val,
+ c->dst.bytes,
+ &ctxt->exception,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ break;
+ case OP_NONE:
+ /* no writeback */
+ break;
+ default:
+ break;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ c->dst.type = OP_MEM;
+ c->dst.bytes = c->op_bytes;
+ c->dst.val = c->src.val;
+ register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
+ c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
+ c->dst.addr.mem.seg = VCPU_SREG_SS;
+}
+
+static int emulate_pop(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ void *dest, int len)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc;
+ struct segmented_address addr;
+
+ addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
+ addr.seg = VCPU_SREG_SS;
+ rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ register_address_increment(c, &c->regs[VCPU_REGS_RSP], len);
+ return rc;
+}
+
+static int emulate_popf(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ void *dest, int len)
+{
+ int rc;
+ unsigned long val, change_mask;
+ int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
+ int cpl = ops->cpl(ctxt->vcpu);
+
+ rc = emulate_pop(ctxt, ops, &val, len);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF
+ | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID;
+
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_PROT64:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT16:
+ if (cpl == 0)
+ change_mask |= EFLG_IOPL;
+ if (cpl <= iopl)
+ change_mask |= EFLG_IF;
+ break;
+ case X86EMUL_MODE_VM86:
+ if (iopl < 3)
+ return emulate_gp(ctxt, 0);
+ change_mask |= EFLG_IF;
+ break;
+ default: /* real mode */
+ change_mask |= (EFLG_IOPL | EFLG_IF);
+ break;
+ }
+
+ *(unsigned long *)dest =
+ (ctxt->eflags & ~change_mask) | (val & change_mask);
+
+ return rc;
+}
+
+static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, int seg)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ c->src.val = ops->get_segment_selector(seg, ctxt->vcpu);
+
+ emulate_push(ctxt, ops);
+}
+
+static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, int seg)
+{
+ struct decode_cache *c = &ctxt->decode;
+ unsigned long selector;
+ int rc;
+
+ rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg);
+ return rc;
+}
+
+static int emulate_pusha(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ unsigned long old_esp = c->regs[VCPU_REGS_RSP];
+ int rc = X86EMUL_CONTINUE;
+ int reg = VCPU_REGS_RAX;
+
+ while (reg <= VCPU_REGS_RDI) {
+ (reg == VCPU_REGS_RSP) ?
+ (c->src.val = old_esp) : (c->src.val = c->regs[reg]);
+
+ emulate_push(ctxt, ops);
+
+ rc = writeback(ctxt, ops);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ++reg;
+ }
+
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
+
+ return rc;
+}
+
+static int emulate_popa(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc = X86EMUL_CONTINUE;
+ int reg = VCPU_REGS_RDI;
+
+ while (reg >= VCPU_REGS_RAX) {
+ if (reg == VCPU_REGS_RSP) {
+ register_address_increment(c, &c->regs[VCPU_REGS_RSP],
+ c->op_bytes);
+ --reg;
+ }
+
+ rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ break;
+ --reg;
+ }
+ return rc;
+}
+
+int emulate_int_real(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, int irq)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc;
+ struct kvm_desc_ptr dt;
+ gva_t cs_addr;
+ gva_t eip_addr;
+ u16 cs, eip;
+
+ /* TODO: Add limit checks */
+ c->src.val = ctxt->eflags;
+ emulate_push(ctxt, ops);
+ rc = writeback(ctxt, ops);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
+
+ c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
+ emulate_push(ctxt, ops);
+ rc = writeback(ctxt, ops);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ c->src.val = c->eip;
+ emulate_push(ctxt, ops);
+ rc = writeback(ctxt, ops);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ c->dst.type = OP_NONE;
+
+ ops->get_idt(&dt, ctxt->vcpu);
+
+ eip_addr = dt.address + (irq << 2);
+ cs_addr = dt.address + (irq << 2) + 2;
+
+ rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ c->eip = eip;
+
+ return rc;
+}
+
+static int emulate_int(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, int irq)
+{
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_REAL:
+ return emulate_int_real(ctxt, ops, irq);
+ case X86EMUL_MODE_VM86:
+ case X86EMUL_MODE_PROT16:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT64:
+ default:
+ /* Protected mode interrupts unimplemented yet */
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc = X86EMUL_CONTINUE;
+ unsigned long temp_eip = 0;
+ unsigned long temp_eflags = 0;
+ unsigned long cs = 0;
+ unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF |
+ EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF |
+ EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */
+ unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP;
+
+ /* TODO: Add stack limit check */
+
+ rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ if (temp_eip & ~0xffff)
+ return emulate_gp(ctxt, 0);
+
+ rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ c->eip = temp_eip;
+
+
+ if (c->op_bytes == 4)
+ ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
+ else if (c->op_bytes == 2) {
+ ctxt->eflags &= ~0xffff;
+ ctxt->eflags |= temp_eflags;
+ }
+
+ ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
+ ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
+
+ return rc;
+}
+
+static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops* ops)
+{
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_REAL:
+ return emulate_iret_real(ctxt, ops);
+ case X86EMUL_MODE_VM86:
+ case X86EMUL_MODE_PROT16:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT64:
+ default:
+ /* iret from protected mode unimplemented yet */
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
+}
+
+static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ switch (c->modrm_reg) {
+ case 0: /* rol */
+ emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
+ break;
+ case 1: /* ror */
+ emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
+ break;
+ case 2: /* rcl */
+ emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
+ break;
+ case 3: /* rcr */
+ emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
+ break;
+ case 4: /* sal/shl */
+ case 6: /* sal/shl */
+ emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
+ break;
+ case 5: /* shr */
+ emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
+ break;
+ case 7: /* sar */
+ emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
+ break;
+ }
+}
+
+static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ unsigned long *rax = &c->regs[VCPU_REGS_RAX];
+ unsigned long *rdx = &c->regs[VCPU_REGS_RDX];
+ u8 de = 0;
+
+ switch (c->modrm_reg) {
+ case 0 ... 1: /* test */
+ emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
+ break;
+ case 2: /* not */
+ c->dst.val = ~c->dst.val;
+ break;
+ case 3: /* neg */
+ emulate_1op("neg", c->dst, ctxt->eflags);
+ break;
+ case 4: /* mul */
+ emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags);
+ break;
+ case 5: /* imul */
+ emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags);
+ break;
+ case 6: /* div */
+ emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx,
+ ctxt->eflags, de);
+ break;
+ case 7: /* idiv */
+ emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx,
+ ctxt->eflags, de);
+ break;
+ default:
+ return X86EMUL_UNHANDLEABLE;
+ }
+ if (de)
+ return emulate_de(ctxt);
+ return X86EMUL_CONTINUE;
+}
+
+static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ switch (c->modrm_reg) {
+ case 0: /* inc */
+ emulate_1op("inc", c->dst, ctxt->eflags);
+ break;
+ case 1: /* dec */
+ emulate_1op("dec", c->dst, ctxt->eflags);
+ break;
+ case 2: /* call near abs */ {
+ long int old_eip;
+ old_eip = c->eip;
+ c->eip = c->src.val;
+ c->src.val = old_eip;
+ emulate_push(ctxt, ops);
+ break;
+ }
+ case 4: /* jmp abs */
+ c->eip = c->src.val;
+ break;
+ case 6: /* push */
+ emulate_push(ctxt, ops);
+ break;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ u64 old = c->dst.orig_val64;
+
+ if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
+ ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
+ c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
+ c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
+ ctxt->eflags &= ~EFLG_ZF;
+ } else {
+ c->dst.val64 = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
+ (u32) c->regs[VCPU_REGS_RBX];
+
+ ctxt->eflags |= EFLG_ZF;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc;
+ unsigned long cs;
+
+ rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ if (c->op_bytes == 4)
+ c->eip = (u32)c->eip;
+ rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
+ return rc;
+}
+
+static int emulate_load_segment(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, int seg)
+{
+ struct decode_cache *c = &ctxt->decode;
+ unsigned short sel;
+ int rc;
+
+ memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+
+ rc = load_segment_descriptor(ctxt, ops, sel, seg);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ c->dst.val = c->src.val;
+ return rc;
+}
+
+static inline void
+setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, struct kvm_desc_struct *cs,
+ struct kvm_desc_struct *ss)
+{
+ memset(cs, 0, sizeof(struct kvm_desc_struct));
+ ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu);
+ memset(ss, 0, sizeof(struct kvm_desc_struct));
+
+ cs->l = 0; /* will be adjusted later */
+ kvm_set_desc_base(cs, 0); /* flat segment */
+ cs->g = 1; /* 4kb granularity */
+ kvm_set_desc_limit(cs, 0xfffff); /* 4GB limit */
+ cs->type = 0x0b; /* Read, Execute, Accessed */
+ cs->s = 1;
+ cs->dpl = 0; /* will be adjusted later */
+ cs->p = 1;
+ cs->d = 1;
+
+ kvm_set_desc_base(ss, 0); /* flat segment */
+ kvm_set_desc_limit(ss, 0xfffff); /* 4GB limit */
+ ss->g = 1; /* 4kb granularity */
+ ss->s = 1;
+ ss->type = 0x03; /* Read/Write, Accessed */
+ ss->d = 1; /* 32bit stack segment */
+ ss->dpl = 0;
+ ss->p = 1;
+}
+
+static int
+emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ struct kvm_desc_struct cs, ss;
+ u64 msr_data;
+ u16 cs_sel, ss_sel;
+
+ /* syscall is not available in real mode */
+ if (ctxt->mode == X86EMUL_MODE_REAL ||
+ ctxt->mode == X86EMUL_MODE_VM86)
+ return emulate_ud(ctxt);
+
+ setup_syscalls_segments(ctxt, ops, &cs, &ss);
+
+ ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+ msr_data >>= 32;
+ cs_sel = (u16)(msr_data & 0xfffc);
+ ss_sel = (u16)(msr_data + 8);
+
+ if (is_long_mode(ctxt->vcpu)) {
+ cs.d = 0;
+ cs.l = 1;
+ }
+ ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
+
+ c->regs[VCPU_REGS_RCX] = c->eip;
+ if (is_long_mode(ctxt->vcpu)) {
+#ifdef CONFIG_X86_64
+ c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
+
+ ops->get_msr(ctxt->vcpu,
+ ctxt->mode == X86EMUL_MODE_PROT64 ?
+ MSR_LSTAR : MSR_CSTAR, &msr_data);
+ c->eip = msr_data;
+
+ ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
+ ctxt->eflags &= ~(msr_data | EFLG_RF);
+#endif
+ } else {
+ /* legacy mode */
+ ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+ c->eip = (u32)msr_data;
+
+ ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static int
+emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ struct kvm_desc_struct cs, ss;
+ u64 msr_data;
+ u16 cs_sel, ss_sel;
+
+ /* inject #GP if in real mode */
+ if (ctxt->mode == X86EMUL_MODE_REAL)
+ return emulate_gp(ctxt, 0);
+
+ /* XXX sysenter/sysexit have not been tested in 64bit mode.
+ * Therefore, we inject an #UD.
+ */
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return emulate_ud(ctxt);
+
+ setup_syscalls_segments(ctxt, ops, &cs, &ss);
+
+ ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+ switch (ctxt->mode) {
+ case X86EMUL_MODE_PROT32:
+ if ((msr_data & 0xfffc) == 0x0)
+ return emulate_gp(ctxt, 0);
+ break;
+ case X86EMUL_MODE_PROT64:
+ if (msr_data == 0x0)
+ return emulate_gp(ctxt, 0);
+ break;
+ }
+
+ ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
+ cs_sel = (u16)msr_data;
+ cs_sel &= ~SELECTOR_RPL_MASK;
+ ss_sel = cs_sel + 8;
+ ss_sel &= ~SELECTOR_RPL_MASK;
+ if (ctxt->mode == X86EMUL_MODE_PROT64
+ || is_long_mode(ctxt->vcpu)) {
+ cs.d = 0;
+ cs.l = 1;
+ }
+
+ ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
+
+ ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
+ c->eip = msr_data;
+
+ ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
+ c->regs[VCPU_REGS_RSP] = msr_data;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int
+emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ struct kvm_desc_struct cs, ss;
+ u64 msr_data;
+ int usermode;
+ u16 cs_sel, ss_sel;
+
+ /* inject #GP if in real mode or Virtual 8086 mode */
+ if (ctxt->mode == X86EMUL_MODE_REAL ||
+ ctxt->mode == X86EMUL_MODE_VM86)
+ return emulate_gp(ctxt, 0);
+
+ setup_syscalls_segments(ctxt, ops, &cs, &ss);
+
+ if ((c->rex_prefix & 0x8) != 0x0)
+ usermode = X86EMUL_MODE_PROT64;
+ else
+ usermode = X86EMUL_MODE_PROT32;
+
+ cs.dpl = 3;
+ ss.dpl = 3;
+ ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+ switch (usermode) {
+ case X86EMUL_MODE_PROT32:
+ cs_sel = (u16)(msr_data + 16);
+ if ((msr_data & 0xfffc) == 0x0)
+ return emulate_gp(ctxt, 0);
+ ss_sel = (u16)(msr_data + 24);
+ break;
+ case X86EMUL_MODE_PROT64:
+ cs_sel = (u16)(msr_data + 32);
+ if (msr_data == 0x0)
+ return emulate_gp(ctxt, 0);
+ ss_sel = cs_sel + 8;
+ cs.d = 0;
+ cs.l = 1;
+ break;
+ }
+ cs_sel |= SELECTOR_RPL_MASK;
+ ss_sel |= SELECTOR_RPL_MASK;
+
+ ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
+
+ c->eip = c->regs[VCPU_REGS_RDX];
+ c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX];
+
+ return X86EMUL_CONTINUE;
+}
+
+static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ int iopl;
+ if (ctxt->mode == X86EMUL_MODE_REAL)
+ return false;
+ if (ctxt->mode == X86EMUL_MODE_VM86)
+ return true;
+ iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
+ return ops->cpl(ctxt->vcpu) > iopl;
+}
+
+static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 port, u16 len)
+{
+ struct kvm_desc_struct tr_seg;
+ int r;
+ u16 io_bitmap_ptr;
+ u8 perm, bit_idx = port & 0x7;
+ unsigned mask = (1 << len) - 1;
+
+ ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu);
+ if (!tr_seg.p)
+ return false;
+ if (desc_limit_scaled(&tr_seg) < 103)
+ return false;
+ r = ops->read_std(kvm_get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2,
+ ctxt->vcpu, NULL);
+ if (r != X86EMUL_CONTINUE)
+ return false;
+ if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
+ return false;
+ r = ops->read_std(kvm_get_desc_base(&tr_seg) + io_bitmap_ptr + port/8,
+ &perm, 1, ctxt->vcpu, NULL);
+ if (r != X86EMUL_CONTINUE)
+ return false;
+ if ((perm >> bit_idx) & mask)
+ return false;
+ return true;
+}
+
+static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 port, u16 len)
+{
+ if (ctxt->perm_ok)
+ return true;
+
+ if (emulator_bad_iopl(ctxt, ops))
+ if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
+ return false;
+
+ ctxt->perm_ok = true;
+
+ return true;
+}
+
+static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ struct tss_segment_16 *tss)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ tss->ip = c->eip;
+ tss->flag = ctxt->eflags;
+ tss->ax = c->regs[VCPU_REGS_RAX];
+ tss->cx = c->regs[VCPU_REGS_RCX];
+ tss->dx = c->regs[VCPU_REGS_RDX];
+ tss->bx = c->regs[VCPU_REGS_RBX];
+ tss->sp = c->regs[VCPU_REGS_RSP];
+ tss->bp = c->regs[VCPU_REGS_RBP];
+ tss->si = c->regs[VCPU_REGS_RSI];
+ tss->di = c->regs[VCPU_REGS_RDI];
+
+ tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
+ tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
+ tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
+ tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
+ tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
+}
+
+static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ struct tss_segment_16 *tss)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int ret;
+
+ c->eip = tss->ip;
+ ctxt->eflags = tss->flag | 2;
+ c->regs[VCPU_REGS_RAX] = tss->ax;
+ c->regs[VCPU_REGS_RCX] = tss->cx;
+ c->regs[VCPU_REGS_RDX] = tss->dx;
+ c->regs[VCPU_REGS_RBX] = tss->bx;
+ c->regs[VCPU_REGS_RSP] = tss->sp;
+ c->regs[VCPU_REGS_RBP] = tss->bp;
+ c->regs[VCPU_REGS_RSI] = tss->si;
+ c->regs[VCPU_REGS_RDI] = tss->di;
+
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors
+ */
+ ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu);
+ ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
+ ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
+
+ /*
+ * Now load segment descriptors. If fault happenes at this stage
+ * it is handled in a context of new task
+ */
+ ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int task_switch_16(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 tss_selector, u16 old_tss_sel,
+ ulong old_tss_base, struct kvm_desc_struct *new_desc)
+{
+ struct tss_segment_16 tss_seg;
+ int ret;
+ u32 new_tss_base = kvm_get_desc_base(new_desc);
+
+ ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
+ /* FIXME: need to provide precise fault address */
+ return ret;
+
+ save_state_to_tss16(ctxt, ops, &tss_seg);
+
+ ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
+ /* FIXME: need to provide precise fault address */
+ return ret;
+
+ ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
+ /* FIXME: need to provide precise fault address */
+ return ret;
+
+ if (old_tss_sel != 0xffff) {
+ tss_seg.prev_task_link = old_tss_sel;
+
+ ret = ops->write_std(new_tss_base,
+ &tss_seg.prev_task_link,
+ sizeof tss_seg.prev_task_link,
+ ctxt->vcpu, &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
+ /* FIXME: need to provide precise fault address */
+ return ret;
+ }
+
+ return load_state_from_tss16(ctxt, ops, &tss_seg);
+}
+
+static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ struct tss_segment_32 *tss)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ tss->cr3 = ops->get_cr(3, ctxt->vcpu);
+ tss->eip = c->eip;
+ tss->eflags = ctxt->eflags;
+ tss->eax = c->regs[VCPU_REGS_RAX];
+ tss->ecx = c->regs[VCPU_REGS_RCX];
+ tss->edx = c->regs[VCPU_REGS_RDX];
+ tss->ebx = c->regs[VCPU_REGS_RBX];
+ tss->esp = c->regs[VCPU_REGS_RSP];
+ tss->ebp = c->regs[VCPU_REGS_RBP];
+ tss->esi = c->regs[VCPU_REGS_RSI];
+ tss->edi = c->regs[VCPU_REGS_RDI];
+
+ tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
+ tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
+ tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
+ tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
+ tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu);
+ tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu);
+ tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
+}
+
+static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ struct tss_segment_32 *tss)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int ret;
+
+ if (ops->set_cr(3, tss->cr3, ctxt->vcpu))
+ return emulate_gp(ctxt, 0);
+ c->eip = tss->eip;
+ ctxt->eflags = tss->eflags | 2;
+ c->regs[VCPU_REGS_RAX] = tss->eax;
+ c->regs[VCPU_REGS_RCX] = tss->ecx;
+ c->regs[VCPU_REGS_RDX] = tss->edx;
+ c->regs[VCPU_REGS_RBX] = tss->ebx;
+ c->regs[VCPU_REGS_RSP] = tss->esp;
+ c->regs[VCPU_REGS_RBP] = tss->ebp;
+ c->regs[VCPU_REGS_RSI] = tss->esi;
+ c->regs[VCPU_REGS_RDI] = tss->edi;
+
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors
+ */
+ ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu);
+ ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
+ ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
+ ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu);
+ ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu);
+
+ /*
+ * Now load segment descriptors. If fault happenes at this stage
+ * it is handled in a context of new task
+ */
+ ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int task_switch_32(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 tss_selector, u16 old_tss_sel,
+ ulong old_tss_base, struct kvm_desc_struct *new_desc)
+{
+ struct tss_segment_32 tss_seg;
+ int ret;
+ u32 new_tss_base = kvm_get_desc_base(new_desc);
+
+ ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
+ /* FIXME: need to provide precise fault address */
+ return ret;
+
+ save_state_to_tss32(ctxt, ops, &tss_seg);
+
+ ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
+ /* FIXME: need to provide precise fault address */
+ return ret;
+
+ ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
+ /* FIXME: need to provide precise fault address */
+ return ret;
+
+ if (old_tss_sel != 0xffff) {
+ tss_seg.prev_task_link = old_tss_sel;
+
+ ret = ops->write_std(new_tss_base,
+ &tss_seg.prev_task_link,
+ sizeof tss_seg.prev_task_link,
+ ctxt->vcpu, &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
+ /* FIXME: need to provide precise fault address */
+ return ret;
+ }
+
+ return load_state_from_tss32(ctxt, ops, &tss_seg);
+}
+
+static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 tss_selector, int reason,
+ bool has_error_code, u32 error_code)
+{
+ struct kvm_desc_struct curr_tss_desc, next_tss_desc;
+ int ret;
+ u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
+ ulong old_tss_base =
+ ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu);
+ u32 desc_limit;
+
+ /* FIXME: old_tss_base == ~0 ? */
+
+ ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ /* FIXME: check that next_tss_desc is tss */
+
+ if (reason != TASK_SWITCH_IRET) {
+ if ((tss_selector & 3) > next_tss_desc.dpl ||
+ ops->cpl(ctxt->vcpu) > next_tss_desc.dpl)
+ return emulate_gp(ctxt, 0);
+ }
+
+ desc_limit = desc_limit_scaled(&next_tss_desc);
+ if (!next_tss_desc.p ||
+ ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
+ desc_limit < 0x2b)) {
+ emulate_ts(ctxt, tss_selector & 0xfffc);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
+ if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
+ curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
+ write_segment_descriptor(ctxt, ops, old_tss_sel,
+ &curr_tss_desc);
+ }
+
+ if (reason == TASK_SWITCH_IRET)
+ ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
+
+ /* set back link to prev task only if NT bit is set in eflags
+ note that old_tss_sel is not used afetr this point */
+ if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+ old_tss_sel = 0xffff;
+
+ if (next_tss_desc.type & 8)
+ ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel,
+ old_tss_base, &next_tss_desc);
+ else
+ ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel,
+ old_tss_base, &next_tss_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE)
+ ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT;
+
+ if (reason != TASK_SWITCH_IRET) {
+ next_tss_desc.type |= (1 << 1); /* set busy flag */
+ write_segment_descriptor(ctxt, ops, tss_selector,
+ &next_tss_desc);
+ }
+
+ ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu);
+ ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu);
+ ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
+
+ if (has_error_code) {
+ struct decode_cache *c = &ctxt->decode;
+
+ c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
+ c->lock_prefix = 0;
+ c->src.val = (unsigned long) error_code;
+ emulate_push(ctxt, ops);
+ }
+
+ return ret;
+}
+
+int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
+ u16 tss_selector, int reason,
+ bool has_error_code, u32 error_code)
+{
+ struct x86_emulate_ops *ops = ctxt->ops;
+ struct decode_cache *c = &ctxt->decode;
+ int rc;
+
+ c->eip = ctxt->eip;
+ c->dst.type = OP_NONE;
+
+ rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
+ has_error_code, error_code);
+
+ if (rc == X86EMUL_CONTINUE) {
+ rc = writeback(ctxt, ops);
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->eip = c->eip;
+ }
+
+ return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+}
+
+static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
+ int reg, struct operand *op)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
+
+ register_address_increment(c, &c->regs[reg], df * op->bytes);
+ op->addr.mem.ea = register_address(c, c->regs[reg]);
+ op->addr.mem.seg = seg;
+}
+
+static int em_push(struct x86_emulate_ctxt *ctxt)
+{
+ emulate_push(ctxt, ctxt->ops);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_das(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ u8 al, old_al;
+ bool af, cf, old_cf;
+
+ cf = ctxt->eflags & X86_EFLAGS_CF;
+ al = c->dst.val;
+
+ old_al = al;
+ old_cf = cf;
+ cf = false;
+ af = ctxt->eflags & X86_EFLAGS_AF;
+ if ((al & 0x0f) > 9 || af) {
+ al -= 6;
+ cf = old_cf | (al >= 250);
+ af = true;
+ } else {
+ af = false;
+ }
+ if (old_al > 0x99 || old_cf) {
+ al -= 0x60;
+ cf = true;
+ }
+
+ c->dst.val = al;
+ /* Set PF, ZF, SF */
+ c->src.type = OP_IMM;
+ c->src.val = 0;
+ c->src.bytes = 1;
+ emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
+ ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
+ if (cf)
+ ctxt->eflags |= X86_EFLAGS_CF;
+ if (af)
+ ctxt->eflags |= X86_EFLAGS_AF;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_call_far(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ u16 sel, old_cs;
+ ulong old_eip;
+ int rc;
+
+ old_cs = ctxt->ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
+ old_eip = c->eip;
+
+ memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+ if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS))
+ return X86EMUL_CONTINUE;
+
+ c->eip = 0;
+ memcpy(&c->eip, c->src.valptr, c->op_bytes);
+
+ c->src.val = old_cs;
+ emulate_push(ctxt, ctxt->ops);
+ rc = writeback(ctxt, ctxt->ops);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ c->src.val = old_eip;
+ emulate_push(ctxt, ctxt->ops);
+ rc = writeback(ctxt, ctxt->ops);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ c->dst.type = OP_NONE;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc;
+
+ c->dst.type = OP_REG;
+ c->dst.addr.reg = &c->eip;
+ c->dst.bytes = c->op_bytes;
+ rc = emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_imul(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ c->dst.val = c->src2.val;
+ return em_imul(ctxt);
+}
+
+static int em_cwd(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ c->dst.type = OP_REG;
+ c->dst.bytes = c->src.bytes;
+ c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
+ c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned cpl = ctxt->ops->cpl(ctxt->vcpu);
+ struct decode_cache *c = &ctxt->decode;
+ u64 tsc = 0;
+
+ if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD))
+ return emulate_gp(ctxt, 0);
+ ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc);
+ c->regs[VCPU_REGS_RAX] = (u32)tsc;
+ c->regs[VCPU_REGS_RDX] = tsc >> 32;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_mov(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ c->dst.val = c->src.val;
+ return X86EMUL_CONTINUE;
+}
+
+#define D(_y) { .flags = (_y) }
+#define N D(0)
+#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
+#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) }
+#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
+
+#define D2bv(_f) D((_f) | ByteOp), D(_f)
+#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
+
+#define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM), \
+ D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \
+ D2bv(((_f) & ~Lock) | DstAcc | SrcImm)
+
+
+static struct opcode group1[] = {
+ X7(D(Lock)), N
+};
+
+static struct opcode group1A[] = {
+ D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N,
+};
+
+static struct opcode group3[] = {
+ D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM),
+ D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
+ X4(D(SrcMem | ModRM)),
+};
+
+static struct opcode group4[] = {
+ D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock),
+ N, N, N, N, N, N,
+};
+
+static struct opcode group5[] = {
+ D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
+ D(SrcMem | ModRM | Stack),
+ I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far),
+ D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps),
+ D(SrcMem | ModRM | Stack), N,
+};
+
+static struct group_dual group7 = { {
+ N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv),
+ D(SrcNone | ModRM | DstMem | Mov), N,
+ D(SrcMem16 | ModRM | Mov | Priv),
+ D(SrcMem | ModRM | ByteOp | Priv | NoAccess),
+}, {
+ D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv),
+ D(SrcNone | ModRM | DstMem | Mov), N,
+ D(SrcMem16 | ModRM | Mov | Priv), N,
+} };
+
+static struct opcode group8[] = {
+ N, N, N, N,
+ D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock),
+ D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock),
+};
+
+static struct group_dual group9 = { {
+ N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N,
+}, {
+ N, N, N, N, N, N, N, N,
+} };
+
+static struct opcode group11[] = {
+ I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)),
+};
+
+static struct opcode opcode_table[256] = {
+ /* 0x00 - 0x07 */
+ D6ALU(Lock),
+ D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+ /* 0x08 - 0x0F */
+ D6ALU(Lock),
+ D(ImplicitOps | Stack | No64), N,
+ /* 0x10 - 0x17 */
+ D6ALU(Lock),
+ D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+ /* 0x18 - 0x1F */
+ D6ALU(Lock),
+ D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+ /* 0x20 - 0x27 */
+ D6ALU(Lock), N, N,
+ /* 0x28 - 0x2F */
+ D6ALU(Lock), N, I(ByteOp | DstAcc | No64, em_das),
+ /* 0x30 - 0x37 */
+ D6ALU(Lock), N, N,
+ /* 0x38 - 0x3F */
+ D6ALU(0), N, N,
+ /* 0x40 - 0x4F */
+ X16(D(DstReg)),
+ /* 0x50 - 0x57 */
+ X8(I(SrcReg | Stack, em_push)),
+ /* 0x58 - 0x5F */
+ X8(D(DstReg | Stack)),
+ /* 0x60 - 0x67 */
+ D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+ N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ ,
+ N, N, N, N,
+ /* 0x68 - 0x6F */
+ I(SrcImm | Mov | Stack, em_push),
+ I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
+ I(SrcImmByte | Mov | Stack, em_push),
+ I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
+ D2bv(DstDI | Mov | String), /* insb, insw/insd */
+ D2bv(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */
+ /* 0x70 - 0x7F */
+ X16(D(SrcImmByte)),
+ /* 0x80 - 0x87 */
+ G(ByteOp | DstMem | SrcImm | ModRM | Group, group1),
+ G(DstMem | SrcImm | ModRM | Group, group1),
+ G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
+ G(DstMem | SrcImmByte | ModRM | Group, group1),
+ D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock),
+ /* 0x88 - 0x8F */
+ I2bv(DstMem | SrcReg | ModRM | Mov, em_mov),
+ I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
+ D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg),
+ D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
+ /* 0x90 - 0x97 */
+ X8(D(SrcAcc | DstReg)),
+ /* 0x98 - 0x9F */
+ D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
+ I(SrcImmFAddr | No64, em_call_far), N,
+ D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N,
+ /* 0xA0 - 0xA7 */
+ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
+ I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov),
+ I2bv(SrcSI | DstDI | Mov | String, em_mov),
+ D2bv(SrcSI | DstDI | String),
+ /* 0xA8 - 0xAF */
+ D2bv(DstAcc | SrcImm),
+ I2bv(SrcAcc | DstDI | Mov | String, em_mov),
+ I2bv(SrcSI | DstAcc | Mov | String, em_mov),
+ D2bv(SrcAcc | DstDI | String),
+ /* 0xB0 - 0xB7 */
+ X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
+ /* 0xB8 - 0xBF */
+ X8(I(DstReg | SrcImm | Mov, em_mov)),
+ /* 0xC0 - 0xC7 */
+ D2bv(DstMem | SrcImmByte | ModRM),
+ I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
+ D(ImplicitOps | Stack),
+ D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64),
+ G(ByteOp, group11), G(0, group11),
+ /* 0xC8 - 0xCF */
+ N, N, N, D(ImplicitOps | Stack),
+ D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps),
+ /* 0xD0 - 0xD7 */
+ D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
+ N, N, N, N,
+ /* 0xD8 - 0xDF */
+ N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xE7 */
+ X4(D(SrcImmByte)),
+ D2bv(SrcImmUByte | DstAcc), D2bv(SrcAcc | DstImmUByte),
+ /* 0xE8 - 0xEF */
+ D(SrcImm | Stack), D(SrcImm | ImplicitOps),
+ D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps),
+ D2bv(SrcNone | DstAcc), D2bv(SrcAcc | ImplicitOps),
+ /* 0xF0 - 0xF7 */
+ N, N, N, N,
+ D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3),
+ /* 0xF8 - 0xFF */
+ D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps),
+ D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
+};
+
+static struct opcode twobyte_table[256] = {
+ /* 0x00 - 0x0F */
+ N, GD(0, &group7), N, N,
+ N, D(ImplicitOps), D(ImplicitOps | Priv), N,
+ D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N,
+ N, D(ImplicitOps | ModRM), N, N,
+ /* 0x10 - 0x1F */
+ N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N,
+ /* 0x20 - 0x2F */
+ D(ModRM | DstMem | Priv | Op3264), D(ModRM | DstMem | Priv | Op3264),
+ D(ModRM | SrcMem | Priv | Op3264), D(ModRM | SrcMem | Priv | Op3264),
+ N, N, N, N,
+ N, N, N, N, N, N, N, N,
+ /* 0x30 - 0x3F */
+ D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc),
+ D(ImplicitOps | Priv), N,
+ D(ImplicitOps), D(ImplicitOps | Priv), N, N,
+ N, N, N, N, N, N, N, N,
+ /* 0x40 - 0x4F */
+ X16(D(DstReg | SrcMem | ModRM | Mov)),
+ /* 0x50 - 0x5F */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ /* 0x60 - 0x6F */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ /* 0x70 - 0x7F */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ /* 0x80 - 0x8F */
+ X16(D(SrcImm)),
+ /* 0x90 - 0x9F */
+ X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
+ /* 0xA0 - 0xA7 */
+ D(ImplicitOps | Stack), D(ImplicitOps | Stack),
+ N, D(DstMem | SrcReg | ModRM | BitOp),
+ D(DstMem | SrcReg | Src2ImmByte | ModRM),
+ D(DstMem | SrcReg | Src2CL | ModRM), N, N,
+ /* 0xA8 - 0xAF */
+ D(ImplicitOps | Stack), D(ImplicitOps | Stack),
+ N, D(DstMem | SrcReg | ModRM | BitOp | Lock),
+ D(DstMem | SrcReg | Src2ImmByte | ModRM),
+ D(DstMem | SrcReg | Src2CL | ModRM),
+ D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
+ /* 0xB0 - 0xB7 */
+ D2bv(DstMem | SrcReg | ModRM | Lock),
+ D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock),
+ D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM),
+ D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+ /* 0xB8 - 0xBF */
+ N, N,
+ G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock),
+ D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
+ D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+ /* 0xC0 - 0xCF */
+ D2bv(DstMem | SrcReg | ModRM | Lock),
+ N, D(DstMem | SrcReg | ModRM | Mov),
+ N, N, N, GD(0, &group9),
+ N, N, N, N, N, N, N, N,
+ /* 0xD0 - 0xDF */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xEF */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xFF */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
+};
+
+#undef D
+#undef N
+#undef G
+#undef GD
+#undef I
+
+#undef D2bv
+#undef I2bv
+#undef D6ALU
+
+static unsigned imm_size(struct decode_cache *c)
+{
+ unsigned size;
+
+ size = (c->d & ByteOp) ? 1 : c->op_bytes;
+ if (size == 8)
+ size = 4;
+ return size;
+}
+
+static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
+ unsigned size, bool sign_extension)
+{
+ struct decode_cache *c = &ctxt->decode;
+ struct x86_emulate_ops *ops = ctxt->ops;
+ int rc = X86EMUL_CONTINUE;
+
+ op->type = OP_IMM;
+ op->bytes = size;
+ op->addr.mem.ea = c->eip;
+ /* NB. Immediates are sign-extended as necessary. */
+ switch (op->bytes) {
+ case 1:
+ op->val = insn_fetch(s8, 1, c->eip);
+ break;
+ case 2:
+ op->val = insn_fetch(s16, 2, c->eip);
+ break;
+ case 4:
+ op->val = insn_fetch(s32, 4, c->eip);
+ break;
+ }
+ if (!sign_extension) {
+ switch (op->bytes) {
+ case 1:
+ op->val &= 0xff;
+ break;
+ case 2:
+ op->val &= 0xffff;
+ break;
+ case 4:
+ op->val &= 0xffffffff;
+ break;
+ }
+ }
+done:
+ return rc;
+}
+
+int
+x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
+{
+ struct x86_emulate_ops *ops = ctxt->ops;
+ struct decode_cache *c = &ctxt->decode;
+ int rc = X86EMUL_CONTINUE;
+ int mode = ctxt->mode;
+ int def_op_bytes, def_ad_bytes, dual, goffset;
+ struct opcode opcode, *g_mod012, *g_mod3;
+ struct operand memop = { .type = OP_NONE };
+
+ c->eip = ctxt->eip;
+ c->fetch.start = c->eip;
+ c->fetch.end = c->fetch.start + insn_len;
+ if (insn_len > 0)
+ memcpy(c->fetch.data, insn, insn_len);
+ ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
+
+ switch (mode) {
+ case X86EMUL_MODE_REAL:
+ case X86EMUL_MODE_VM86:
+ case X86EMUL_MODE_PROT16:
+ def_op_bytes = def_ad_bytes = 2;
+ break;
+ case X86EMUL_MODE_PROT32:
+ def_op_bytes = def_ad_bytes = 4;
+ break;
+#ifdef CONFIG_X86_64
+ case X86EMUL_MODE_PROT64:
+ def_op_bytes = 4;
+ def_ad_bytes = 8;
+ break;
+#endif
+ default:
+ return -1;
+ }
+
+ c->op_bytes = def_op_bytes;
+ c->ad_bytes = def_ad_bytes;
+
+ /* Legacy prefixes. */
+ for (;;) {
+ switch (c->b = insn_fetch(u8, 1, c->eip)) {
+ case 0x66: /* operand-size override */
+ /* switch between 2/4 bytes */
+ c->op_bytes = def_op_bytes ^ 6;
+ break;
+ case 0x67: /* address-size override */
+ if (mode == X86EMUL_MODE_PROT64)
+ /* switch between 4/8 bytes */
+ c->ad_bytes = def_ad_bytes ^ 12;
+ else
+ /* switch between 2/4 bytes */
+ c->ad_bytes = def_ad_bytes ^ 6;
+ break;
+ case 0x26: /* ES override */
+ case 0x2e: /* CS override */
+ case 0x36: /* SS override */
+ case 0x3e: /* DS override */
+ set_seg_override(c, (c->b >> 3) & 3);
+ break;
+ case 0x64: /* FS override */
+ case 0x65: /* GS override */
+ set_seg_override(c, c->b & 7);
+ break;
+ case 0x40 ... 0x4f: /* REX */
+ if (mode != X86EMUL_MODE_PROT64)
+ goto done_prefixes;
+ c->rex_prefix = c->b;
+ continue;
+ case 0xf0: /* LOCK */
+ c->lock_prefix = 1;
+ break;
+ case 0xf2: /* REPNE/REPNZ */
+ c->rep_prefix = REPNE_PREFIX;
+ break;
+ case 0xf3: /* REP/REPE/REPZ */
+ c->rep_prefix = REPE_PREFIX;
+ break;
+ default:
+ goto done_prefixes;
+ }
+
+ /* Any legacy prefix after a REX prefix nullifies its effect. */
+
+ c->rex_prefix = 0;
+ }
+
+done_prefixes:
+
+ /* REX prefix. */
+ if (c->rex_prefix & 8)
+ c->op_bytes = 8; /* REX.W */
+
+ /* Opcode byte(s). */
+ opcode = opcode_table[c->b];
+ /* Two-byte opcode? */
+ if (c->b == 0x0f) {
+ c->twobyte = 1;
+ c->b = insn_fetch(u8, 1, c->eip);
+ opcode = twobyte_table[c->b];
+ }
+ c->d = opcode.flags;
+
+ if (c->d & Group) {
+ dual = c->d & GroupDual;
+ c->modrm = insn_fetch(u8, 1, c->eip);
+ --c->eip;
+
+ if (c->d & GroupDual) {
+ g_mod012 = opcode.u.gdual->mod012;
+ g_mod3 = opcode.u.gdual->mod3;
+ } else
+ g_mod012 = g_mod3 = opcode.u.group;
+
+ c->d &= ~(Group | GroupDual);
+
+ goffset = (c->modrm >> 3) & 7;
+
+ if ((c->modrm >> 6) == 3)
+ opcode = g_mod3[goffset];
+ else
+ opcode = g_mod012[goffset];
+ c->d |= opcode.flags;
+ }
+
+ c->execute = opcode.u.execute;
+
+ /* Unrecognised? */
+ if (c->d == 0 || (c->d & Undefined))
+ return -1;
+
+ if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
+ c->op_bytes = 8;
+
+ if (c->d & Op3264) {
+ if (mode == X86EMUL_MODE_PROT64)
+ c->op_bytes = 8;
+ else
+ c->op_bytes = 4;
+ }
+
+ /* ModRM and SIB bytes. */
+ if (c->d & ModRM) {
+ rc = decode_modrm(ctxt, ops, &memop);
+ if (!c->has_seg_override)
+ set_seg_override(c, c->modrm_seg);
+ } else if (c->d & MemAbs)
+ rc = decode_abs(ctxt, ops, &memop);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ if (!c->has_seg_override)
+ set_seg_override(c, VCPU_SREG_DS);
+
+ memop.addr.mem.seg = seg_override(ctxt, ops, c);
+
+ if (memop.type == OP_MEM && c->ad_bytes != 8)
+ memop.addr.mem.ea = (u32)memop.addr.mem.ea;
+
+ if (memop.type == OP_MEM && c->rip_relative)
+ memop.addr.mem.ea += c->eip;
+
+ /*
+ * Decode and fetch the source operand: register, memory
+ * or immediate.
+ */
+ switch (c->d & SrcMask) {
+ case SrcNone:
+ break;
+ case SrcReg:
+ decode_register_operand(&c->src, c, 0);
+ break;
+ case SrcMem16:
+ memop.bytes = 2;
+ goto srcmem_common;
+ case SrcMem32:
+ memop.bytes = 4;
+ goto srcmem_common;
+ case SrcMem:
+ memop.bytes = (c->d & ByteOp) ? 1 :
+ c->op_bytes;
+ srcmem_common:
+ c->src = memop;
+ break;
+ case SrcImmU16:
+ rc = decode_imm(ctxt, &c->src, 2, false);
+ break;
+ case SrcImm:
+ rc = decode_imm(ctxt, &c->src, imm_size(c), true);
+ break;
+ case SrcImmU:
+ rc = decode_imm(ctxt, &c->src, imm_size(c), false);
+ break;
+ case SrcImmByte:
+ rc = decode_imm(ctxt, &c->src, 1, true);
+ break;
+ case SrcImmUByte:
+ rc = decode_imm(ctxt, &c->src, 1, false);
+ break;
+ case SrcAcc:
+ c->src.type = OP_REG;
+ c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->src.addr.reg = &c->regs[VCPU_REGS_RAX];
+ fetch_register_operand(&c->src);
+ break;
+ case SrcOne:
+ c->src.bytes = 1;
+ c->src.val = 1;
+ break;
+ case SrcSI:
+ c->src.type = OP_MEM;
+ c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->src.addr.mem.ea =
+ register_address(c, c->regs[VCPU_REGS_RSI]);
+ c->src.addr.mem.seg = seg_override(ctxt, ops, c),
+ c->src.val = 0;
+ break;
+ case SrcImmFAddr:
+ c->src.type = OP_IMM;
+ c->src.addr.mem.ea = c->eip;
+ c->src.bytes = c->op_bytes + 2;
+ insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
+ break;
+ case SrcMemFAddr:
+ memop.bytes = c->op_bytes + 2;
+ goto srcmem_common;
+ break;
+ }
+
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ /*
+ * Decode and fetch the second source operand: register, memory
+ * or immediate.
+ */
+ switch (c->d & Src2Mask) {
+ case Src2None:
+ break;
+ case Src2CL:
+ c->src2.bytes = 1;
+ c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
+ break;
+ case Src2ImmByte:
+ rc = decode_imm(ctxt, &c->src2, 1, true);
+ break;
+ case Src2One:
+ c->src2.bytes = 1;
+ c->src2.val = 1;
+ break;
+ case Src2Imm:
+ rc = decode_imm(ctxt, &c->src2, imm_size(c), true);
+ break;
+ }
+
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ /* Decode and fetch the destination operand: register or memory. */
+ switch (c->d & DstMask) {
+ case DstReg:
+ decode_register_operand(&c->dst, c,
+ c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
+ break;
+ case DstImmUByte:
+ c->dst.type = OP_IMM;
+ c->dst.addr.mem.ea = c->eip;
+ c->dst.bytes = 1;
+ c->dst.val = insn_fetch(u8, 1, c->eip);
+ break;
+ case DstMem:
+ case DstMem64:
+ c->dst = memop;
+ if ((c->d & DstMask) == DstMem64)
+ c->dst.bytes = 8;
+ else
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ if (c->d & BitOp)
+ fetch_bit_operand(c);
+ c->dst.orig_val = c->dst.val;
+ break;
+ case DstAcc:
+ c->dst.type = OP_REG;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.addr.reg = &c->regs[VCPU_REGS_RAX];
+ fetch_register_operand(&c->dst);
+ c->dst.orig_val = c->dst.val;
+ break;
+ case DstDI:
+ c->dst.type = OP_MEM;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.addr.mem.ea =
+ register_address(c, c->regs[VCPU_REGS_RDI]);
+ c->dst.addr.mem.seg = VCPU_SREG_ES;
+ c->dst.val = 0;
+ break;
+ case ImplicitOps:
+ /* Special instructions do their own operand decoding. */
+ default:
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ return 0;
+ }
+
+done:
+ return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+}
+
+static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ /* The second termination condition only applies for REPE
+ * and REPNE. Test if the repeat string operation prefix is
+ * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
+ * corresponding termination condition according to:
+ * - if REPE/REPZ and ZF = 0 then done
+ * - if REPNE/REPNZ and ZF = 1 then done
+ */
+ if (((c->b == 0xa6) || (c->b == 0xa7) ||
+ (c->b == 0xae) || (c->b == 0xaf))
+ && (((c->rep_prefix == REPE_PREFIX) &&
+ ((ctxt->eflags & EFLG_ZF) == 0))
+ || ((c->rep_prefix == REPNE_PREFIX) &&
+ ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))))
+ return true;
+
+ return false;
+}
+
+int
+x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
+{
+ struct x86_emulate_ops *ops = ctxt->ops;
+ u64 msr_data;
+ struct decode_cache *c = &ctxt->decode;
+ int rc = X86EMUL_CONTINUE;
+ int saved_dst_type = c->dst.type;
+ int irq; /* Used for int 3, int, and into */
+
+ ctxt->decode.mem_read.pos = 0;
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ /* LOCK prefix is allowed only with some instructions */
+ if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ /* Privileged instruction can be executed only in CPL=0 */
+ if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
+ rc = emulate_gp(ctxt, 0);
+ goto done;
+ }
+
+ if (c->rep_prefix && (c->d & String)) {
+ /* All REP prefixes have the same first termination condition */
+ if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
+ ctxt->eip = c->eip;
+ goto done;
+ }
+ }
+
+ if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
+ rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem),
+ c->src.valptr, c->src.bytes);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ c->src.orig_val64 = c->src.val64;
+ }
+
+ if (c->src2.type == OP_MEM) {
+ rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem),
+ &c->src2.val, c->src2.bytes);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if ((c->d & DstMask) == ImplicitOps)
+ goto special_insn;
+
+
+ if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
+ /* optimisation - avoid slow emulated read if Mov */
+ rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem),
+ &c->dst.val, c->dst.bytes);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+ c->dst.orig_val = c->dst.val;
+
+special_insn:
+
+ if (c->execute) {
+ rc = c->execute(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ goto writeback;
+ }
+
+ if (c->twobyte)
+ goto twobyte_insn;
+
+ switch (c->b) {
+ case 0x00 ... 0x05:
+ add: /* add */
+ emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x06: /* push es */
+ emulate_push_sreg(ctxt, ops, VCPU_SREG_ES);
+ break;
+ case 0x07: /* pop es */
+ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
+ break;
+ case 0x08 ... 0x0d:
+ or: /* or */
+ emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x0e: /* push cs */
+ emulate_push_sreg(ctxt, ops, VCPU_SREG_CS);
+ break;
+ case 0x10 ... 0x15:
+ adc: /* adc */
+ emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x16: /* push ss */
+ emulate_push_sreg(ctxt, ops, VCPU_SREG_SS);
+ break;
+ case 0x17: /* pop ss */
+ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
+ break;
+ case 0x18 ... 0x1d:
+ sbb: /* sbb */
+ emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x1e: /* push ds */
+ emulate_push_sreg(ctxt, ops, VCPU_SREG_DS);
+ break;
+ case 0x1f: /* pop ds */
+ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
+ break;
+ case 0x20 ... 0x25:
+ and: /* and */
+ emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x28 ... 0x2d:
+ sub: /* sub */
+ emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x30 ... 0x35:
+ xor: /* xor */
+ emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x38 ... 0x3d:
+ cmp: /* cmp */
+ emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x40 ... 0x47: /* inc r16/r32 */
+ emulate_1op("inc", c->dst, ctxt->eflags);
+ break;
+ case 0x48 ... 0x4f: /* dec r16/r32 */
+ emulate_1op("dec", c->dst, ctxt->eflags);
+ break;
+ case 0x58 ... 0x5f: /* pop reg */
+ pop_instruction:
+ rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
+ break;
+ case 0x60: /* pusha */
+ rc = emulate_pusha(ctxt, ops);
+ break;
+ case 0x61: /* popa */
+ rc = emulate_popa(ctxt, ops);
+ break;
+ case 0x63: /* movsxd */
+ if (ctxt->mode != X86EMUL_MODE_PROT64)
+ goto cannot_emulate;
+ c->dst.val = (s32) c->src.val;
+ break;
+ case 0x6c: /* insb */
+ case 0x6d: /* insw/insd */
+ c->src.val = c->regs[VCPU_REGS_RDX];
+ goto do_io_in;
+ case 0x6e: /* outsb */
+ case 0x6f: /* outsw/outsd */
+ c->dst.val = c->regs[VCPU_REGS_RDX];
+ goto do_io_out;
+ break;
+ case 0x70 ... 0x7f: /* jcc (short) */
+ if (test_cc(c->b, ctxt->eflags))
+ jmp_rel(c, c->src.val);
+ break;
+ case 0x80 ... 0x83: /* Grp1 */
+ switch (c->modrm_reg) {
+ case 0:
+ goto add;
+ case 1:
+ goto or;
+ case 2:
+ goto adc;
+ case 3:
+ goto sbb;
+ case 4:
+ goto and;
+ case 5:
+ goto sub;
+ case 6:
+ goto xor;
+ case 7:
+ goto cmp;
+ }
+ break;
+ case 0x84 ... 0x85:
+ test:
+ emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x86 ... 0x87: /* xchg */
+ xchg:
+ /* Write back the register source. */
+ c->src.val = c->dst.val;
+ write_register_operand(&c->src);
+ /*
+ * Write back the memory destination with implicit LOCK
+ * prefix.
+ */
+ c->dst.val = c->src.orig_val;
+ c->lock_prefix = 1;
+ break;
+ case 0x8c: /* mov r/m, sreg */
+ if (c->modrm_reg > VCPU_SREG_GS) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+ c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
+ break;
+ case 0x8d: /* lea r16/r32, m */
+ c->dst.val = c->src.addr.mem.ea;
+ break;
+ case 0x8e: { /* mov seg, r/m16 */
+ uint16_t sel;
+
+ sel = c->src.val;
+
+ if (c->modrm_reg == VCPU_SREG_CS ||
+ c->modrm_reg > VCPU_SREG_GS) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ if (c->modrm_reg == VCPU_SREG_SS)
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+
+ rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
+
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ break;
+ }
+ case 0x8f: /* pop (sole member of Grp1a) */
+ rc = emulate_grp1a(ctxt, ops);
+ break;
+ case 0x90 ... 0x97: /* nop / xchg reg, rax */
+ if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX])
+ break;
+ goto xchg;
+ case 0x98: /* cbw/cwde/cdqe */
+ switch (c->op_bytes) {
+ case 2: c->dst.val = (s8)c->dst.val; break;
+ case 4: c->dst.val = (s16)c->dst.val; break;
+ case 8: c->dst.val = (s32)c->dst.val; break;
+ }
+ break;
+ case 0x9c: /* pushf */
+ c->src.val = (unsigned long) ctxt->eflags;
+ emulate_push(ctxt, ops);
+ break;
+ case 0x9d: /* popf */
+ c->dst.type = OP_REG;
+ c->dst.addr.reg = &ctxt->eflags;
+ c->dst.bytes = c->op_bytes;
+ rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
+ break;
+ case 0xa6 ... 0xa7: /* cmps */
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ goto cmp;
+ case 0xa8 ... 0xa9: /* test ax, imm */
+ goto test;
+ case 0xae ... 0xaf: /* scas */
+ goto cmp;
+ case 0xc0 ... 0xc1:
+ emulate_grp2(ctxt);
+ break;
+ case 0xc3: /* ret */
+ c->dst.type = OP_REG;
+ c->dst.addr.reg = &c->eip;
+ c->dst.bytes = c->op_bytes;
+ goto pop_instruction;
+ case 0xc4: /* les */
+ rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES);
+ break;
+ case 0xc5: /* lds */
+ rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS);
+ break;
+ case 0xcb: /* ret far */
+ rc = emulate_ret_far(ctxt, ops);
+ break;
+ case 0xcc: /* int3 */
+ irq = 3;
+ goto do_interrupt;
+ case 0xcd: /* int n */
+ irq = c->src.val;
+ do_interrupt:
+ rc = emulate_int(ctxt, ops, irq);
+ break;
+ case 0xce: /* into */
+ if (ctxt->eflags & EFLG_OF) {
+ irq = 4;
+ goto do_interrupt;
+ }
+ break;
+ case 0xcf: /* iret */
+ rc = emulate_iret(ctxt, ops);
+ break;
+ case 0xd0 ... 0xd1: /* Grp2 */
+ emulate_grp2(ctxt);
+ break;
+ case 0xd2 ... 0xd3: /* Grp2 */
+ c->src.val = c->regs[VCPU_REGS_RCX];
+ emulate_grp2(ctxt);
+ break;
+ case 0xe0 ... 0xe2: /* loop/loopz/loopnz */
+ register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
+ if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 &&
+ (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags)))
+ jmp_rel(c, c->src.val);
+ break;
+ case 0xe3: /* jcxz/jecxz/jrcxz */
+ if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0)
+ jmp_rel(c, c->src.val);
+ break;
+ case 0xe4: /* inb */
+ case 0xe5: /* in */
+ goto do_io_in;
+ case 0xe6: /* outb */
+ case 0xe7: /* out */
+ goto do_io_out;
+ case 0xe8: /* call (near) */ {
+ long int rel = c->src.val;
+ c->src.val = (unsigned long) c->eip;
+ jmp_rel(c, rel);
+ emulate_push(ctxt, ops);
+ break;
+ }
+ case 0xe9: /* jmp rel */
+ goto jmp;
+ case 0xea: { /* jmp far */
+ unsigned short sel;
+ jump_far:
+ memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+
+ if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS))
+ goto done;
+
+ c->eip = 0;
+ memcpy(&c->eip, c->src.valptr, c->op_bytes);
+ break;
+ }
+ case 0xeb:
+ jmp: /* jmp rel short */
+ jmp_rel(c, c->src.val);
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ break;
+ case 0xec: /* in al,dx */
+ case 0xed: /* in (e/r)ax,dx */
+ c->src.val = c->regs[VCPU_REGS_RDX];
+ do_io_in:
+ c->dst.bytes = min(c->dst.bytes, 4u);
+ if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
+ rc = emulate_gp(ctxt, 0);
+ goto done;
+ }
+ if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
+ &c->dst.val))
+ goto done; /* IO is needed */
+ break;
+ case 0xee: /* out dx,al */
+ case 0xef: /* out dx,(e/r)ax */
+ c->dst.val = c->regs[VCPU_REGS_RDX];
+ do_io_out:
+ c->src.bytes = min(c->src.bytes, 4u);
+ if (!emulator_io_permited(ctxt, ops, c->dst.val,
+ c->src.bytes)) {
+ rc = emulate_gp(ctxt, 0);
+ goto done;
+ }
+ ops->pio_out_emulated(c->src.bytes, c->dst.val,
+ &c->src.val, 1, ctxt->vcpu);
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ break;
+ case 0xf4: /* hlt */
+ ctxt->vcpu->arch.halt_request = 1;
+ break;
+ case 0xf5: /* cmc */
+ /* complement carry flag from eflags reg */
+ ctxt->eflags ^= EFLG_CF;
+ break;
+ case 0xf6 ... 0xf7: /* Grp3 */
+ rc = emulate_grp3(ctxt, ops);
+ break;
+ case 0xf8: /* clc */
+ ctxt->eflags &= ~EFLG_CF;
+ break;
+ case 0xf9: /* stc */
+ ctxt->eflags |= EFLG_CF;
+ break;
+ case 0xfa: /* cli */
+ if (emulator_bad_iopl(ctxt, ops)) {
+ rc = emulate_gp(ctxt, 0);
+ goto done;
+ } else
+ ctxt->eflags &= ~X86_EFLAGS_IF;
+ break;
+ case 0xfb: /* sti */
+ if (emulator_bad_iopl(ctxt, ops)) {
+ rc = emulate_gp(ctxt, 0);
+ goto done;
+ } else {
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
+ ctxt->eflags |= X86_EFLAGS_IF;
+ }
+ break;
+ case 0xfc: /* cld */
+ ctxt->eflags &= ~EFLG_DF;
+ break;
+ case 0xfd: /* std */
+ ctxt->eflags |= EFLG_DF;
+ break;
+ case 0xfe: /* Grp4 */
+ grp45:
+ rc = emulate_grp45(ctxt, ops);
+ break;
+ case 0xff: /* Grp5 */
+ if (c->modrm_reg == 5)
+ goto jump_far;
+ goto grp45;
+ default:
+ goto cannot_emulate;
+ }
+
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+writeback:
+ rc = writeback(ctxt, ops);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ /*
+ * restore dst type in case the decoding will be reused
+ * (happens for string instruction )
+ */
+ c->dst.type = saved_dst_type;
+
+ if ((c->d & SrcMask) == SrcSI)
+ string_addr_inc(ctxt, seg_override(ctxt, ops, c),
+ VCPU_REGS_RSI, &c->src);
+
+ if ((c->d & DstMask) == DstDI)
+ string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
+ &c->dst);
+
+ if (c->rep_prefix && (c->d & String)) {
+ struct read_cache *r = &ctxt->decode.io_read;
+ register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
+
+ if (!string_insn_completed(ctxt)) {
+ /*
+ * Re-enter guest when pio read ahead buffer is empty
+ * or, if it is not used, after each 1024 iteration.
+ */
+ if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) &&
+ (r->end == 0 || r->end != r->pos)) {
+ /*
+ * Reset read cache. Usually happens before
+ * decode, but since instruction is restarted
+ * we have to do it here.
+ */
+ ctxt->decode.mem_read.end = 0;
+ return EMULATION_RESTART;
+ }
+ goto done; /* skip rip writeback */
+ }
+ }
+
+ ctxt->eip = c->eip;
+
+done:
+ if (rc == X86EMUL_PROPAGATE_FAULT)
+ ctxt->have_exception = true;
+ return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
+
+twobyte_insn:
+ switch (c->b) {
+ case 0x01: /* lgdt, lidt, lmsw */
+ switch (c->modrm_reg) {
+ u16 size;
+ unsigned long address;
+
+ case 0: /* vmcall */
+ if (c->modrm_mod != 3 || c->modrm_rm != 1)
+ goto cannot_emulate;
+
+ rc = kvm_fix_hypercall(ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ /* Let the processor re-execute the fixed hypercall */
+ c->eip = ctxt->eip;
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
+ break;
+ case 2: /* lgdt */
+ rc = read_descriptor(ctxt, ops, c->src.addr.mem,
+ &size, &address, c->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ realmode_lgdt(ctxt->vcpu, size, address);
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
+ break;
+ case 3: /* lidt/vmmcall */
+ if (c->modrm_mod == 3) {
+ switch (c->modrm_rm) {
+ case 1:
+ rc = kvm_fix_hypercall(ctxt->vcpu);
+ break;
+ default:
+ goto cannot_emulate;
+ }
+ } else {
+ rc = read_descriptor(ctxt, ops, c->src.addr.mem,
+ &size, &address,
+ c->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ realmode_lidt(ctxt->vcpu, size, address);
+ }
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
+ break;
+ case 4: /* smsw */
+ c->dst.bytes = 2;
+ c->dst.val = ops->get_cr(0, ctxt->vcpu);
+ break;
+ case 6: /* lmsw */
+ ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0eul) |
+ (c->src.val & 0x0f), ctxt->vcpu);
+ c->dst.type = OP_NONE;
+ break;
+ case 5: /* not defined */
+ emulate_ud(ctxt);
+ rc = X86EMUL_PROPAGATE_FAULT;
+ goto done;
+ case 7: /* invlpg*/
+ emulate_invlpg(ctxt->vcpu,
+ linear(ctxt, c->src.addr.mem));
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
+ break;
+ default:
+ goto cannot_emulate;
+ }
+ break;
+ case 0x05: /* syscall */
+ rc = emulate_syscall(ctxt, ops);
+ break;
+ case 0x06:
+ emulate_clts(ctxt->vcpu);
+ break;
+ case 0x09: /* wbinvd */
+ kvm_emulate_wbinvd(ctxt->vcpu);
+ break;
+ case 0x08: /* invd */
+ case 0x0d: /* GrpP (prefetch) */
+ case 0x18: /* Grp16 (prefetch/nop) */
+ break;
+ case 0x20: /* mov cr, reg */
+ switch (c->modrm_reg) {
+ case 1:
+ case 5 ... 7:
+ case 9 ... 15:
+ emulate_ud(ctxt);
+ rc = X86EMUL_PROPAGATE_FAULT;
+ goto done;
+ }
+ c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu);
+ break;
+ case 0x21: /* mov from dr to reg */
+ if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
+ (c->modrm_reg == 4 || c->modrm_reg == 5)) {
+ emulate_ud(ctxt);
+ rc = X86EMUL_PROPAGATE_FAULT;
+ goto done;
+ }
+ ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu);
+ break;
+ case 0x22: /* mov reg, cr */
+ if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) {
+ emulate_gp(ctxt, 0);
+ rc = X86EMUL_PROPAGATE_FAULT;
+ goto done;
+ }
+ c->dst.type = OP_NONE;
+ break;
+ case 0x23: /* mov from reg to dr */
+ if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
+ (c->modrm_reg == 4 || c->modrm_reg == 5)) {
+ emulate_ud(ctxt);
+ rc = X86EMUL_PROPAGATE_FAULT;
+ goto done;
+ }
+
+ if (ops->set_dr(c->modrm_reg, c->src.val &
+ ((ctxt->mode == X86EMUL_MODE_PROT64) ?
+ ~0ULL : ~0U), ctxt->vcpu) < 0) {
+ /* #UD condition is already handled by the code above */
+ emulate_gp(ctxt, 0);
+ rc = X86EMUL_PROPAGATE_FAULT;
+ goto done;
+ }
+
+ c->dst.type = OP_NONE; /* no writeback */
+ break;
+ case 0x30:
+ /* wrmsr */
+ msr_data = (u32)c->regs[VCPU_REGS_RAX]
+ | ((u64)c->regs[VCPU_REGS_RDX] << 32);
+ if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
+ emulate_gp(ctxt, 0);
+ rc = X86EMUL_PROPAGATE_FAULT;
+ goto done;
+ }
+ rc = X86EMUL_CONTINUE;
+ break;
+ case 0x32:
+ /* rdmsr */
+ if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
+ emulate_gp(ctxt, 0);
+ rc = X86EMUL_PROPAGATE_FAULT;
+ goto done;
+ } else {
+ c->regs[VCPU_REGS_RAX] = (u32)msr_data;
+ c->regs[VCPU_REGS_RDX] = msr_data >> 32;
+ }
+ rc = X86EMUL_CONTINUE;
+ break;
+ case 0x34: /* sysenter */
+ rc = emulate_sysenter(ctxt, ops);
+ break;
+ case 0x35: /* sysexit */
+ rc = emulate_sysexit(ctxt, ops);
+ break;
+ case 0x40 ... 0x4f: /* cmov */
+ c->dst.val = c->dst.orig_val = c->src.val;
+ if (!test_cc(c->b, ctxt->eflags))
+ c->dst.type = OP_NONE; /* no writeback */
+ break;
+ case 0x80 ... 0x8f: /* jnz rel, etc*/
+ if (test_cc(c->b, ctxt->eflags))
+ jmp_rel(c, c->src.val);
+ break;
+ case 0x90 ... 0x9f: /* setcc r/m8 */
+ c->dst.val = test_cc(c->b, ctxt->eflags);
+ break;
+ case 0xa0: /* push fs */
+ emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
+ break;
+ case 0xa1: /* pop fs */
+ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
+ break;
+ case 0xa3:
+ bt: /* bt */
+ c->dst.type = OP_NONE;
+ /* only subword offset */
+ c->src.val &= (c->dst.bytes << 3) - 1;
+ emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0xa4: /* shld imm8, r, r/m */
+ case 0xa5: /* shld cl, r, r/m */
+ emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
+ break;
+ case 0xa8: /* push gs */
+ emulate_push_sreg(ctxt, ops, VCPU_SREG_GS);
+ break;
+ case 0xa9: /* pop gs */
+ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
+ break;
+ case 0xab:
+ bts: /* bts */
+ emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0xac: /* shrd imm8, r, r/m */
+ case 0xad: /* shrd cl, r, r/m */
+ emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags);
+ break;
+ case 0xae: /* clflush */
+ break;
+ case 0xb0 ... 0xb1: /* cmpxchg */
+ /*
+ * Save real source value, then compare EAX against
+ * destination.
+ */
+ c->src.orig_val = c->src.val;
+ c->src.val = c->regs[VCPU_REGS_RAX];
+ emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+ if (ctxt->eflags & EFLG_ZF) {
+ /* Success: write back to memory. */
+ c->dst.val = c->src.orig_val;
+ } else {
+ /* Failure: write the value we saw to EAX. */
+ c->dst.type = OP_REG;
+ c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+ }
+ break;
+ case 0xb2: /* lss */
+ rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS);
+ break;
+ case 0xb3:
+ btr: /* btr */
+ emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0xb4: /* lfs */
+ rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS);
+ break;
+ case 0xb5: /* lgs */
+ rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS);
+ break;
+ case 0xb6 ... 0xb7: /* movzx */
+ c->dst.bytes = c->op_bytes;
+ c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
+ : (u16) c->src.val;
+ break;
+ case 0xba: /* Grp8 */
+ switch (c->modrm_reg & 3) {
+ case 0:
+ goto bt;
+ case 1:
+ goto bts;
+ case 2:
+ goto btr;
+ case 3:
+ goto btc;
+ }
+ break;
+ case 0xbb:
+ btc: /* btc */
+ emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0xbc: { /* bsf */
+ u8 zf;
+ __asm__ ("bsf %2, %0; setz %1"
+ : "=r"(c->dst.val), "=q"(zf)
+ : "r"(c->src.val));
+ ctxt->eflags &= ~X86_EFLAGS_ZF;
+ if (zf) {
+ ctxt->eflags |= X86_EFLAGS_ZF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ }
+ break;
+ }
+ case 0xbd: { /* bsr */
+ u8 zf;
+ __asm__ ("bsr %2, %0; setz %1"
+ : "=r"(c->dst.val), "=q"(zf)
+ : "r"(c->src.val));
+ ctxt->eflags &= ~X86_EFLAGS_ZF;
+ if (zf) {
+ ctxt->eflags |= X86_EFLAGS_ZF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ }
+ break;
+ }
+ case 0xbe ... 0xbf: /* movsx */
+ c->dst.bytes = c->op_bytes;
+ c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
+ (s16) c->src.val;
+ break;
+ case 0xc0 ... 0xc1: /* xadd */
+ emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
+ /* Write back the register source. */
+ c->src.val = c->dst.orig_val;
+ write_register_operand(&c->src);
+ break;
+ case 0xc3: /* movnti */
+ c->dst.bytes = c->op_bytes;
+ c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
+ (u64) c->src.val;
+ break;
+ case 0xc7: /* Grp9 (cmpxchg8b) */
+ rc = emulate_grp9(ctxt, ops);
+ break;
+ default:
+ goto cannot_emulate;
+ }
+
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ goto writeback;
+
+cannot_emulate:
+ return -1;
+}
diff --git a/linux/x86/eventfd.c b/linux/x86/eventfd.c
new file mode 100644
index 0000000..babaeed
--- /dev/null
+++ b/linux/x86/eventfd.c
@@ -0,0 +1,705 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+/*
+ * kvm eventfd support - use eventfd objects to signal various KVM events
+ *
+ * Copyright 2009 Novell. All Rights Reserved.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Author:
+ * Gregory Haskins <ghaskins@novell.com>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/workqueue.h>
+#include <linux/syscalls.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/list.h>
+#include <linux/eventfd.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include "iodev.h"
+
+/*
+ * --------------------------------------------------------------------
+ * irqfd: Allows an fd to be used to inject an interrupt to the guest
+ *
+ * Credit goes to Avi Kivity for the original idea.
+ * --------------------------------------------------------------------
+ */
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,33)
+struct _irqfd {
+ /* Used for MSI fast-path */
+ struct kvm *kvm;
+ wait_queue_t wait;
+ /* Update side is protected by irqfds.lock */
+ struct kvm_kernel_irq_routing_entry __rcu *irq_entry;
+ /* Used for level IRQ fast-path */
+ int gsi;
+ struct work_struct inject;
+ /* Used for setup/shutdown */
+ struct eventfd_ctx *eventfd;
+ struct list_head list;
+ poll_table pt;
+ struct work_struct shutdown;
+};
+
+static struct workqueue_struct *irqfd_cleanup_wq;
+
+static void
+irqfd_inject(struct work_struct *work)
+{
+ struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
+ struct kvm *kvm = irqfd->kvm;
+
+ kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
+ kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
+}
+
+/*
+ * Race-free decouple logic (ordering is critical)
+ */
+static void
+irqfd_shutdown(struct work_struct *work)
+{
+ struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);
+ u64 cnt;
+
+ /*
+ * Synchronize with the wait-queue and unhook ourselves to prevent
+ * further events.
+ */
+ eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);
+
+ /*
+ * We know no new events will be scheduled at this point, so block
+ * until all previously outstanding events have completed
+ */
+ flush_work(&irqfd->inject);
+
+ /*
+ * It is now safe to release the object's resources
+ */
+ eventfd_ctx_put(irqfd->eventfd);
+ kfree(irqfd);
+}
+
+
+/* assumes kvm->irqfds.lock is held */
+static bool
+irqfd_is_active(struct _irqfd *irqfd)
+{
+ return list_empty(&irqfd->list) ? false : true;
+}
+
+/*
+ * Mark the irqfd as inactive and schedule it for removal
+ *
+ * assumes kvm->irqfds.lock is held
+ */
+static void
+irqfd_deactivate(struct _irqfd *irqfd)
+{
+ BUG_ON(!irqfd_is_active(irqfd));
+
+ list_del_init(&irqfd->list);
+
+ queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
+}
+
+/*
+ * Called with wqh->lock held and interrupts disabled
+ */
+static int
+irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
+ unsigned long flags = (unsigned long)key;
+ struct kvm_kernel_irq_routing_entry *irq;
+ struct kvm *kvm = irqfd->kvm;
+
+ if (flags & POLLIN) {
+ rcu_read_lock();
+ irq = rcu_dereference(irqfd->irq_entry);
+ /* An event has been signaled, inject an interrupt */
+ if (irq)
+ kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1);
+ else
+ schedule_work(&irqfd->inject);
+ rcu_read_unlock();
+ }
+
+ if (flags & POLLHUP) {
+ /* The eventfd is closing, detach from KVM */
+ unsigned long flags;
+
+ spin_lock_irqsave(&kvm->irqfds.lock, flags);
+
+ /*
+ * We must check if someone deactivated the irqfd before
+ * we could acquire the irqfds.lock since the item is
+ * deactivated from the KVM side before it is unhooked from
+ * the wait-queue. If it is already deactivated, we can
+ * simply return knowing the other side will cleanup for us.
+ * We cannot race against the irqfd going away since the
+ * other side is required to acquire wqh->lock, which we hold
+ */
+ if (irqfd_is_active(irqfd))
+ irqfd_deactivate(irqfd);
+
+ spin_unlock_irqrestore(&kvm->irqfds.lock, flags);
+ }
+
+ return 0;
+}
+
+static void
+irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
+ poll_table *pt)
+{
+ struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt);
+ add_wait_queue(wqh, &irqfd->wait);
+}
+
+/* Must be called under irqfds.lock */
+static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd,
+ struct kvm_irq_routing_table *irq_rt)
+{
+ struct kvm_kernel_irq_routing_entry *e;
+ struct hlist_node *n;
+
+ if (irqfd->gsi >= irq_rt->nr_rt_entries) {
+ rcu_assign_pointer(irqfd->irq_entry, NULL);
+ return;
+ }
+
+ hlist_for_each_entry(e, n, &irq_rt->map[irqfd->gsi], link) {
+ /* Only fast-path MSI. */
+ if (e->type == KVM_IRQ_ROUTING_MSI)
+ rcu_assign_pointer(irqfd->irq_entry, e);
+ else
+ rcu_assign_pointer(irqfd->irq_entry, NULL);
+ }
+}
+
+static int
+kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
+{
+ struct kvm_irq_routing_table *irq_rt;
+ struct _irqfd *irqfd, *tmp;
+ struct file *file = NULL;
+ struct eventfd_ctx *eventfd = NULL;
+ int ret;
+ unsigned int events;
+
+ irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
+ if (!irqfd)
+ return -ENOMEM;
+
+ irqfd->kvm = kvm;
+ irqfd->gsi = gsi;
+ INIT_LIST_HEAD(&irqfd->list);
+ INIT_WORK(&irqfd->inject, irqfd_inject);
+ INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
+
+ file = eventfd_fget(fd);
+ if (IS_ERR(file)) {
+ ret = PTR_ERR(file);
+ goto fail;
+ }
+
+ eventfd = eventfd_ctx_fileget(file);
+ if (IS_ERR(eventfd)) {
+ ret = PTR_ERR(eventfd);
+ goto fail;
+ }
+
+ irqfd->eventfd = eventfd;
+
+ /*
+ * Install our own custom wake-up handling so we are notified via
+ * a callback whenever someone signals the underlying eventfd
+ */
+ init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
+ init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
+
+ spin_lock_irq(&kvm->irqfds.lock);
+
+ ret = 0;
+ list_for_each_entry(tmp, &kvm->irqfds.items, list) {
+ if (irqfd->eventfd != tmp->eventfd)
+ continue;
+ /* This fd is used for another irq already. */
+ ret = -EBUSY;
+ spin_unlock_irq(&kvm->irqfds.lock);
+ goto fail;
+ }
+
+ irq_rt = rcu_dereference_protected(kvm->irq_routing,
+ lockdep_is_held(&kvm->irqfds.lock));
+ irqfd_update(kvm, irqfd, irq_rt);
+
+ events = file->f_op->poll(file, &irqfd->pt);
+
+ list_add_tail(&irqfd->list, &kvm->irqfds.items);
+
+ /*
+ * Check if there was an event already pending on the eventfd
+ * before we registered, and trigger it as if we didn't miss it.
+ */
+ if (events & POLLIN)
+ schedule_work(&irqfd->inject);
+
+ spin_unlock_irq(&kvm->irqfds.lock);
+
+ /*
+ * do not drop the file until the irqfd is fully initialized, otherwise
+ * we might race against the POLLHUP
+ */
+ fput(file);
+
+ return 0;
+
+fail:
+ if (eventfd && !IS_ERR(eventfd))
+ eventfd_ctx_put(eventfd);
+
+ if (!IS_ERR(file))
+ fput(file);
+
+ kfree(irqfd);
+ return ret;
+}
+
+void
+kvm_eventfd_init(struct kvm *kvm)
+{
+ spin_lock_init(&kvm->irqfds.lock);
+ INIT_LIST_HEAD(&kvm->irqfds.items);
+ INIT_LIST_HEAD(&kvm->ioeventfds);
+}
+
+/*
+ * shutdown any irqfd's that match fd+gsi
+ */
+static int
+kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
+{
+ struct _irqfd *irqfd, *tmp;
+ struct eventfd_ctx *eventfd;
+
+ eventfd = eventfd_ctx_fdget(fd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+
+ spin_lock_irq(&kvm->irqfds.lock);
+
+ list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
+ if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) {
+ /*
+ * This rcu_assign_pointer is needed for when
+ * another thread calls kvm_irqfd_update before
+ * we flush workqueue below.
+ * It is paired with synchronize_rcu done by caller
+ * of that function.
+ */
+ rcu_assign_pointer(irqfd->irq_entry, NULL);
+ irqfd_deactivate(irqfd);
+ }
+ }
+
+ spin_unlock_irq(&kvm->irqfds.lock);
+ eventfd_ctx_put(eventfd);
+
+ /*
+ * Block until we know all outstanding shutdown jobs have completed
+ * so that we guarantee there will not be any more interrupts on this
+ * gsi once this deassign function returns.
+ */
+ flush_workqueue(irqfd_cleanup_wq);
+
+ return 0;
+}
+
+int
+kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
+{
+ if (flags & KVM_IRQFD_FLAG_DEASSIGN)
+ return kvm_irqfd_deassign(kvm, fd, gsi);
+
+ return kvm_irqfd_assign(kvm, fd, gsi);
+}
+
+/*
+ * This function is called as the kvm VM fd is being released. Shutdown all
+ * irqfds that still remain open
+ */
+void
+kvm_irqfd_release(struct kvm *kvm)
+{
+ struct _irqfd *irqfd, *tmp;
+
+ spin_lock_irq(&kvm->irqfds.lock);
+
+ list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list)
+ irqfd_deactivate(irqfd);
+
+ spin_unlock_irq(&kvm->irqfds.lock);
+
+ /*
+ * Block until we know all outstanding shutdown jobs have completed
+ * since we do not take a kvm* reference.
+ */
+ flush_workqueue(irqfd_cleanup_wq);
+
+}
+
+/*
+ * Change irq_routing and irqfd.
+ * Caller must invoke synchronize_rcu afterwards.
+ */
+void kvm_irq_routing_update(struct kvm *kvm,
+ struct kvm_irq_routing_table *irq_rt)
+{
+ struct _irqfd *irqfd;
+
+ spin_lock_irq(&kvm->irqfds.lock);
+
+ rcu_assign_pointer(kvm->irq_routing, irq_rt);
+
+ list_for_each_entry(irqfd, &kvm->irqfds.items, list)
+ irqfd_update(kvm, irqfd, irq_rt);
+
+ spin_unlock_irq(&kvm->irqfds.lock);
+}
+
+/*
+ * create a host-wide workqueue for issuing deferred shutdown requests
+ * aggregated from all vm* instances. We need our own isolated single-thread
+ * queue to prevent deadlock against flushing the normal work-queue.
+ */
+static int __init irqfd_module_init(void)
+{
+ irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup");
+ if (!irqfd_cleanup_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __exit irqfd_module_exit(void)
+{
+ destroy_workqueue(irqfd_cleanup_wq);
+}
+
+module_init(irqfd_module_init);
+module_exit(irqfd_module_exit);
+
+/*
+ * --------------------------------------------------------------------
+ * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
+ *
+ * userspace can register a PIO/MMIO address with an eventfd for receiving
+ * notification when the memory has been touched.
+ * --------------------------------------------------------------------
+ */
+
+struct _ioeventfd {
+ struct list_head list;
+ u64 addr;
+ int length;
+ struct eventfd_ctx *eventfd;
+ u64 datamatch;
+ struct kvm_io_device dev;
+ bool wildcard;
+};
+
+static inline struct _ioeventfd *
+to_ioeventfd(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct _ioeventfd, dev);
+}
+
+static void
+ioeventfd_release(struct _ioeventfd *p)
+{
+ eventfd_ctx_put(p->eventfd);
+ list_del(&p->list);
+ kfree(p);
+}
+
+static bool
+ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
+{
+ u64 _val;
+
+ if (!(addr == p->addr && len == p->length))
+ /* address-range must be precise for a hit */
+ return false;
+
+ if (p->wildcard)
+ /* all else equal, wildcard is always a hit */
+ return true;
+
+ /* otherwise, we have to actually compare the data */
+
+ BUG_ON(!IS_ALIGNED((unsigned long)val, len));
+
+ switch (len) {
+ case 1:
+ _val = *(u8 *)val;
+ break;
+ case 2:
+ _val = *(u16 *)val;
+ break;
+ case 4:
+ _val = *(u32 *)val;
+ break;
+ case 8:
+ _val = *(u64 *)val;
+ break;
+ default:
+ return false;
+ }
+
+ return _val == p->datamatch ? true : false;
+}
+
+/* MMIO/PIO writes trigger an event if the addr/val match */
+static int
+ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
+ const void *val)
+{
+ struct _ioeventfd *p = to_ioeventfd(this);
+
+ if (!ioeventfd_in_range(p, addr, len, val))
+ return -EOPNOTSUPP;
+
+ eventfd_signal(p->eventfd, 1);
+ return 0;
+}
+
+/*
+ * This function is called as KVM is completely shutting down. We do not
+ * need to worry about locking just nuke anything we have as quickly as possible
+ */
+static void
+ioeventfd_destructor(struct kvm_io_device *this)
+{
+ struct _ioeventfd *p = to_ioeventfd(this);
+
+ ioeventfd_release(p);
+}
+
+static const struct kvm_io_device_ops ioeventfd_ops = {
+ .write = ioeventfd_write,
+ .destructor = ioeventfd_destructor,
+};
+
+/* assumes kvm->slots_lock held */
+static bool
+ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
+{
+ struct _ioeventfd *_p;
+
+ list_for_each_entry(_p, &kvm->ioeventfds, list)
+ if (_p->addr == p->addr && _p->length == p->length &&
+ (_p->wildcard || p->wildcard ||
+ _p->datamatch == p->datamatch))
+ return true;
+
+ return false;
+}
+
+static int
+kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
+{
+ int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
+ enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
+ struct _ioeventfd *p;
+ struct eventfd_ctx *eventfd;
+ int ret;
+
+ /* must be natural-word sized */
+ switch (args->len) {
+ case 1:
+ case 2:
+ case 4:
+ case 8:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* check for range overflow */
+ if (args->addr + args->len < args->addr)
+ return -EINVAL;
+
+ /* check for extra flags that we don't understand */
+ if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
+ return -EINVAL;
+
+ eventfd = eventfd_ctx_fdget(args->fd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (!p) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ INIT_LIST_HEAD(&p->list);
+ p->addr = args->addr;
+ p->length = args->len;
+ p->eventfd = eventfd;
+
+ /* The datamatch feature is optional, otherwise this is a wildcard */
+ if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
+ p->datamatch = args->datamatch;
+ else
+ p->wildcard = true;
+
+ mutex_lock(&kvm->slots_lock);
+
+ /* Verify that there isnt a match already */
+ if (ioeventfd_check_collision(kvm, p)) {
+ ret = -EEXIST;
+ goto unlock_fail;
+ }
+
+ kvm_iodevice_init(&p->dev, &ioeventfd_ops);
+
+ ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev);
+ if (ret < 0)
+ goto unlock_fail;
+
+ list_add_tail(&p->list, &kvm->ioeventfds);
+
+ mutex_unlock(&kvm->slots_lock);
+
+ return 0;
+
+unlock_fail:
+ mutex_unlock(&kvm->slots_lock);
+
+fail:
+ kfree(p);
+ eventfd_ctx_put(eventfd);
+
+ return ret;
+}
+
+static int
+kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
+{
+ int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
+ enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
+ struct _ioeventfd *p, *tmp;
+ struct eventfd_ctx *eventfd;
+ int ret = -ENOENT;
+
+ eventfd = eventfd_ctx_fdget(args->fd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+
+ mutex_lock(&kvm->slots_lock);
+
+ list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
+ bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
+
+ if (p->eventfd != eventfd ||
+ p->addr != args->addr ||
+ p->length != args->len ||
+ p->wildcard != wildcard)
+ continue;
+
+ if (!p->wildcard && p->datamatch != args->datamatch)
+ continue;
+
+ kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
+ ioeventfd_release(p);
+ ret = 0;
+ break;
+ }
+
+ mutex_unlock(&kvm->slots_lock);
+
+ eventfd_ctx_put(eventfd);
+
+ return ret;
+}
+
+int
+kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
+{
+ if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
+ return kvm_deassign_ioeventfd(kvm, args);
+
+ return kvm_assign_ioeventfd(kvm, args);
+}
+#else
+void kvm_eventfd_init(struct kvm *kvm) { }
+void kvm_irqfd_release(struct kvm *kvm) { }
+void kvm_irq_routing_update(struct kvm *kvm,
+ struct kvm_irq_routing_table *irq_rt)
+{
+ rcu_assign_pointer(kvm->irq_routing, irq_rt);
+}
+#endif
diff --git a/linux/x86/i8254.c b/linux/x86/i8254.c
new file mode 100644
index 0000000..64aa827
--- /dev/null
+++ b/linux/x86/i8254.c
@@ -0,0 +1,803 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+/*
+ * 8253/8254 interval timer emulation
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ * Copyright (c) 2006 Intel Corporation
+ * Copyright (c) 2007 Keir Fraser, XenSource Inc
+ * Copyright (c) 2008 Intel Corporation
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * Authors:
+ * Sheng Yang <sheng.yang@intel.com>
+ * Based on QEMU and Xen.
+ */
+
+
+
+#include <linux/kvm_host.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include "irq.h"
+#include "i8254.h"
+
+#ifndef CONFIG_X86_64
+#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
+#else
+#define mod_64(x, y) ((x) % (y))
+#endif
+
+#define RW_STATE_LSB 1
+#define RW_STATE_MSB 2
+#define RW_STATE_WORD0 3
+#define RW_STATE_WORD1 4
+
+/* Compute with 96 bit intermediate result: (a*b)/c */
+static u64 muldiv64(u64 a, u32 b, u32 c)
+{
+ union {
+ u64 ll;
+ struct {
+ u32 low, high;
+ } l;
+ } u, res;
+ u64 rl, rh;
+
+ u.ll = a;
+ rl = (u64)u.l.low * (u64)b;
+ rh = (u64)u.l.high * (u64)b;
+ rh += (rl >> 32);
+ res.l.high = div64_u64(rh, c);
+ res.l.low = div64_u64(((mod_64(rh, c) << 32) + (rl & 0xffffffff)), c);
+ return res.ll;
+}
+
+static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
+{
+ struct kvm_kpit_channel_state *c =
+ &kvm->arch.vpit->pit_state.channels[channel];
+
+ WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+
+ switch (c->mode) {
+ default:
+ case 0:
+ case 4:
+ /* XXX: just disable/enable counting */
+ break;
+ case 1:
+ case 2:
+ case 3:
+ case 5:
+ /* Restart counting on rising edge. */
+ if (c->gate < val)
+ c->count_load_time = ktime_get();
+ break;
+ }
+
+ c->gate = val;
+}
+
+static int pit_get_gate(struct kvm *kvm, int channel)
+{
+ WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+
+ return kvm->arch.vpit->pit_state.channels[channel].gate;
+}
+
+static s64 __kpit_elapsed(struct kvm *kvm)
+{
+ s64 elapsed;
+ ktime_t remaining;
+ struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+
+ if (!ps->pit_timer.period)
+ return 0;
+
+ /*
+ * The Counter does not stop when it reaches zero. In
+ * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to
+ * the highest count, either FFFF hex for binary counting
+ * or 9999 for BCD counting, and continues counting.
+ * Modes 2 and 3 are periodic; the Counter reloads
+ * itself with the initial count and continues counting
+ * from there.
+ */
+ remaining = hrtimer_get_remaining(&ps->pit_timer.timer);
+ elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
+ elapsed = mod_64(elapsed, ps->pit_timer.period);
+
+ return elapsed;
+}
+
+static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c,
+ int channel)
+{
+ if (channel == 0)
+ return __kpit_elapsed(kvm);
+
+ return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
+}
+
+static int pit_get_count(struct kvm *kvm, int channel)
+{
+ struct kvm_kpit_channel_state *c =
+ &kvm->arch.vpit->pit_state.channels[channel];
+ s64 d, t;
+ int counter;
+
+ WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+
+ t = kpit_elapsed(kvm, c, channel);
+ d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+
+ switch (c->mode) {
+ case 0:
+ case 1:
+ case 4:
+ case 5:
+ counter = (c->count - d) & 0xffff;
+ break;
+ case 3:
+ /* XXX: may be incorrect for odd counts */
+ counter = c->count - (mod_64((2 * d), c->count));
+ break;
+ default:
+ counter = c->count - mod_64(d, c->count);
+ break;
+ }
+ return counter;
+}
+
+static int pit_get_out(struct kvm *kvm, int channel)
+{
+ struct kvm_kpit_channel_state *c =
+ &kvm->arch.vpit->pit_state.channels[channel];
+ s64 d, t;
+ int out;
+
+ WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+
+ t = kpit_elapsed(kvm, c, channel);
+ d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+
+ switch (c->mode) {
+ default:
+ case 0:
+ out = (d >= c->count);
+ break;
+ case 1:
+ out = (d < c->count);
+ break;
+ case 2:
+ out = ((mod_64(d, c->count) == 0) && (d != 0));
+ break;
+ case 3:
+ out = (mod_64(d, c->count) < ((c->count + 1) >> 1));
+ break;
+ case 4:
+ case 5:
+ out = (d == c->count);
+ break;
+ }
+
+ return out;
+}
+
+static void pit_latch_count(struct kvm *kvm, int channel)
+{
+ struct kvm_kpit_channel_state *c =
+ &kvm->arch.vpit->pit_state.channels[channel];
+
+ WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+
+ if (!c->count_latched) {
+ c->latched_count = pit_get_count(kvm, channel);
+ c->count_latched = c->rw_mode;
+ }
+}
+
+static void pit_latch_status(struct kvm *kvm, int channel)
+{
+ struct kvm_kpit_channel_state *c =
+ &kvm->arch.vpit->pit_state.channels[channel];
+
+ WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+
+ if (!c->status_latched) {
+ /* TODO: Return NULL COUNT (bit 6). */
+ c->status = ((pit_get_out(kvm, channel) << 7) |
+ (c->rw_mode << 4) |
+ (c->mode << 1) |
+ c->bcd);
+ c->status_latched = 1;
+ }
+}
+
+static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+ struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
+ irq_ack_notifier);
+ int value;
+
+ spin_lock(&ps->inject_lock);
+ value = atomic_dec_return(&ps->pit_timer.pending);
+ if (value < 0)
+ /* spurious acks can be generated if, for example, the
+ * PIC is being reset. Handle it gracefully here
+ */
+ atomic_inc(&ps->pit_timer.pending);
+ else if (value > 0)
+ /* in this case, we had multiple outstanding pit interrupts
+ * that we needed to inject. Reinject
+ */
+ queue_work(ps->pit->wq, &ps->pit->expired);
+ ps->irq_ack = 1;
+ spin_unlock(&ps->inject_lock);
+}
+
+void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pit *pit = vcpu->kvm->arch.vpit;
+ struct hrtimer *timer;
+
+ if (!kvm_vcpu_is_bsp(vcpu) || !pit)
+ return;
+
+ timer = &pit->pit_state.pit_timer.timer;
+ if (hrtimer_cancel(timer))
+ kvm_hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+}
+
+static void destroy_pit_timer(struct kvm_pit *pit)
+{
+ hrtimer_cancel(&pit->pit_state.pit_timer.timer);
+ cancel_work_sync(&pit->expired);
+}
+
+static bool kpit_is_periodic(struct kvm_timer *ktimer)
+{
+ struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state,
+ pit_timer);
+ return ps->is_periodic;
+}
+
+static struct kvm_timer_ops kpit_ops = {
+ .is_periodic = kpit_is_periodic,
+};
+
+static void pit_do_work(struct work_struct *work)
+{
+ struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
+ struct kvm *kvm = pit->kvm;
+ struct kvm_vcpu *vcpu;
+ int i;
+ struct kvm_kpit_state *ps = &pit->pit_state;
+ int inject = 0;
+
+ /* Try to inject pending interrupts when
+ * last one has been acked.
+ */
+ spin_lock(&ps->inject_lock);
+ if (ps->irq_ack) {
+ ps->irq_ack = 0;
+ inject = 1;
+ }
+ spin_unlock(&ps->inject_lock);
+ if (inject) {
+ kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
+ kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
+
+ /*
+ * Provides NMI watchdog support via Virtual Wire mode.
+ * The route is: PIT -> PIC -> LVT0 in NMI mode.
+ *
+ * Note: Our Virtual Wire implementation is simplified, only
+ * propagating PIT interrupts to all VCPUs when they have set
+ * LVT0 to NMI delivery. Other PIC interrupts are just sent to
+ * VCPU0, and only if its LVT0 is in EXTINT mode.
+ */
+ if (kvm->arch.vapics_in_nmi_mode > 0)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_apic_nmi_wd_deliver(vcpu);
+ }
+}
+
+static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
+{
+ struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+ struct kvm_pit *pt = ktimer->kvm->arch.vpit;
+
+ if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
+ atomic_inc(&ktimer->pending);
+ queue_work(pt->wq, &pt->expired);
+ }
+
+ if (ktimer->t_ops->is_periodic(ktimer)) {
+ kvm_hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+ return HRTIMER_RESTART;
+ } else
+ return HRTIMER_NORESTART;
+}
+
+static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
+{
+ struct kvm_timer *pt = &ps->pit_timer;
+ s64 interval;
+
+ interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
+
+ pr_debug("pit: " "create pit timer, interval is %llu nsec\n", interval);
+
+ /* TODO The new value only affected after the retriggered */
+ hrtimer_cancel(&pt->timer);
+ cancel_work_sync(&ps->pit->expired);
+ pt->period = interval;
+ ps->is_periodic = is_period;
+
+ pt->timer.function = pit_timer_fn;
+ pt->t_ops = &kpit_ops;
+ pt->kvm = ps->pit->kvm;
+
+ atomic_set(&pt->pending, 0);
+ ps->irq_ack = 1;
+
+ hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
+ HRTIMER_MODE_ABS);
+}
+
+static void pit_load_count(struct kvm *kvm, int channel, u32 val)
+{
+ struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+
+ WARN_ON(!mutex_is_locked(&ps->lock));
+
+ pr_debug("pit: " "load_count val is %d, channel is %d\n", val, channel);
+
+ /*
+ * The largest possible initial count is 0; this is equivalent
+ * to 216 for binary counting and 104 for BCD counting.
+ */
+ if (val == 0)
+ val = 0x10000;
+
+ ps->channels[channel].count = val;
+
+ if (channel != 0) {
+ ps->channels[channel].count_load_time = ktime_get();
+ return;
+ }
+
+ /* Two types of timer
+ * mode 1 is one shot, mode 2 is period, otherwise del timer */
+ switch (ps->channels[0].mode) {
+ case 0:
+ case 1:
+ /* FIXME: enhance mode 4 precision */
+ case 4:
+ if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) {
+ create_pit_timer(ps, val, 0);
+ }
+ break;
+ case 2:
+ case 3:
+ if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){
+ create_pit_timer(ps, val, 1);
+ }
+ break;
+ default:
+ destroy_pit_timer(kvm->arch.vpit);
+ }
+}
+
+void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start)
+{
+ u8 saved_mode;
+ if (hpet_legacy_start) {
+ /* save existing mode for later reenablement */
+ saved_mode = kvm->arch.vpit->pit_state.channels[0].mode;
+ kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */
+ pit_load_count(kvm, channel, val);
+ kvm->arch.vpit->pit_state.channels[0].mode = saved_mode;
+ } else {
+ pit_load_count(kvm, channel, val);
+ }
+}
+
+static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_pit, dev);
+}
+
+static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_pit, speaker_dev);
+}
+
+static inline int pit_in_range(gpa_t addr)
+{
+ return ((addr >= KVM_PIT_BASE_ADDRESS) &&
+ (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
+}
+
+static int pit_ioport_write(struct kvm_io_device *this,
+ gpa_t addr, int len, const void *data)
+{
+ struct kvm_pit *pit = dev_to_pit(this);
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+ int channel, access;
+ struct kvm_kpit_channel_state *s;
+ u32 val = *(u32 *) data;
+ if (!pit_in_range(addr))
+ return -EOPNOTSUPP;
+
+ val &= 0xff;
+ addr &= KVM_PIT_CHANNEL_MASK;
+
+ mutex_lock(&pit_state->lock);
+
+ if (val != 0)
+ pr_debug("pit: " "write addr is 0x%x, len is %d, val is 0x%x\n",
+ (unsigned int)addr, len, val);
+
+ if (addr == 3) {
+ channel = val >> 6;
+ if (channel == 3) {
+ /* Read-Back Command. */
+ for (channel = 0; channel < 3; channel++) {
+ s = &pit_state->channels[channel];
+ if (val & (2 << channel)) {
+ if (!(val & 0x20))
+ pit_latch_count(kvm, channel);
+ if (!(val & 0x10))
+ pit_latch_status(kvm, channel);
+ }
+ }
+ } else {
+ /* Select Counter <channel>. */
+ s = &pit_state->channels[channel];
+ access = (val >> 4) & KVM_PIT_CHANNEL_MASK;
+ if (access == 0) {
+ pit_latch_count(kvm, channel);
+ } else {
+ s->rw_mode = access;
+ s->read_state = access;
+ s->write_state = access;
+ s->mode = (val >> 1) & 7;
+ if (s->mode > 5)
+ s->mode -= 4;
+ s->bcd = val & 1;
+ }
+ }
+ } else {
+ /* Write Count. */
+ s = &pit_state->channels[addr];
+ switch (s->write_state) {
+ default:
+ case RW_STATE_LSB:
+ pit_load_count(kvm, addr, val);
+ break;
+ case RW_STATE_MSB:
+ pit_load_count(kvm, addr, val << 8);
+ break;
+ case RW_STATE_WORD0:
+ s->write_latch = val;
+ s->write_state = RW_STATE_WORD1;
+ break;
+ case RW_STATE_WORD1:
+ pit_load_count(kvm, addr, s->write_latch | (val << 8));
+ s->write_state = RW_STATE_WORD0;
+ break;
+ }
+ }
+
+ mutex_unlock(&pit_state->lock);
+ return 0;
+}
+
+static int pit_ioport_read(struct kvm_io_device *this,
+ gpa_t addr, int len, void *data)
+{
+ struct kvm_pit *pit = dev_to_pit(this);
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+ int ret, count;
+ struct kvm_kpit_channel_state *s;
+ if (!pit_in_range(addr))
+ return -EOPNOTSUPP;
+
+ addr &= KVM_PIT_CHANNEL_MASK;
+ if (addr == 3)
+ return 0;
+
+ s = &pit_state->channels[addr];
+
+ mutex_lock(&pit_state->lock);
+
+ if (s->status_latched) {
+ s->status_latched = 0;
+ ret = s->status;
+ } else if (s->count_latched) {
+ switch (s->count_latched) {
+ default:
+ case RW_STATE_LSB:
+ ret = s->latched_count & 0xff;
+ s->count_latched = 0;
+ break;
+ case RW_STATE_MSB:
+ ret = s->latched_count >> 8;
+ s->count_latched = 0;
+ break;
+ case RW_STATE_WORD0:
+ ret = s->latched_count & 0xff;
+ s->count_latched = RW_STATE_MSB;
+ break;
+ }
+ } else {
+ switch (s->read_state) {
+ default:
+ case RW_STATE_LSB:
+ count = pit_get_count(kvm, addr);
+ ret = count & 0xff;
+ break;
+ case RW_STATE_MSB:
+ count = pit_get_count(kvm, addr);
+ ret = (count >> 8) & 0xff;
+ break;
+ case RW_STATE_WORD0:
+ count = pit_get_count(kvm, addr);
+ ret = count & 0xff;
+ s->read_state = RW_STATE_WORD1;
+ break;
+ case RW_STATE_WORD1:
+ count = pit_get_count(kvm, addr);
+ ret = (count >> 8) & 0xff;
+ s->read_state = RW_STATE_WORD0;
+ break;
+ }
+ }
+
+ if (len > sizeof(ret))
+ len = sizeof(ret);
+ memcpy(data, (char *)&ret, len);
+
+ mutex_unlock(&pit_state->lock);
+ return 0;
+}
+
+static int speaker_ioport_write(struct kvm_io_device *this,
+ gpa_t addr, int len, const void *data)
+{
+ struct kvm_pit *pit = speaker_to_pit(this);
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+ u32 val = *(u32 *) data;
+ if (addr != KVM_SPEAKER_BASE_ADDRESS)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&pit_state->lock);
+ pit_state->speaker_data_on = (val >> 1) & 1;
+ pit_set_gate(kvm, 2, val & 1);
+ mutex_unlock(&pit_state->lock);
+ return 0;
+}
+
+static int speaker_ioport_read(struct kvm_io_device *this,
+ gpa_t addr, int len, void *data)
+{
+ struct kvm_pit *pit = speaker_to_pit(this);
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+ unsigned int refresh_clock;
+ int ret;
+ if (addr != KVM_SPEAKER_BASE_ADDRESS)
+ return -EOPNOTSUPP;
+
+ /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */
+ refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1;
+
+ mutex_lock(&pit_state->lock);
+ ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(kvm, 2) |
+ (pit_get_out(kvm, 2) << 5) | (refresh_clock << 4));
+ if (len > sizeof(ret))
+ len = sizeof(ret);
+ memcpy(data, (char *)&ret, len);
+ mutex_unlock(&pit_state->lock);
+ return 0;
+}
+
+void kvm_pit_reset(struct kvm_pit *pit)
+{
+ int i;
+ struct kvm_kpit_channel_state *c;
+
+ mutex_lock(&pit->pit_state.lock);
+ pit->pit_state.flags = 0;
+ for (i = 0; i < 3; i++) {
+ c = &pit->pit_state.channels[i];
+ c->mode = 0xff;
+ c->gate = (i != 2);
+ pit_load_count(pit->kvm, i, 0);
+ }
+ mutex_unlock(&pit->pit_state.lock);
+
+ atomic_set(&pit->pit_state.pit_timer.pending, 0);
+ pit->pit_state.irq_ack = 1;
+}
+
+static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
+{
+ struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
+
+ if (!mask) {
+ atomic_set(&pit->pit_state.pit_timer.pending, 0);
+ pit->pit_state.irq_ack = 1;
+ }
+}
+
+static const struct kvm_io_device_ops pit_dev_ops = {
+ .read = pit_ioport_read,
+ .write = pit_ioport_write,
+};
+
+static const struct kvm_io_device_ops speaker_dev_ops = {
+ .read = speaker_ioport_read,
+ .write = speaker_ioport_write,
+};
+
+/* Caller must hold slots_lock */
+struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
+{
+ struct kvm_pit *pit;
+ struct kvm_kpit_state *pit_state;
+ int ret;
+
+ pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
+ if (!pit)
+ return NULL;
+
+ pit->irq_source_id = kvm_request_irq_source_id(kvm);
+ if (pit->irq_source_id < 0) {
+ kfree(pit);
+ return NULL;
+ }
+
+ mutex_init(&pit->pit_state.lock);
+ mutex_lock(&pit->pit_state.lock);
+ spin_lock_init(&pit->pit_state.inject_lock);
+
+ pit->wq = create_singlethread_workqueue("kvm-pit-wq");
+ if (!pit->wq) {
+ mutex_unlock(&pit->pit_state.lock);
+ kvm_free_irq_source_id(kvm, pit->irq_source_id);
+ kfree(pit);
+ return NULL;
+ }
+ INIT_WORK(&pit->expired, pit_do_work);
+
+ kvm->arch.vpit = pit;
+ pit->kvm = kvm;
+
+ pit_state = &pit->pit_state;
+ pit_state->pit = pit;
+ hrtimer_init(&pit_state->pit_timer.timer,
+ CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ pit_state->irq_ack_notifier.gsi = 0;
+ pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
+ kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
+ pit_state->pit_timer.reinject = true;
+ mutex_unlock(&pit->pit_state.lock);
+
+ kvm_pit_reset(pit);
+
+ pit->mask_notifier.func = pit_mask_notifer;
+ kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+
+ kvm_iodevice_init(&pit->dev, &pit_dev_ops);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev);
+ if (ret < 0)
+ goto fail;
+
+ if (flags & KVM_PIT_SPEAKER_DUMMY) {
+ kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS,
+ &pit->speaker_dev);
+ if (ret < 0)
+ goto fail_unregister;
+ }
+
+ return pit;
+
+fail_unregister:
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
+
+fail:
+ kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+ kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
+ kvm_free_irq_source_id(kvm, pit->irq_source_id);
+ destroy_workqueue(pit->wq);
+ kfree(pit);
+ return NULL;
+}
+
+void kvm_free_pit(struct kvm *kvm)
+{
+ struct hrtimer *timer;
+
+ if (kvm->arch.vpit) {
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
+ &kvm->arch.vpit->speaker_dev);
+ kvm_unregister_irq_mask_notifier(kvm, 0,
+ &kvm->arch.vpit->mask_notifier);
+ kvm_unregister_irq_ack_notifier(kvm,
+ &kvm->arch.vpit->pit_state.irq_ack_notifier);
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
+ timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
+ hrtimer_cancel(timer);
+ cancel_work_sync(&kvm->arch.vpit->expired);
+ kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ destroy_workqueue(kvm->arch.vpit->wq);
+ kfree(kvm->arch.vpit);
+ }
+}
diff --git a/linux/x86/i8254.h b/linux/x86/i8254.h
new file mode 100644
index 0000000..46d08ca
--- /dev/null
+++ b/linux/x86/i8254.h
@@ -0,0 +1,60 @@
+#ifndef __I8254_H
+#define __I8254_H
+
+#include "iodev.h"
+
+struct kvm_kpit_channel_state {
+ u32 count; /* can be 65536 */
+ u16 latched_count;
+ u8 count_latched;
+ u8 status_latched;
+ u8 status;
+ u8 read_state;
+ u8 write_state;
+ u8 write_latch;
+ u8 rw_mode;
+ u8 mode;
+ u8 bcd; /* not supported */
+ u8 gate; /* timer start */
+ ktime_t count_load_time;
+};
+
+struct kvm_kpit_state {
+ struct kvm_kpit_channel_state channels[3];
+ u32 flags;
+ struct kvm_timer pit_timer;
+ bool is_periodic;
+ u32 speaker_data_on;
+ struct mutex lock;
+ struct kvm_pit *pit;
+ spinlock_t inject_lock;
+ unsigned long irq_ack;
+ struct kvm_irq_ack_notifier irq_ack_notifier;
+};
+
+struct kvm_pit {
+ unsigned long base_addresss;
+ struct kvm_io_device dev;
+ struct kvm_io_device speaker_dev;
+ struct kvm *kvm;
+ struct kvm_kpit_state pit_state;
+ int irq_source_id;
+ struct kvm_irq_mask_notifier mask_notifier;
+ struct workqueue_struct *wq;
+ struct work_struct expired;
+};
+
+#define KVM_PIT_BASE_ADDRESS 0x40
+#define KVM_SPEAKER_BASE_ADDRESS 0x61
+#define KVM_PIT_MEM_LENGTH 4
+#define KVM_PIT_FREQ 1193181
+#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
+#define KVM_PIT_CHANNEL_MASK 0x3
+
+void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start);
+struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
+void kvm_free_pit(struct kvm *kvm);
+void kvm_pit_reset(struct kvm_pit *pit);
+
+#endif
diff --git a/linux/x86/i8259.c b/linux/x86/i8259.c
new file mode 100644
index 0000000..d316fc6
--- /dev/null
+++ b/linux/x86/i8259.c
@@ -0,0 +1,645 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+/*
+ * 8259 interrupt controller emulation
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ * Copyright (c) 2007 Intel Corporation
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ * Authors:
+ * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ * Port from Qemu.
+ */
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include "irq.h"
+
+#include <linux/kvm_host.h>
+#include "trace.h"
+
+static void pic_irq_request(struct kvm *kvm, int level);
+
+static void pic_lock(struct kvm_pic *s)
+ __acquires(&s->lock)
+{
+ spin_lock(&s->lock);
+}
+
+static void pic_unlock(struct kvm_pic *s)
+ __releases(&s->lock)
+{
+ bool wakeup = s->wakeup_needed;
+ struct kvm_vcpu *vcpu, *found = NULL;
+ int i;
+
+ s->wakeup_needed = false;
+
+ spin_unlock(&s->lock);
+
+ if (wakeup) {
+ kvm_for_each_vcpu(i, vcpu, s->kvm) {
+ if (kvm_apic_accept_pic_intr(vcpu)) {
+ found = vcpu;
+ break;
+ }
+ }
+
+ if (!found)
+ found = s->kvm->bsp_vcpu;
+
+ if (!found)
+ return;
+
+ kvm_make_request(KVM_REQ_EVENT, found);
+ kvm_vcpu_kick(found);
+ }
+}
+
+static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
+{
+ s->isr &= ~(1 << irq);
+ s->isr_ack |= (1 << irq);
+ if (s != &s->pics_state->pics[0])
+ irq += 8;
+ /*
+ * We are dropping lock while calling ack notifiers since ack
+ * notifier callbacks for assigned devices call into PIC recursively.
+ * Other interrupt may be delivered to PIC while lock is dropped but
+ * it should be safe since PIC state is already updated at this stage.
+ */
+ pic_unlock(s->pics_state);
+ kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
+ pic_lock(s->pics_state);
+}
+
+void kvm_pic_clear_isr_ack(struct kvm *kvm)
+{
+ struct kvm_pic *s = pic_irqchip(kvm);
+
+ pic_lock(s);
+ s->pics[0].isr_ack = 0xff;
+ s->pics[1].isr_ack = 0xff;
+ pic_unlock(s);
+}
+
+/*
+ * set irq level. If an edge is detected, then the IRR is set to 1
+ */
+static inline int pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
+{
+ int mask, ret = 1;
+ mask = 1 << irq;
+ if (s->elcr & mask) /* level triggered */
+ if (level) {
+ ret = !(s->irr & mask);
+ s->irr |= mask;
+ s->last_irr |= mask;
+ } else {
+ s->irr &= ~mask;
+ s->last_irr &= ~mask;
+ }
+ else /* edge triggered */
+ if (level) {
+ if ((s->last_irr & mask) == 0) {
+ ret = !(s->irr & mask);
+ s->irr |= mask;
+ }
+ s->last_irr |= mask;
+ } else
+ s->last_irr &= ~mask;
+
+ return (s->imr & mask) ? -1 : ret;
+}
+
+/*
+ * return the highest priority found in mask (highest = smallest
+ * number). Return 8 if no irq
+ */
+static inline int get_priority(struct kvm_kpic_state *s, int mask)
+{
+ int priority;
+ if (mask == 0)
+ return 8;
+ priority = 0;
+ while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0)
+ priority++;
+ return priority;
+}
+
+/*
+ * return the pic wanted interrupt. return -1 if none
+ */
+static int pic_get_irq(struct kvm_kpic_state *s)
+{
+ int mask, cur_priority, priority;
+
+ mask = s->irr & ~s->imr;
+ priority = get_priority(s, mask);
+ if (priority == 8)
+ return -1;
+ /*
+ * compute current priority. If special fully nested mode on the
+ * master, the IRQ coming from the slave is not taken into account
+ * for the priority computation.
+ */
+ mask = s->isr;
+ if (s->special_fully_nested_mode && s == &s->pics_state->pics[0])
+ mask &= ~(1 << 2);
+ cur_priority = get_priority(s, mask);
+ if (priority < cur_priority)
+ /*
+ * higher priority found: an irq should be generated
+ */
+ return (priority + s->priority_add) & 7;
+ else
+ return -1;
+}
+
+/*
+ * raise irq to CPU if necessary. must be called every time the active
+ * irq may change
+ */
+static void pic_update_irq(struct kvm_pic *s)
+{
+ int irq2, irq;
+
+ irq2 = pic_get_irq(&s->pics[1]);
+ if (irq2 >= 0) {
+ /*
+ * if irq request by slave pic, signal master PIC
+ */
+ pic_set_irq1(&s->pics[0], 2, 1);
+ pic_set_irq1(&s->pics[0], 2, 0);
+ }
+ irq = pic_get_irq(&s->pics[0]);
+ pic_irq_request(s->kvm, irq >= 0);
+}
+
+void kvm_pic_update_irq(struct kvm_pic *s)
+{
+ pic_lock(s);
+ pic_update_irq(s);
+ pic_unlock(s);
+}
+
+int kvm_pic_set_irq(void *opaque, int irq, int level)
+{
+ struct kvm_pic *s = opaque;
+ int ret = -1;
+
+ pic_lock(s);
+ if (irq >= 0 && irq < PIC_NUM_PINS) {
+ ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
+ pic_update_irq(s);
+ trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
+ s->pics[irq >> 3].imr, ret == 0);
+ }
+ pic_unlock(s);
+
+ return ret;
+}
+
+/*
+ * acknowledge interrupt 'irq'
+ */
+static inline void pic_intack(struct kvm_kpic_state *s, int irq)
+{
+ s->isr |= 1 << irq;
+ /*
+ * We don't clear a level sensitive interrupt here
+ */
+ if (!(s->elcr & (1 << irq)))
+ s->irr &= ~(1 << irq);
+
+ if (s->auto_eoi) {
+ if (s->rotate_on_auto_eoi)
+ s->priority_add = (irq + 1) & 7;
+ pic_clear_isr(s, irq);
+ }
+
+}
+
+int kvm_pic_read_irq(struct kvm *kvm)
+{
+ int irq, irq2, intno;
+ struct kvm_pic *s = pic_irqchip(kvm);
+
+ pic_lock(s);
+ irq = pic_get_irq(&s->pics[0]);
+ if (irq >= 0) {
+ pic_intack(&s->pics[0], irq);
+ if (irq == 2) {
+ irq2 = pic_get_irq(&s->pics[1]);
+ if (irq2 >= 0)
+ pic_intack(&s->pics[1], irq2);
+ else
+ /*
+ * spurious IRQ on slave controller
+ */
+ irq2 = 7;
+ intno = s->pics[1].irq_base + irq2;
+ irq = irq2 + 8;
+ } else
+ intno = s->pics[0].irq_base + irq;
+ } else {
+ /*
+ * spurious IRQ on host controller
+ */
+ irq = 7;
+ intno = s->pics[0].irq_base + irq;
+ }
+ pic_update_irq(s);
+ pic_unlock(s);
+
+ return intno;
+}
+
+void kvm_pic_reset(struct kvm_kpic_state *s)
+{
+ int irq;
+ struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu;
+ u8 irr = s->irr, isr = s->imr;
+
+ s->last_irr = 0;
+ s->irr = 0;
+ s->imr = 0;
+ s->isr = 0;
+ s->isr_ack = 0xff;
+ s->priority_add = 0;
+ s->irq_base = 0;
+ s->read_reg_select = 0;
+ s->poll = 0;
+ s->special_mask = 0;
+ s->init_state = 0;
+ s->auto_eoi = 0;
+ s->rotate_on_auto_eoi = 0;
+ s->special_fully_nested_mode = 0;
+ s->init4 = 0;
+
+ for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
+ if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
+ if (irr & (1 << irq) || isr & (1 << irq)) {
+ pic_clear_isr(s, irq);
+ }
+ }
+}
+
+static void pic_ioport_write(void *opaque, u32 addr, u32 val)
+{
+ struct kvm_kpic_state *s = opaque;
+ int priority, cmd, irq;
+
+ addr &= 1;
+ if (addr == 0) {
+ if (val & 0x10) {
+ s->init4 = val & 1;
+ s->last_irr = 0;
+ s->imr = 0;
+ s->priority_add = 0;
+ s->special_mask = 0;
+ s->read_reg_select = 0;
+ if (!s->init4) {
+ s->special_fully_nested_mode = 0;
+ s->auto_eoi = 0;
+ }
+ s->init_state = 1;
+ if (val & 0x02)
+ printk(KERN_ERR "single mode not supported");
+ if (val & 0x08)
+ printk(KERN_ERR
+ "level sensitive irq not supported");
+ } else if (val & 0x08) {
+ if (val & 0x04)
+ s->poll = 1;
+ if (val & 0x02)
+ s->read_reg_select = val & 1;
+ if (val & 0x40)
+ s->special_mask = (val >> 5) & 1;
+ } else {
+ cmd = val >> 5;
+ switch (cmd) {
+ case 0:
+ case 4:
+ s->rotate_on_auto_eoi = cmd >> 2;
+ break;
+ case 1: /* end of interrupt */
+ case 5:
+ priority = get_priority(s, s->isr);
+ if (priority != 8) {
+ irq = (priority + s->priority_add) & 7;
+ if (cmd == 5)
+ s->priority_add = (irq + 1) & 7;
+ pic_clear_isr(s, irq);
+ pic_update_irq(s->pics_state);
+ }
+ break;
+ case 3:
+ irq = val & 7;
+ pic_clear_isr(s, irq);
+ pic_update_irq(s->pics_state);
+ break;
+ case 6:
+ s->priority_add = (val + 1) & 7;
+ pic_update_irq(s->pics_state);
+ break;
+ case 7:
+ irq = val & 7;
+ s->priority_add = (irq + 1) & 7;
+ pic_clear_isr(s, irq);
+ pic_update_irq(s->pics_state);
+ break;
+ default:
+ break; /* no operation */
+ }
+ }
+ } else
+ switch (s->init_state) {
+ case 0: { /* normal mode */
+ u8 imr_diff = s->imr ^ val,
+ off = (s == &s->pics_state->pics[0]) ? 0 : 8;
+ s->imr = val;
+ for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
+ if (imr_diff & (1 << irq))
+ kvm_fire_mask_notifiers(
+ s->pics_state->kvm,
+ SELECT_PIC(irq + off),
+ irq + off,
+ !!(s->imr & (1 << irq)));
+ pic_update_irq(s->pics_state);
+ break;
+ }
+ case 1:
+ s->irq_base = val & 0xf8;
+ s->init_state = 2;
+ break;
+ case 2:
+ if (s->init4)
+ s->init_state = 3;
+ else
+ s->init_state = 0;
+ break;
+ case 3:
+ s->special_fully_nested_mode = (val >> 4) & 1;
+ s->auto_eoi = (val >> 1) & 1;
+ s->init_state = 0;
+ break;
+ }
+}
+
+static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
+{
+ int ret;
+
+ ret = pic_get_irq(s);
+ if (ret >= 0) {
+ if (addr1 >> 7) {
+ s->pics_state->pics[0].isr &= ~(1 << 2);
+ s->pics_state->pics[0].irr &= ~(1 << 2);
+ }
+ s->irr &= ~(1 << ret);
+ pic_clear_isr(s, ret);
+ if (addr1 >> 7 || ret != 2)
+ pic_update_irq(s->pics_state);
+ } else {
+ ret = 0x07;
+ pic_update_irq(s->pics_state);
+ }
+
+ return ret;
+}
+
+static u32 pic_ioport_read(void *opaque, u32 addr1)
+{
+ struct kvm_kpic_state *s = opaque;
+ unsigned int addr;
+ int ret;
+
+ addr = addr1;
+ addr &= 1;
+ if (s->poll) {
+ ret = pic_poll_read(s, addr1);
+ s->poll = 0;
+ } else
+ if (addr == 0)
+ if (s->read_reg_select)
+ ret = s->isr;
+ else
+ ret = s->irr;
+ else
+ ret = s->imr;
+ return ret;
+}
+
+static void elcr_ioport_write(void *opaque, u32 addr, u32 val)
+{
+ struct kvm_kpic_state *s = opaque;
+ s->elcr = val & s->elcr_mask;
+}
+
+static u32 elcr_ioport_read(void *opaque, u32 addr1)
+{
+ struct kvm_kpic_state *s = opaque;
+ return s->elcr;
+}
+
+static int picdev_in_range(gpa_t addr)
+{
+ switch (addr) {
+ case 0x20:
+ case 0x21:
+ case 0xa0:
+ case 0xa1:
+ case 0x4d0:
+ case 0x4d1:
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static inline struct kvm_pic *to_pic(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_pic, dev);
+}
+
+static int picdev_write(struct kvm_io_device *this,
+ gpa_t addr, int len, const void *val)
+{
+ struct kvm_pic *s = to_pic(this);
+ unsigned char data = *(unsigned char *)val;
+ if (!picdev_in_range(addr))
+ return -EOPNOTSUPP;
+
+ if (len != 1) {
+ if (printk_ratelimit())
+ printk(KERN_ERR "PIC: non byte write\n");
+ return 0;
+ }
+ pic_lock(s);
+ switch (addr) {
+ case 0x20:
+ case 0x21:
+ case 0xa0:
+ case 0xa1:
+ pic_ioport_write(&s->pics[addr >> 7], addr, data);
+ break;
+ case 0x4d0:
+ case 0x4d1:
+ elcr_ioport_write(&s->pics[addr & 1], addr, data);
+ break;
+ }
+ pic_unlock(s);
+ return 0;
+}
+
+static int picdev_read(struct kvm_io_device *this,
+ gpa_t addr, int len, void *val)
+{
+ struct kvm_pic *s = to_pic(this);
+ unsigned char data = 0;
+ if (!picdev_in_range(addr))
+ return -EOPNOTSUPP;
+
+ if (len != 1) {
+ if (printk_ratelimit())
+ printk(KERN_ERR "PIC: non byte read\n");
+ return 0;
+ }
+ pic_lock(s);
+ switch (addr) {
+ case 0x20:
+ case 0x21:
+ case 0xa0:
+ case 0xa1:
+ data = pic_ioport_read(&s->pics[addr >> 7], addr);
+ break;
+ case 0x4d0:
+ case 0x4d1:
+ data = elcr_ioport_read(&s->pics[addr & 1], addr);
+ break;
+ }
+ *(unsigned char *)val = data;
+ pic_unlock(s);
+ return 0;
+}
+
+/*
+ * callback when PIC0 irq status changed
+ */
+static void pic_irq_request(struct kvm *kvm, int level)
+{
+ struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
+ struct kvm_pic *s = pic_irqchip(kvm);
+ int irq = pic_get_irq(&s->pics[0]);
+
+ s->output = level;
+ if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
+ s->pics[0].isr_ack &= ~(1 << irq);
+ s->wakeup_needed = true;
+ }
+}
+
+static const struct kvm_io_device_ops picdev_ops = {
+ .read = picdev_read,
+ .write = picdev_write,
+};
+
+struct kvm_pic *kvm_create_pic(struct kvm *kvm)
+{
+ struct kvm_pic *s;
+ int ret;
+
+ s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
+ if (!s)
+ return NULL;
+ spin_lock_init(&s->lock);
+ s->kvm = kvm;
+ s->pics[0].elcr_mask = 0xf8;
+ s->pics[1].elcr_mask = 0xde;
+ s->pics[0].pics_state = s;
+ s->pics[1].pics_state = s;
+ s->pics[0].isr_ack = 0xff;
+ s->pics[1].isr_ack = 0xff;
+
+ /*
+ * Initialize PIO device
+ */
+ kvm_iodevice_init(&s->dev, &picdev_ops);
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev);
+ mutex_unlock(&kvm->slots_lock);
+ if (ret < 0) {
+ kfree(s);
+ return NULL;
+ }
+
+ return s;
+}
+
+void kvm_destroy_pic(struct kvm *kvm)
+{
+ struct kvm_pic *vpic = kvm->arch.vpic;
+
+ if (vpic) {
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev);
+ kvm->arch.vpic = NULL;
+ kfree(vpic);
+ }
+}
diff --git a/linux/x86/ioapic.c b/linux/x86/ioapic.c
new file mode 100644
index 0000000..f19b670
--- /dev/null
+++ b/linux/x86/ioapic.c
@@ -0,0 +1,481 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+/*
+ * Copyright (C) 2001 MandrakeSoft S.A.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * MandrakeSoft S.A.
+ * 43, rue d'Aboukir
+ * 75002 Paris - France
+ * http://www.linux-mandrake.com/
+ * http://www.mandrakesoft.com/
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Yunhong Jiang <yunhong.jiang@intel.com>
+ * Yaozu (Eddie) Dong <eddie.dong@intel.com>
+ * Based on Xen 3.1 code.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+#include <linux/hrtimer.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <trace/events/kvm.h>
+
+#include "ioapic.h"
+#include "lapic.h"
+#include "irq.h"
+
+#if 0
+#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg)
+#else
+#define ioapic_debug(fmt, arg...)
+#endif
+static int ioapic_deliver(struct kvm_ioapic *vioapic, int irq);
+
+static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
+ unsigned long addr,
+ unsigned long length)
+{
+ unsigned long result = 0;
+
+ switch (ioapic->ioregsel) {
+ case IOAPIC_REG_VERSION:
+ result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16)
+ | (IOAPIC_VERSION_ID & 0xff));
+ break;
+
+ case IOAPIC_REG_APIC_ID:
+ case IOAPIC_REG_ARB_ID:
+ result = ((ioapic->id & 0xf) << 24);
+ break;
+
+ default:
+ {
+ u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
+ u64 redir_content;
+
+ ASSERT(redir_index < IOAPIC_NUM_PINS);
+
+ redir_content = ioapic->redirtbl[redir_index].bits;
+ result = (ioapic->ioregsel & 0x1) ?
+ (redir_content >> 32) & 0xffffffff :
+ redir_content & 0xffffffff;
+ break;
+ }
+ }
+
+ return result;
+}
+
+static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
+{
+ union kvm_ioapic_redirect_entry *pent;
+ int injected = -1;
+
+ pent = &ioapic->redirtbl[idx];
+
+ if (!pent->fields.mask) {
+ injected = ioapic_deliver(ioapic, idx);
+ if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
+ pent->fields.remote_irr = 1;
+ }
+
+ return injected;
+}
+
+static void update_handled_vectors(struct kvm_ioapic *ioapic)
+{
+ DECLARE_BITMAP(handled_vectors, 256);
+ int i;
+
+ memset(handled_vectors, 0, sizeof(handled_vectors));
+ for (i = 0; i < IOAPIC_NUM_PINS; ++i)
+ __set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors);
+ memcpy(ioapic->handled_vectors, handled_vectors,
+ sizeof(handled_vectors));
+ smp_wmb();
+}
+
+static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
+{
+ unsigned index;
+ bool mask_before, mask_after;
+ union kvm_ioapic_redirect_entry *e;
+
+ switch (ioapic->ioregsel) {
+ case IOAPIC_REG_VERSION:
+ /* Writes are ignored. */
+ break;
+
+ case IOAPIC_REG_APIC_ID:
+ ioapic->id = (val >> 24) & 0xf;
+ break;
+
+ case IOAPIC_REG_ARB_ID:
+ break;
+
+ default:
+ index = (ioapic->ioregsel - 0x10) >> 1;
+
+ ioapic_debug("change redir index %x val %x\n", index, val);
+ if (index >= IOAPIC_NUM_PINS)
+ return;
+ e = &ioapic->redirtbl[index];
+ mask_before = e->fields.mask;
+ if (ioapic->ioregsel & 1) {
+ e->bits &= 0xffffffff;
+ e->bits |= (u64) val << 32;
+ } else {
+ e->bits &= ~0xffffffffULL;
+ e->bits |= (u32) val;
+ e->fields.remote_irr = 0;
+ }
+ update_handled_vectors(ioapic);
+ mask_after = e->fields.mask;
+ if (mask_before != mask_after)
+ kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
+ if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
+ && ioapic->irr & (1 << index))
+ ioapic_service(ioapic, index);
+ break;
+ }
+}
+
+static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
+{
+ union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
+ struct kvm_lapic_irq irqe;
+
+ ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
+ "vector=%x trig_mode=%x\n",
+ entry->fields.dest, entry->fields.dest_mode,
+ entry->fields.delivery_mode, entry->fields.vector,
+ entry->fields.trig_mode);
+
+ irqe.dest_id = entry->fields.dest_id;
+ irqe.vector = entry->fields.vector;
+ irqe.dest_mode = entry->fields.dest_mode;
+ irqe.trig_mode = entry->fields.trig_mode;
+ irqe.delivery_mode = entry->fields.delivery_mode << 8;
+ irqe.level = 1;
+ irqe.shorthand = 0;
+
+#ifdef CONFIG_X86
+ /* Always delivery PIT interrupt to vcpu 0 */
+ if (irq == 0) {
+ irqe.dest_mode = 0; /* Physical mode. */
+ /* need to read apic_id from apic regiest since
+ * it can be rewritten */
+ irqe.dest_id = ioapic->kvm->bsp_vcpu->vcpu_id;
+ }
+#endif
+ return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe);
+}
+
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
+{
+ u32 old_irr;
+ u32 mask = 1 << irq;
+ union kvm_ioapic_redirect_entry entry;
+ int ret = 1;
+
+ spin_lock(&ioapic->lock);
+ old_irr = ioapic->irr;
+ if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
+ entry = ioapic->redirtbl[irq];
+ level ^= entry.fields.polarity;
+ if (!level)
+ ioapic->irr &= ~mask;
+ else {
+ int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
+ ioapic->irr |= mask;
+ if ((edge && old_irr != ioapic->irr) ||
+ (!edge && !entry.fields.remote_irr))
+ ret = ioapic_service(ioapic, irq);
+ else
+ ret = 0; /* report coalesced interrupt */
+ }
+ trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
+ }
+ spin_unlock(&ioapic->lock);
+
+ return ret;
+}
+
+static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
+ int trigger_mode)
+{
+ int i;
+
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+
+ if (ent->fields.vector != vector)
+ continue;
+
+ /*
+ * We are dropping lock while calling ack notifiers because ack
+ * notifier callbacks for assigned devices call into IOAPIC
+ * recursively. Since remote_irr is cleared only after call
+ * to notifiers if the same vector will be delivered while lock
+ * is dropped it will be put into irr and will be delivered
+ * after ack notifier returns.
+ */
+ spin_unlock(&ioapic->lock);
+ kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i);
+ spin_lock(&ioapic->lock);
+
+ if (trigger_mode != IOAPIC_LEVEL_TRIG)
+ continue;
+
+ ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
+ ent->fields.remote_irr = 0;
+ if (!ent->fields.mask && (ioapic->irr & (1 << i)))
+ ioapic_service(ioapic, i);
+ }
+}
+
+void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ smp_rmb();
+ if (!test_bit(vector, ioapic->handled_vectors))
+ return;
+ spin_lock(&ioapic->lock);
+ __kvm_ioapic_update_eoi(ioapic, vector, trigger_mode);
+ spin_unlock(&ioapic->lock);
+}
+
+static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_ioapic, dev);
+}
+
+static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr)
+{
+ return ((addr >= ioapic->base_address &&
+ (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
+}
+
+static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
+ void *val)
+{
+ struct kvm_ioapic *ioapic = to_ioapic(this);
+ u32 result;
+ if (!ioapic_in_range(ioapic, addr))
+ return -EOPNOTSUPP;
+
+ ioapic_debug("addr %lx\n", (unsigned long)addr);
+ ASSERT(!(addr & 0xf)); /* check alignment */
+
+ addr &= 0xff;
+ spin_lock(&ioapic->lock);
+ switch (addr) {
+ case IOAPIC_REG_SELECT:
+ result = ioapic->ioregsel;
+ break;
+
+ case IOAPIC_REG_WINDOW:
+ result = ioapic_read_indirect(ioapic, addr, len);
+ break;
+
+ default:
+ result = 0;
+ break;
+ }
+ spin_unlock(&ioapic->lock);
+
+ switch (len) {
+ case 8:
+ *(u64 *) val = result;
+ break;
+ case 1:
+ case 2:
+ case 4:
+ memcpy(val, (char *)&result, len);
+ break;
+ default:
+ printk(KERN_WARNING "ioapic: wrong length %d\n", len);
+ }
+ return 0;
+}
+
+static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
+ const void *val)
+{
+ struct kvm_ioapic *ioapic = to_ioapic(this);
+ u32 data;
+ if (!ioapic_in_range(ioapic, addr))
+ return -EOPNOTSUPP;
+
+ ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n",
+ (void*)addr, len, val);
+ ASSERT(!(addr & 0xf)); /* check alignment */
+
+ if (len == 4 || len == 8)
+ data = *(u32 *) val;
+ else {
+ printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
+ return 0;
+ }
+
+ addr &= 0xff;
+ spin_lock(&ioapic->lock);
+ switch (addr) {
+ case IOAPIC_REG_SELECT:
+ ioapic->ioregsel = data;
+ break;
+
+ case IOAPIC_REG_WINDOW:
+ ioapic_write_indirect(ioapic, data);
+ break;
+#ifdef CONFIG_IA64
+ case IOAPIC_REG_EOI:
+ __kvm_ioapic_update_eoi(ioapic, data, IOAPIC_LEVEL_TRIG);
+ break;
+#endif
+
+ default:
+ break;
+ }
+ spin_unlock(&ioapic->lock);
+ return 0;
+}
+
+void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
+{
+ int i;
+
+ for (i = 0; i < IOAPIC_NUM_PINS; i++)
+ ioapic->redirtbl[i].fields.mask = 1;
+ ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
+ ioapic->ioregsel = 0;
+ ioapic->irr = 0;
+ ioapic->id = 0;
+ update_handled_vectors(ioapic);
+}
+
+static const struct kvm_io_device_ops ioapic_mmio_ops = {
+ .read = ioapic_mmio_read,
+ .write = ioapic_mmio_write,
+};
+
+int kvm_ioapic_init(struct kvm *kvm)
+{
+ struct kvm_ioapic *ioapic;
+ int ret;
+
+ ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
+ if (!ioapic)
+ return -ENOMEM;
+ spin_lock_init(&ioapic->lock);
+ kvm->arch.vioapic = ioapic;
+ kvm_ioapic_reset(ioapic);
+ kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
+ ioapic->kvm = kvm;
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+ mutex_unlock(&kvm->slots_lock);
+ if (ret < 0) {
+ kvm->arch.vioapic = NULL;
+ kfree(ioapic);
+ }
+
+ return ret;
+}
+
+void kvm_ioapic_destroy(struct kvm *kvm)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ if (ioapic) {
+ kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+ kvm->arch.vioapic = NULL;
+ kfree(ioapic);
+ }
+}
+
+int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+{
+ struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
+ if (!ioapic)
+ return -EINVAL;
+
+ spin_lock(&ioapic->lock);
+ memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
+ spin_unlock(&ioapic->lock);
+ return 0;
+}
+
+int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+{
+ struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
+ if (!ioapic)
+ return -EINVAL;
+
+ spin_lock(&ioapic->lock);
+ memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
+ update_handled_vectors(ioapic);
+ spin_unlock(&ioapic->lock);
+ return 0;
+}
diff --git a/linux/x86/ioapic.h b/linux/x86/ioapic.h
new file mode 100644
index 0000000..0b190c3
--- /dev/null
+++ b/linux/x86/ioapic.h
@@ -0,0 +1,83 @@
+#ifndef __KVM_IO_APIC_H
+#define __KVM_IO_APIC_H
+
+#include <linux/kvm_host.h>
+
+#include "iodev.h"
+
+struct kvm;
+struct kvm_vcpu;
+
+#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
+#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
+#define IOAPIC_EDGE_TRIG 0
+#define IOAPIC_LEVEL_TRIG 1
+
+#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
+#define IOAPIC_MEM_LENGTH 0x100
+
+/* Direct registers. */
+#define IOAPIC_REG_SELECT 0x00
+#define IOAPIC_REG_WINDOW 0x10
+#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */
+
+/* Indirect registers. */
+#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
+#define IOAPIC_REG_VERSION 0x01
+#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
+
+/*ioapic delivery mode*/
+#define IOAPIC_FIXED 0x0
+#define IOAPIC_LOWEST_PRIORITY 0x1
+#define IOAPIC_PMI 0x2
+#define IOAPIC_NMI 0x4
+#define IOAPIC_INIT 0x5
+#define IOAPIC_EXTINT 0x7
+
+struct kvm_ioapic {
+ u64 base_address;
+ u32 ioregsel;
+ u32 id;
+ u32 irr;
+ u32 pad;
+ union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS];
+ unsigned long irq_states[IOAPIC_NUM_PINS];
+ struct kvm_io_device dev;
+ struct kvm *kvm;
+ void (*ack_notifier)(void *opaque, int irq);
+ spinlock_t lock;
+ DECLARE_BITMAP(handled_vectors, 256);
+};
+
+#ifdef DEBUG
+#define ASSERT(x) \
+do { \
+ if (!(x)) { \
+ printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
+ __FILE__, __LINE__, #x); \
+ BUG(); \
+ } \
+} while (0)
+#else
+#define ASSERT(x) do { } while (0)
+#endif
+
+static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
+{
+ return kvm->arch.vioapic;
+}
+
+int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+ int short_hand, int dest, int dest_mode);
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
+void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
+int kvm_ioapic_init(struct kvm *kvm);
+void kvm_ioapic_destroy(struct kvm *kvm);
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
+void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq);
+int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+
+#endif
diff --git a/linux/x86/iodev.h b/linux/x86/iodev.h
new file mode 100644
index 0000000..12fd3ca
--- /dev/null
+++ b/linux/x86/iodev.h
@@ -0,0 +1,70 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __KVM_IODEV_H__
+#define __KVM_IODEV_H__
+
+#include <linux/kvm_types.h>
+#include <asm/errno.h>
+
+struct kvm_io_device;
+
+/**
+ * kvm_io_device_ops are called under kvm slots_lock.
+ * read and write handlers return 0 if the transaction has been handled,
+ * or non-zero to have it passed to the next device.
+ **/
+struct kvm_io_device_ops {
+ int (*read)(struct kvm_io_device *this,
+ gpa_t addr,
+ int len,
+ void *val);
+ int (*write)(struct kvm_io_device *this,
+ gpa_t addr,
+ int len,
+ const void *val);
+ void (*destructor)(struct kvm_io_device *this);
+};
+
+
+struct kvm_io_device {
+ const struct kvm_io_device_ops *ops;
+};
+
+static inline void kvm_iodevice_init(struct kvm_io_device *dev,
+ const struct kvm_io_device_ops *ops)
+{
+ dev->ops = ops;
+}
+
+static inline int kvm_iodevice_read(struct kvm_io_device *dev,
+ gpa_t addr, int l, void *v)
+{
+ return dev->ops->read ? dev->ops->read(dev, addr, l, v) : -EOPNOTSUPP;
+}
+
+static inline int kvm_iodevice_write(struct kvm_io_device *dev,
+ gpa_t addr, int l, const void *v)
+{
+ return dev->ops->write ? dev->ops->write(dev, addr, l, v) : -EOPNOTSUPP;
+}
+
+static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
+{
+ if (dev->ops->destructor)
+ dev->ops->destructor(dev);
+}
+
+#endif /* __KVM_IODEV_H__ */
diff --git a/linux/x86/iommu.c b/linux/x86/iommu.c
new file mode 100644
index 0000000..381142f
--- /dev/null
+++ b/linux/x86/iommu.c
@@ -0,0 +1,356 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) 2006-2008 Intel Corporation
+ * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Author: Allen M. Kay <allen.m.kay@intel.com>
+ * Author: Weidong Han <weidong.han@intel.com>
+ * Author: Ben-Ami Yassour <benami@il.ibm.com>
+ */
+
+#include <linux/list.h>
+#include <linux/kvm_host.h>
+#include <linux/pci.h>
+#include <linux/dmar.h>
+#include <linux/iommu.h>
+#include <linux/intel-iommu.h>
+
+static int kvm_iommu_unmap_memslots(struct kvm *kvm);
+static void kvm_iommu_put_pages(struct kvm *kvm,
+ gfn_t base_gfn, unsigned long npages);
+
+static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn, unsigned long size)
+{
+ gfn_t end_gfn;
+ pfn_t pfn;
+
+ pfn = gfn_to_pfn_memslot(kvm, slot, gfn);
+ end_gfn = gfn + (size >> PAGE_SHIFT);
+ gfn += 1;
+
+ if (is_error_pfn(pfn))
+ return pfn;
+
+ while (gfn < end_gfn)
+ gfn_to_pfn_memslot(kvm, slot, gfn++);
+
+ return pfn;
+}
+
+int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+ gfn_t gfn, end_gfn;
+ pfn_t pfn;
+ int r = 0;
+ struct iommu_domain *domain = kvm->arch.iommu_domain;
+ int flags;
+
+ /* check if iommu exists and in use */
+ if (!domain)
+ return 0;
+
+ gfn = slot->base_gfn;
+ end_gfn = gfn + slot->npages;
+
+ flags = IOMMU_READ | IOMMU_WRITE;
+ if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)
+ flags |= IOMMU_CACHE;
+
+
+ while (gfn < end_gfn) {
+ unsigned long page_size;
+
+ /* Check if already mapped */
+ if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) {
+ gfn += 1;
+ continue;
+ }
+
+ /* Get the page size we could use to map */
+ page_size = kvm_host_page_size(kvm, gfn);
+
+ /* Make sure the page_size does not exceed the memslot */
+ while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn)
+ page_size >>= 1;
+
+ /* Make sure gfn is aligned to the page size we want to map */
+ while ((gfn << PAGE_SHIFT) & (page_size - 1))
+ page_size >>= 1;
+
+ /*
+ * Pin all pages we are about to map in memory. This is
+ * important because we unmap and unpin in 4kb steps later.
+ */
+ pfn = kvm_pin_pages(kvm, slot, gfn, page_size);
+ if (is_error_pfn(pfn)) {
+ gfn += 1;
+ continue;
+ }
+
+ /* Map into IO address space */
+ r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn),
+ get_order(page_size), flags);
+ if (r) {
+ printk(KERN_ERR "kvm_iommu_map_address:"
+ "iommu failed to map pfn=%llx\n", pfn);
+ goto unmap_pages;
+ }
+
+ gfn += page_size >> PAGE_SHIFT;
+
+
+ }
+
+ return 0;
+
+unmap_pages:
+ kvm_iommu_put_pages(kvm, slot->base_gfn, gfn);
+ return r;
+}
+
+static int kvm_iommu_map_memslots(struct kvm *kvm)
+{
+ int i, idx, r = 0;
+ struct kvm_memslots *slots;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ slots = kvm_memslots(kvm);
+
+ for (i = 0; i < slots->nmemslots; i++) {
+ r = kvm_iommu_map_pages(kvm, &slots->memslots[i]);
+ if (r)
+ break;
+ }
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return r;
+}
+
+int kvm_assign_device(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev)
+{
+ struct pci_dev *pdev = NULL;
+ struct iommu_domain *domain = kvm->arch.iommu_domain;
+ int r, last_flags;
+
+ /* check if iommu exists and in use */
+ if (!domain)
+ return 0;
+
+ pdev = assigned_dev->dev;
+ if (pdev == NULL)
+ return -ENODEV;
+
+ r = iommu_attach_device(domain, &pdev->dev);
+ if (r) {
+ printk(KERN_ERR "assign device %x:%x:%x.%x failed",
+ pci_domain_nr(pdev->bus),
+ pdev->bus->number,
+ PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn));
+ return r;
+ }
+
+ last_flags = kvm->arch.iommu_flags;
+ if (iommu_domain_has_cap(kvm->arch.iommu_domain,
+ IOMMU_CAP_CACHE_COHERENCY))
+ kvm->arch.iommu_flags |= KVM_IOMMU_CACHE_COHERENCY;
+
+ /* Check if need to update IOMMU page table for guest memory */
+ if ((last_flags ^ kvm->arch.iommu_flags) ==
+ KVM_IOMMU_CACHE_COHERENCY) {
+ kvm_iommu_unmap_memslots(kvm);
+ r = kvm_iommu_map_memslots(kvm);
+ if (r)
+ goto out_unmap;
+ }
+
+ printk(KERN_DEBUG "assign device %x:%x:%x.%x\n",
+ assigned_dev->host_segnr,
+ assigned_dev->host_busnr,
+ PCI_SLOT(assigned_dev->host_devfn),
+ PCI_FUNC(assigned_dev->host_devfn));
+
+ return 0;
+out_unmap:
+ kvm_iommu_unmap_memslots(kvm);
+ return r;
+}
+
+int kvm_deassign_device(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev)
+{
+ struct iommu_domain *domain = kvm->arch.iommu_domain;
+ struct pci_dev *pdev = NULL;
+
+ /* check if iommu exists and in use */
+ if (!domain)
+ return 0;
+
+ pdev = assigned_dev->dev;
+ if (pdev == NULL)
+ return -ENODEV;
+
+ iommu_detach_device(domain, &pdev->dev);
+
+ printk(KERN_DEBUG "deassign device %x:%x:%x.%x\n",
+ assigned_dev->host_segnr,
+ assigned_dev->host_busnr,
+ PCI_SLOT(assigned_dev->host_devfn),
+ PCI_FUNC(assigned_dev->host_devfn));
+
+ return 0;
+}
+
+int kvm_iommu_map_guest(struct kvm *kvm)
+{
+ int r;
+
+ if (!iommu_found()) {
+ printk(KERN_ERR "%s: iommu not found\n", __func__);
+ return -ENODEV;
+ }
+
+ kvm->arch.iommu_domain = iommu_domain_alloc();
+ if (!kvm->arch.iommu_domain)
+ return -ENOMEM;
+
+ r = kvm_iommu_map_memslots(kvm);
+ if (r)
+ goto out_unmap;
+
+ return 0;
+
+out_unmap:
+ kvm_iommu_unmap_memslots(kvm);
+ return r;
+}
+
+static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages)
+{
+ unsigned long i;
+
+ for (i = 0; i < npages; ++i)
+ kvm_release_pfn_clean(pfn + i);
+}
+
+static void kvm_iommu_put_pages(struct kvm *kvm,
+ gfn_t base_gfn, unsigned long npages)
+{
+ struct iommu_domain *domain;
+ gfn_t end_gfn, gfn;
+ pfn_t pfn;
+ u64 phys;
+
+ domain = kvm->arch.iommu_domain;
+ end_gfn = base_gfn + npages;
+ gfn = base_gfn;
+
+ /* check if iommu exists and in use */
+ if (!domain)
+ return;
+
+ while (gfn < end_gfn) {
+ unsigned long unmap_pages;
+ int order;
+
+ /* Get physical address */
+ phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn));
+ pfn = phys >> PAGE_SHIFT;
+
+ /* Unmap address from IO address space */
+ order = iommu_unmap(domain, gfn_to_gpa(gfn), 0);
+ unmap_pages = 1ULL << order;
+
+ /* Unpin all pages we just unmapped to not leak any memory */
+ kvm_unpin_pages(kvm, pfn, unmap_pages);
+
+ gfn += unmap_pages;
+ }
+}
+
+static int kvm_iommu_unmap_memslots(struct kvm *kvm)
+{
+ int i, idx;
+ struct kvm_memslots *slots;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ slots = kvm_memslots(kvm);
+
+ for (i = 0; i < slots->nmemslots; i++) {
+ kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn,
+ slots->memslots[i].npages);
+ }
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return 0;
+}
+
+int kvm_iommu_unmap_guest(struct kvm *kvm)
+{
+ struct iommu_domain *domain = kvm->arch.iommu_domain;
+
+ /* check if iommu exists and in use */
+ if (!domain)
+ return 0;
+
+ kvm_iommu_unmap_memslots(kvm);
+ iommu_domain_free(domain);
+ return 0;
+}
diff --git a/linux/x86/irq.c b/linux/x86/irq.c
new file mode 100644
index 0000000..d1372c0
--- /dev/null
+++ b/linux/x86/irq.c
@@ -0,0 +1,136 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+/*
+ * irq.c: API for in kernel interrupt controller
+ * Copyright (c) 2007, Intel Corporation.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kvm_host.h>
+
+#include "irq.h"
+#include "i8254.h"
+#include "x86.h"
+
+/*
+ * check if there are pending timer events
+ * to be processed.
+ */
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ return apic_has_pending_timer(vcpu);
+}
+EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
+
+/*
+ * check if there is pending interrupt without
+ * intack.
+ */
+int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
+{
+ struct kvm_pic *s;
+
+ if (!irqchip_in_kernel(v->kvm))
+ return v->arch.interrupt.pending;
+
+ if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */
+ if (kvm_apic_accept_pic_intr(v)) {
+ s = pic_irqchip(v->kvm); /* PIC */
+ return s->output;
+ } else
+ return 0;
+ }
+ return 1;
+}
+EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
+
+/*
+ * Read pending interrupt vector and intack.
+ */
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
+{
+ struct kvm_pic *s;
+ int vector;
+
+ if (!irqchip_in_kernel(v->kvm))
+ return v->arch.interrupt.nr;
+
+ vector = kvm_get_apic_interrupt(v); /* APIC */
+ if (vector == -1) {
+ if (kvm_apic_accept_pic_intr(v)) {
+ s = pic_irqchip(v->kvm);
+ s->output = 0; /* PIC */
+ vector = kvm_pic_read_irq(v->kvm);
+ }
+ }
+ return vector;
+}
+EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
+
+void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
+{
+ kvm_inject_apic_timer_irqs(vcpu);
+ /* TODO: PIT, RTC etc. */
+}
+EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
+
+void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
+{
+ __kvm_migrate_apic_timer(vcpu);
+ __kvm_migrate_pit_timer(vcpu);
+}
diff --git a/linux/x86/irq.h b/linux/x86/irq.h
new file mode 100644
index 0000000..90f1923
--- /dev/null
+++ b/linux/x86/irq.h
@@ -0,0 +1,106 @@
+/*
+ * irq.h: in kernel interrupt controller related definitions
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ */
+
+#ifndef __IRQ_H
+#define __IRQ_H
+
+#include <linux/mm.h>
+#include <linux/hrtimer.h>
+#include <linux/kvm_host.h>
+#include <linux/spinlock.h>
+
+#include "iodev.h"
+#include "ioapic.h"
+#include "lapic.h"
+
+#define PIC_NUM_PINS 16
+#define SELECT_PIC(irq) \
+ ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE)
+
+struct kvm;
+struct kvm_vcpu;
+
+struct kvm_kpic_state {
+ u8 last_irr; /* edge detection */
+ u8 irr; /* interrupt request register */
+ u8 imr; /* interrupt mask register */
+ u8 isr; /* interrupt service register */
+ u8 priority_add; /* highest irq priority */
+ u8 irq_base;
+ u8 read_reg_select;
+ u8 poll;
+ u8 special_mask;
+ u8 init_state;
+ u8 auto_eoi;
+ u8 rotate_on_auto_eoi;
+ u8 special_fully_nested_mode;
+ u8 init4; /* true if 4 byte init */
+ u8 elcr; /* PIIX edge/trigger selection */
+ u8 elcr_mask;
+ u8 isr_ack; /* interrupt ack detection */
+ struct kvm_pic *pics_state;
+};
+
+struct kvm_pic {
+ spinlock_t lock;
+ bool wakeup_needed;
+ unsigned pending_acks;
+ struct kvm *kvm;
+ struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
+ int output; /* intr from master PIC */
+ struct kvm_io_device dev;
+ void (*ack_notifier)(void *opaque, int irq);
+ unsigned long irq_states[16];
+};
+
+struct kvm_pic *kvm_create_pic(struct kvm *kvm);
+void kvm_destroy_pic(struct kvm *kvm);
+int kvm_pic_read_irq(struct kvm *kvm);
+void kvm_pic_update_irq(struct kvm_pic *s);
+void kvm_pic_clear_isr_ack(struct kvm *kvm);
+
+static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
+{
+ return kvm->arch.vpic;
+}
+
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+ int ret;
+
+ ret = (pic_irqchip(kvm) != NULL);
+ smp_rmb();
+ return ret;
+}
+
+void kvm_pic_reset(struct kvm_kpic_state *s);
+
+void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
+void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
+void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
+void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
+
+int pit_has_pending_timer(struct kvm_vcpu *vcpu);
+int apic_has_pending_timer(struct kvm_vcpu *vcpu);
+
+#endif
diff --git a/linux/x86/irq_comm.c b/linux/x86/irq_comm.c
new file mode 100644
index 0000000..646d331
--- /dev/null
+++ b/linux/x86/irq_comm.c
@@ -0,0 +1,514 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+/*
+ * irq_comm.c: Common API for in kernel interrupt controller
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/slab.h>
+#include <trace/events/kvm.h>
+
+#include <asm/msidef.h>
+#ifdef CONFIG_IA64
+#include <asm/iosapic.h>
+#endif
+
+#include "irq.h"
+
+#include "ioapic.h"
+
+static inline int kvm_irq_line_state(unsigned long *irq_state,
+ int irq_source_id, int level)
+{
+ /* Logical OR for level trig interrupt */
+ if (level)
+ set_bit(irq_source_id, irq_state);
+ else
+ clear_bit(irq_source_id, irq_state);
+
+ return !!(*irq_state);
+}
+
+static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level)
+{
+#ifdef CONFIG_X86
+ struct kvm_pic *pic = pic_irqchip(kvm);
+ level = kvm_irq_line_state(&pic->irq_states[e->irqchip.pin],
+ irq_source_id, level);
+ return kvm_pic_set_irq(pic, e->irqchip.pin, level);
+#else
+ return -1;
+#endif
+}
+
+static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ level = kvm_irq_line_state(&ioapic->irq_states[e->irqchip.pin],
+ irq_source_id, level);
+
+ return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
+}
+
+inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
+{
+#ifdef CONFIG_IA64
+ return irq->delivery_mode ==
+ (IOSAPIC_LOWEST_PRIORITY << IOSAPIC_DELIVERY_SHIFT);
+#else
+ return irq->delivery_mode == APIC_DM_LOWEST;
+#endif
+}
+
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq)
+{
+ int i, r = -1;
+ struct kvm_vcpu *vcpu, *lowest = NULL;
+
+ if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
+ kvm_is_dm_lowest_prio(irq))
+ printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (!kvm_is_dm_lowest_prio(irq)) {
+ if (r < 0)
+ r = 0;
+ r += kvm_apic_set_irq(vcpu, irq);
+ } else if (kvm_lapic_enabled(vcpu)) {
+ if (!lowest)
+ lowest = vcpu;
+ else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
+ lowest = vcpu;
+ }
+ }
+
+ if (lowest)
+ r = kvm_apic_set_irq(lowest, irq);
+
+ return r;
+}
+
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level)
+{
+ struct kvm_lapic_irq irq;
+
+ if (!level)
+ return -1;
+
+ trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
+
+ irq.dest_id = (e->msi.address_lo &
+ MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
+ irq.vector = (e->msi.data &
+ MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
+ irq.dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
+ irq.trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
+ irq.delivery_mode = e->msi.data & 0x700;
+ irq.level = 1;
+ irq.shorthand = 0;
+
+ /* TODO Deal with RH bit of MSI message address */
+ return kvm_irq_delivery_to_apic(kvm, NULL, &irq);
+}
+
+/*
+ * Return value:
+ * < 0 Interrupt was ignored (masked or not delivered for other reasons)
+ * = 0 Interrupt was coalesced (previous irq is still pending)
+ * > 0 Number of CPUs interrupt was delivered to
+ */
+int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
+{
+ struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
+ int ret = -1, i = 0;
+ struct kvm_irq_routing_table *irq_rt;
+ struct hlist_node *n;
+
+ trace_kvm_set_irq(irq, level, irq_source_id);
+
+ /* Not possible to detect if the guest uses the PIC or the
+ * IOAPIC. So set the bit in both. The guest will ignore
+ * writes to the unused one.
+ */
+ rcu_read_lock();
+ irq_rt = rcu_dereference(kvm->irq_routing);
+ if (irq < irq_rt->nr_rt_entries)
+ hlist_for_each_entry(e, n, &irq_rt->map[irq], link)
+ irq_set[i++] = *e;
+ rcu_read_unlock();
+
+ while(i--) {
+ int r;
+ r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level);
+ if (r < 0)
+ continue;
+
+ ret = r + ((ret < 0) ? 0 : ret);
+ }
+
+ return ret;
+}
+
+void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
+{
+ struct kvm_irq_ack_notifier *kian;
+ struct hlist_node *n;
+ int gsi;
+
+ trace_kvm_ack_irq(irqchip, pin);
+
+ rcu_read_lock();
+ gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+ if (gsi != -1)
+ hlist_for_each_entry_rcu(kian, n, &kvm->irq_ack_notifier_list,
+ link)
+ if (kian->gsi == gsi)
+ kian->irq_acked(kian);
+ rcu_read_unlock();
+}
+
+void kvm_register_irq_ack_notifier(struct kvm *kvm,
+ struct kvm_irq_ack_notifier *kian)
+{
+ mutex_lock(&kvm->irq_lock);
+ hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
+ mutex_unlock(&kvm->irq_lock);
+}
+
+void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
+ struct kvm_irq_ack_notifier *kian)
+{
+ mutex_lock(&kvm->irq_lock);
+ hlist_del_init_rcu(&kian->link);
+ mutex_unlock(&kvm->irq_lock);
+ synchronize_rcu();
+}
+
+int kvm_request_irq_source_id(struct kvm *kvm)
+{
+ unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
+ int irq_source_id;
+
+ mutex_lock(&kvm->irq_lock);
+ irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
+
+ if (irq_source_id >= BITS_PER_LONG) {
+ printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n");
+ irq_source_id = -EFAULT;
+ goto unlock;
+ }
+
+ ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+ set_bit(irq_source_id, bitmap);
+unlock:
+ mutex_unlock(&kvm->irq_lock);
+
+ return irq_source_id;
+}
+
+void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
+{
+ int i;
+
+ ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+
+ mutex_lock(&kvm->irq_lock);
+ if (irq_source_id < 0 ||
+ irq_source_id >= BITS_PER_LONG) {
+ printk(KERN_ERR "kvm: IRQ source ID out of range!\n");
+ goto unlock;
+ }
+ clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
+ if (!irqchip_in_kernel(kvm))
+ goto unlock;
+
+ for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) {
+ clear_bit(irq_source_id, &kvm->arch.vioapic->irq_states[i]);
+ if (i >= 16)
+ continue;
+#ifdef CONFIG_X86
+ clear_bit(irq_source_id, &pic_irqchip(kvm)->irq_states[i]);
+#endif
+ }
+unlock:
+ mutex_unlock(&kvm->irq_lock);
+}
+
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ mutex_lock(&kvm->irq_lock);
+ kimn->irq = irq;
+ hlist_add_head_rcu(&kimn->link, &kvm->mask_notifier_list);
+ mutex_unlock(&kvm->irq_lock);
+}
+
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ mutex_lock(&kvm->irq_lock);
+ hlist_del_rcu(&kimn->link);
+ mutex_unlock(&kvm->irq_lock);
+ synchronize_rcu();
+}
+
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+ bool mask)
+{
+ struct kvm_irq_mask_notifier *kimn;
+ struct hlist_node *n;
+ int gsi;
+
+ rcu_read_lock();
+ gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+ if (gsi != -1)
+ hlist_for_each_entry_rcu(kimn, n, &kvm->mask_notifier_list, link)
+ if (kimn->irq == gsi)
+ kimn->func(kimn, mask);
+ rcu_read_unlock();
+}
+
+void kvm_free_irq_routing(struct kvm *kvm)
+{
+ /* Called only during vm destruction. Nobody can use the pointer
+ at this stage */
+ kfree(kvm->irq_routing);
+}
+
+static int setup_routing_entry(struct kvm_irq_routing_table *rt,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+{
+ int r = -EINVAL;
+ int delta;
+ unsigned max_pin;
+ struct kvm_kernel_irq_routing_entry *ei;
+ struct hlist_node *n;
+
+ /*
+ * Do not allow GSI to be mapped to the same irqchip more than once.
+ * Allow only one to one mapping between GSI and MSI.
+ */
+ hlist_for_each_entry(ei, n, &rt->map[ue->gsi], link)
+ if (ei->type == KVM_IRQ_ROUTING_MSI ||
+ ue->u.irqchip.irqchip == ei->irqchip.irqchip)
+ return r;
+
+ e->gsi = ue->gsi;
+ e->type = ue->type;
+ switch (ue->type) {
+ case KVM_IRQ_ROUTING_IRQCHIP:
+ delta = 0;
+ switch (ue->u.irqchip.irqchip) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ e->set = kvm_set_pic_irq;
+ max_pin = 16;
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ e->set = kvm_set_pic_irq;
+ max_pin = 16;
+ delta = 8;
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ max_pin = KVM_IOAPIC_NUM_PINS;
+ e->set = kvm_set_ioapic_irq;
+ break;
+ default:
+ goto out;
+ }
+ e->irqchip.irqchip = ue->u.irqchip.irqchip;
+ e->irqchip.pin = ue->u.irqchip.pin + delta;
+ if (e->irqchip.pin >= max_pin)
+ goto out;
+ rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi;
+ break;
+ case KVM_IRQ_ROUTING_MSI:
+ e->set = kvm_set_msi;
+ e->msi.address_lo = ue->u.msi.address_lo;
+ e->msi.address_hi = ue->u.msi.address_hi;
+ e->msi.data = ue->u.msi.data;
+ break;
+ default:
+ goto out;
+ }
+
+ hlist_add_head(&e->link, &rt->map[e->gsi]);
+ r = 0;
+out:
+ return r;
+}
+
+
+int kvm_set_irq_routing(struct kvm *kvm,
+ const struct kvm_irq_routing_entry *ue,
+ unsigned nr,
+ unsigned flags)
+{
+ struct kvm_irq_routing_table *new, *old;
+ u32 i, j, nr_rt_entries = 0;
+ int r;
+
+ for (i = 0; i < nr; ++i) {
+ if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES)
+ return -EINVAL;
+ nr_rt_entries = max(nr_rt_entries, ue[i].gsi);
+ }
+
+ nr_rt_entries += 1;
+
+ new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head))
+ + (nr * sizeof(struct kvm_kernel_irq_routing_entry)),
+ GFP_KERNEL);
+
+ if (!new)
+ return -ENOMEM;
+
+ new->rt_entries = (void *)&new->map[nr_rt_entries];
+
+ new->nr_rt_entries = nr_rt_entries;
+ for (i = 0; i < 3; i++)
+ for (j = 0; j < KVM_IOAPIC_NUM_PINS; j++)
+ new->chip[i][j] = -1;
+
+ for (i = 0; i < nr; ++i) {
+ r = -EINVAL;
+ if (ue->flags)
+ goto out;
+ r = setup_routing_entry(new, &new->rt_entries[i], ue);
+ if (r)
+ goto out;
+ ++ue;
+ }
+
+ mutex_lock(&kvm->irq_lock);
+ old = kvm->irq_routing;
+ kvm_irq_routing_update(kvm, new);
+ mutex_unlock(&kvm->irq_lock);
+
+ synchronize_rcu();
+
+ new = old;
+ r = 0;
+
+out:
+ kfree(new);
+ return r;
+}
+
+#define IOAPIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC, .u.irqchip.pin = (irq) }
+#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
+
+#ifdef CONFIG_X86
+# define PIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip.irqchip = SELECT_PIC(irq), .u.irqchip.pin = (irq) % 8 }
+# define ROUTING_ENTRY2(irq) \
+ IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
+#else
+# define ROUTING_ENTRY2(irq) \
+ IOAPIC_ROUTING_ENTRY(irq)
+#endif
+
+static const struct kvm_irq_routing_entry default_routing[] = {
+ ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
+ ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
+ ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
+ ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
+ ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
+ ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
+ ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
+ ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
+ ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
+ ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
+ ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
+ ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
+#ifdef CONFIG_IA64
+ ROUTING_ENTRY1(24), ROUTING_ENTRY1(25),
+ ROUTING_ENTRY1(26), ROUTING_ENTRY1(27),
+ ROUTING_ENTRY1(28), ROUTING_ENTRY1(29),
+ ROUTING_ENTRY1(30), ROUTING_ENTRY1(31),
+ ROUTING_ENTRY1(32), ROUTING_ENTRY1(33),
+ ROUTING_ENTRY1(34), ROUTING_ENTRY1(35),
+ ROUTING_ENTRY1(36), ROUTING_ENTRY1(37),
+ ROUTING_ENTRY1(38), ROUTING_ENTRY1(39),
+ ROUTING_ENTRY1(40), ROUTING_ENTRY1(41),
+ ROUTING_ENTRY1(42), ROUTING_ENTRY1(43),
+ ROUTING_ENTRY1(44), ROUTING_ENTRY1(45),
+ ROUTING_ENTRY1(46), ROUTING_ENTRY1(47),
+#endif
+};
+
+int kvm_setup_default_irq_routing(struct kvm *kvm)
+{
+ return kvm_set_irq_routing(kvm, default_routing,
+ ARRAY_SIZE(default_routing), 0);
+}
diff --git a/linux/x86/kvm_cache_regs.h b/linux/x86/kvm_cache_regs.h
new file mode 100644
index 0000000..3377d53
--- /dev/null
+++ b/linux/x86/kvm_cache_regs.h
@@ -0,0 +1,109 @@
+#ifndef ASM_KVM_CACHE_REGS_H
+#define ASM_KVM_CACHE_REGS_H
+
+#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
+#define KVM_POSSIBLE_CR4_GUEST_BITS \
+ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_OSXMMEXCPT | X86_CR4_PGE)
+
+static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
+ kvm_x86_ops->cache_reg(vcpu, reg);
+
+ return vcpu->arch.regs[reg];
+}
+
+static inline void kvm_register_write(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg,
+ unsigned long val)
+{
+ vcpu->arch.regs[reg] = val;
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
+{
+ return kvm_register_read(vcpu, VCPU_REGS_RIP);
+}
+
+static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ kvm_register_write(vcpu, VCPU_REGS_RIP, val);
+}
+
+static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
+{
+ might_sleep(); /* on svm */
+
+ if (!test_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail))
+ kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
+
+ return vcpu->arch.walk_mmu->pdptrs[index];
+}
+
+static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index)
+{
+ load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu));
+
+ return mmu->pdptrs[index];
+}
+
+static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
+{
+ ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
+ if (tmask & vcpu->arch.cr0_guest_owned_bits)
+ kvm_x86_ops->decache_cr0_guest_bits(vcpu);
+ return vcpu->arch.cr0 & mask;
+}
+
+static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, ~0UL);
+}
+
+static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
+{
+ ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
+ if (tmask & vcpu->arch.cr4_guest_owned_bits)
+ kvm_x86_ops->decache_cr4_guest_bits(vcpu);
+ return vcpu->arch.cr4 & mask;
+}
+
+static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
+{
+ if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+ kvm_x86_ops->decache_cr3(vcpu);
+ return vcpu->arch.cr3;
+}
+
+static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, ~0UL);
+}
+
+static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
+{
+ return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u)
+ | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
+}
+
+static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.hflags |= HF_GUEST_MASK;
+}
+
+static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.hflags &= ~HF_GUEST_MASK;
+}
+
+static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hflags & HF_GUEST_MASK;
+}
+
+#endif
diff --git a/linux/x86/kvm_main.c b/linux/x86/kvm_main.c
new file mode 100644
index 0000000..355948e
--- /dev/null
+++ b/linux/x86/kvm_main.c
@@ -0,0 +1,2679 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "iodev.h"
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/percpu.h>
+#include <linux/mm.h>
+#include <linux/miscdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/reboot.h>
+#include <linux/debugfs.h>
+#include <linux/highmem.h>
+#include <linux/file.h>
+#include <linux/sysdev.h>
+#include <linux/cpu.h>
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/smp.h>
+#include <linux/anon_inodes.h>
+#include <linux/profile.h>
+#include <linux/kvm_para.h>
+#include <linux/pagemap.h>
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/bitops.h>
+#include <linux/spinlock.h>
+#include <linux/compat.h>
+#include <linux/srcu.h>
+#include <linux/hugetlb.h>
+#include <linux/slab.h>
+
+#include <asm/processor.h>
+#include <asm/io.h>
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm-generic/bitops/le.h>
+
+#include "coalesced_mmio.h"
+#include "async_pf.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/kvm.h>
+
+MODULE_INFO(version, "kvm-kmod-2.6.38-rc7");
+MODULE_AUTHOR("Qumranet");
+MODULE_LICENSE("GPL");
+
+/*
+ * Ordering of locks:
+ *
+ * kvm->lock --> kvm->slots_lock --> kvm->irq_lock
+ */
+
+DEFINE_SPINLOCK(kvm_lock);
+LIST_HEAD(vm_list);
+
+static cpumask_var_t cpus_hardware_enabled;
+static int kvm_usage_count = 0;
+static atomic_t hardware_enable_failed;
+
+struct kmem_cache *kvm_vcpu_cache;
+EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
+
+static __read_mostly struct preempt_ops kvm_preempt_ops;
+
+struct dentry *kvm_debugfs_dir;
+
+static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
+ unsigned long arg);
+static int hardware_enable_all(void);
+static void hardware_disable_all(void);
+
+static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
+
+bool kvm_rebooting;
+EXPORT_SYMBOL_GPL(kvm_rebooting);
+
+static bool largepages_enabled = true;
+
+static struct page *hwpoison_page;
+static pfn_t hwpoison_pfn;
+
+static struct page *fault_page;
+static pfn_t fault_pfn;
+
+inline int kvm_is_mmio_pfn(pfn_t pfn)
+{
+ if (pfn_valid(pfn)) {
+ int reserved;
+ struct page *tail = pfn_to_page(pfn);
+ struct page *head = compound_trans_head(tail);
+ reserved = PageReserved(head);
+ if (head != tail) {
+ /*
+ * "head" is not a dangling pointer
+ * (compound_trans_head takes care of that)
+ * but the hugepage may have been splitted
+ * from under us (and we may not hold a
+ * reference count on the head page so it can
+ * be reused before we run PageReferenced), so
+ * we've to check PageTail before returning
+ * what we just read.
+ */
+ smp_rmb();
+ if (PageTail(tail))
+ return reserved;
+ }
+ return PageReserved(tail);
+ }
+
+ return true;
+}
+
+/*
+ * Switches to specified vcpu, until a matching vcpu_put()
+ */
+void vcpu_load(struct kvm_vcpu *vcpu)
+{
+ int cpu;
+
+ mutex_lock(&vcpu->mutex);
+ cpu = get_cpu();
+ preempt_notifier_register(&vcpu->preempt_notifier);
+ kvm_arch_vcpu_load(vcpu, cpu);
+ put_cpu();
+}
+
+void vcpu_put(struct kvm_vcpu *vcpu)
+{
+ preempt_disable();
+ kvm_arch_vcpu_put(vcpu);
+ kvm_fire_urn();
+ preempt_notifier_unregister(&vcpu->preempt_notifier);
+ preempt_enable();
+ mutex_unlock(&vcpu->mutex);
+}
+
+static void ack_flush(void *_completed)
+{
+}
+
+static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
+{
+ int i, cpu, me;
+ cpumask_var_t cpus;
+ bool called = true;
+ struct kvm_vcpu *vcpu;
+
+ zalloc_cpumask_var(&cpus, GFP_ATOMIC);
+
+ raw_spin_lock(&kvm->requests_lock);
+ me = smp_processor_id();
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (kvm_make_check_request(req, vcpu))
+ continue;
+ cpu = vcpu->cpu;
+ if (cpus != NULL && cpu != -1 && cpu != me)
+ cpumask_set_cpu(cpu, cpus);
+ }
+ if (unlikely(cpus == NULL))
+ smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
+ else if (!cpumask_empty(cpus))
+ smp_call_function_many(cpus, ack_flush, NULL, 1);
+ else
+ called = false;
+ raw_spin_unlock(&kvm->requests_lock);
+ free_cpumask_var(cpus);
+ return called;
+}
+
+void kvm_flush_remote_tlbs(struct kvm *kvm)
+{
+ int dirty_count = kvm->tlbs_dirty;
+
+ smp_mb();
+ if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
+ ++kvm->stat.remote_tlb_flush;
+ cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
+}
+
+void kvm_reload_remote_mmus(struct kvm *kvm)
+{
+ make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
+}
+
+int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
+{
+ struct page *page;
+ int r;
+
+ mutex_init(&vcpu->mutex);
+ vcpu->cpu = -1;
+ vcpu->kvm = kvm;
+ vcpu->vcpu_id = id;
+ init_waitqueue_head(&vcpu->wq);
+ kvm_async_pf_vcpu_init(vcpu);
+
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page) {
+ r = -ENOMEM;
+ goto fail;
+ }
+ vcpu->run = page_address(page);
+
+ r = kvm_arch_vcpu_init(vcpu);
+ if (r < 0)
+ goto fail_free_run;
+ return 0;
+
+fail_free_run:
+ free_page((unsigned long)vcpu->run);
+fail:
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_init);
+
+void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+ kvm_arch_vcpu_uninit(vcpu);
+ free_page((unsigned long)vcpu->run);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
+
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
+{
+ return container_of(mn, struct kvm, mmu_notifier);
+}
+
+static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ int need_tlb_flush, idx;
+
+ /*
+ * When ->invalidate_page runs, the linux pte has been zapped
+ * already but the page is still allocated until
+ * ->invalidate_page returns. So if we increase the sequence
+ * here the kvm page fault will notice if the spte can't be
+ * established because the page is going to be freed. If
+ * instead the kvm page fault establishes the spte before
+ * ->invalidate_page runs, kvm_unmap_hva will release it
+ * before returning.
+ *
+ * The sequence increase only need to be seen at spin_unlock
+ * time, and not at spin_lock time.
+ *
+ * Increasing the sequence after the spin_unlock would be
+ * unsafe because the kvm page fault could then establish the
+ * pte after kvm_unmap_hva returned, without noticing the page
+ * is going to be freed.
+ */
+ idx = srcu_read_lock(&kvm->srcu);
+ spin_lock(&kvm->mmu_lock);
+ kvm->mmu_notifier_seq++;
+ need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ /* we've to flush the tlb before the pages can be freed */
+ if (need_tlb_flush)
+ kvm_flush_remote_tlbs(kvm);
+
+}
+
+#ifdef MMU_NOTIFIER_HAS_CHANGE_PTE
+static
+#endif
+void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address,
+ pte_t pte)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ int idx;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ spin_lock(&kvm->mmu_lock);
+ kvm->mmu_notifier_seq++;
+ kvm_set_spte_hva(kvm, address, pte);
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
+}
+
+static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ int need_tlb_flush = 0, idx;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ spin_lock(&kvm->mmu_lock);
+ /*
+ * The count increase must become visible at unlock time as no
+ * spte can be established without taking the mmu_lock and
+ * count is also read inside the mmu_lock critical section.
+ */
+ kvm->mmu_notifier_count++;
+ for (; start < end; start += PAGE_SIZE)
+ need_tlb_flush |= kvm_unmap_hva(kvm, start);
+ need_tlb_flush |= kvm->tlbs_dirty;
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ /* we've to flush the tlb before the pages can be freed */
+ if (need_tlb_flush)
+ kvm_flush_remote_tlbs(kvm);
+}
+
+static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+
+ spin_lock(&kvm->mmu_lock);
+ /*
+ * This sequence increase will notify the kvm page fault that
+ * the page that is going to be mapped in the spte could have
+ * been freed.
+ */
+ kvm->mmu_notifier_seq++;
+ /*
+ * The above sequence increase must be visible before the
+ * below count decrease but both values are read by the kvm
+ * page fault under mmu_lock spinlock so we don't need to add
+ * a smb_wmb() here in between the two.
+ */
+ kvm->mmu_notifier_count--;
+ spin_unlock(&kvm->mmu_lock);
+
+ BUG_ON(kvm->mmu_notifier_count < 0);
+}
+
+static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ int young, idx;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ spin_lock(&kvm->mmu_lock);
+ young = kvm_age_hva(kvm, address);
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ if (young)
+ kvm_flush_remote_tlbs(kvm);
+
+ return young;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,38)
+static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ int young, idx;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ spin_lock(&kvm->mmu_lock);
+ young = kvm_test_age_hva(kvm, address);
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return young;
+}
+#endif
+
+static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
+ struct mm_struct *mm)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ int idx;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ kvm_arch_flush_shadow(kvm);
+ srcu_read_unlock(&kvm->srcu, idx);
+}
+
+static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+ .invalidate_page = kvm_mmu_notifier_invalidate_page,
+ .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
+ .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
+ .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,38)
+ .test_young = kvm_mmu_notifier_test_young,
+#endif
+#ifdef MMU_NOTIFIER_HAS_CHANGE_PTE
+ .change_pte = kvm_mmu_notifier_change_pte,
+#endif
+ .release = kvm_mmu_notifier_release,
+};
+
+static int kvm_init_mmu_notifier(struct kvm *kvm)
+{
+ kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
+ return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
+}
+
+#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
+
+static int kvm_init_mmu_notifier(struct kvm *kvm)
+{
+ return 0;
+}
+
+#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
+
+static struct kvm *kvm_create_vm(void)
+{
+ int r, i;
+ struct kvm *kvm = kvm_arch_alloc_vm();
+
+ if (!kvm)
+ return ERR_PTR(-ENOMEM);
+
+ r = kvm_arch_init_vm(kvm);
+ if (r)
+ goto out_err_nodisable;
+
+ r = hardware_enable_all();
+ if (r)
+ goto out_err_nodisable;
+
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
+ INIT_HLIST_HEAD(&kvm->mask_notifier_list);
+ INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
+#endif
+
+ r = -ENOMEM;
+ kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+ if (!kvm->memslots)
+ goto out_err_nosrcu;
+ if (init_srcu_struct(&kvm->srcu))
+ goto out_err_nosrcu;
+ for (i = 0; i < KVM_NR_BUSES; i++) {
+ kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
+ GFP_KERNEL);
+ if (!kvm->buses[i])
+ goto out_err;
+ }
+
+ r = kvm_init_mmu_notifier(kvm);
+ if (r)
+ goto out_err;
+
+ kvm->mm = current->mm;
+ mmget(&kvm->mm->mm_count);
+ spin_lock_init(&kvm->mmu_lock);
+ raw_spin_lock_init(&kvm->requests_lock);
+ kvm_eventfd_init(kvm);
+ mutex_init(&kvm->lock);
+ mutex_init(&kvm->irq_lock);
+ mutex_init(&kvm->slots_lock);
+ atomic_set(&kvm->users_count, 1);
+ spin_lock(&kvm_lock);
+ list_add(&kvm->vm_list, &vm_list);
+ spin_unlock(&kvm_lock);
+
+ return kvm;
+
+out_err:
+ cleanup_srcu_struct(&kvm->srcu);
+out_err_nosrcu:
+ hardware_disable_all();
+out_err_nodisable:
+ for (i = 0; i < KVM_NR_BUSES; i++)
+ kfree(kvm->buses[i]);
+ kfree(kvm->memslots);
+ kvm_arch_free_vm(kvm);
+ return ERR_PTR(r);
+}
+
+static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
+{
+ if (!memslot->dirty_bitmap)
+ return;
+
+ if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE)
+ vfree(memslot->dirty_bitmap_head);
+ else
+ kfree(memslot->dirty_bitmap_head);
+
+ memslot->dirty_bitmap = NULL;
+ memslot->dirty_bitmap_head = NULL;
+}
+
+/*
+ * Free any memory in @free but not in @dont.
+ */
+static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
+ struct kvm_memory_slot *dont)
+{
+ int i;
+
+ if (!dont || free->rmap != dont->rmap)
+ vfree(free->rmap);
+
+ if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
+ kvm_destroy_dirty_bitmap(free);
+
+
+ for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+ if (!dont || free->lpage_info[i] != dont->lpage_info[i]) {
+ vfree(free->lpage_info[i]);
+ free->lpage_info[i] = NULL;
+ }
+ }
+
+ free->npages = 0;
+ free->rmap = NULL;
+}
+
+void kvm_free_physmem(struct kvm *kvm)
+{
+ int i;
+ struct kvm_memslots *slots = kvm->memslots;
+
+ for (i = 0; i < slots->nmemslots; ++i)
+ kvm_free_physmem_slot(&slots->memslots[i], NULL);
+
+ kfree(kvm->memslots);
+}
+
+static void kvm_destroy_vm(struct kvm *kvm)
+{
+ int i;
+ struct mm_struct *mm = kvm->mm;
+
+ kvm_arch_sync_events(kvm);
+ spin_lock(&kvm_lock);
+ list_del(&kvm->vm_list);
+ spin_unlock(&kvm_lock);
+ kvm_free_irq_routing(kvm);
+ for (i = 0; i < KVM_NR_BUSES; i++)
+ kvm_io_bus_destroy(kvm->buses[i]);
+ kvm_coalesced_mmio_free(kvm);
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+ mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+#else
+ kvm_arch_flush_shadow(kvm);
+#endif
+ kvm_arch_destroy_vm(kvm);
+ kvm_free_physmem(kvm);
+ cleanup_srcu_struct(&kvm->srcu);
+ kvm_arch_free_vm(kvm);
+ hardware_disable_all();
+ mmdrop(mm);
+}
+
+void kvm_get_kvm(struct kvm *kvm)
+{
+ atomic_inc(&kvm->users_count);
+}
+EXPORT_SYMBOL_GPL(kvm_get_kvm);
+
+void kvm_put_kvm(struct kvm *kvm)
+{
+ if (atomic_dec_and_test(&kvm->users_count))
+ kvm_destroy_vm(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_put_kvm);
+
+
+static int kvm_vm_release(struct inode *inode, struct file *filp)
+{
+ struct kvm *kvm = filp->private_data;
+
+ kvm_irqfd_release(kvm);
+
+ kvm_put_kvm(kvm);
+ return 0;
+}
+
+/*
+ * Allocation size is twice as large as the actual dirty bitmap size.
+ * This makes it possible to do double buffering: see x86's
+ * kvm_vm_ioctl_get_dirty_log().
+ */
+static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
+{
+ unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
+
+ if (dirty_bytes > PAGE_SIZE)
+ memslot->dirty_bitmap = vzalloc(dirty_bytes);
+ else
+ memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL);
+
+ if (!memslot->dirty_bitmap)
+ return -ENOMEM;
+
+ memslot->dirty_bitmap_head = memslot->dirty_bitmap;
+ return 0;
+}
+
+/*
+ * Allocate some memory and give it an address in the guest physical address
+ * space.
+ *
+ * Discontiguous memory is allowed, mostly for framebuffers.
+ *
+ * Must be called holding mmap_sem for write.
+ */
+int __kvm_set_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ int user_alloc)
+{
+ int r, flush_shadow = 0;
+ gfn_t base_gfn;
+ unsigned long npages;
+ unsigned long i;
+ struct kvm_memory_slot *memslot;
+ struct kvm_memory_slot old, new;
+ struct kvm_memslots *slots, *old_memslots;
+
+ r = -EINVAL;
+ /* General sanity checks */
+ if (mem->memory_size & (PAGE_SIZE - 1))
+ goto out;
+ if (mem->guest_phys_addr & (PAGE_SIZE - 1))
+ goto out;
+ if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1)))
+ goto out;
+ if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
+ goto out;
+ if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
+ goto out;
+
+ memslot = &kvm->memslots->memslots[mem->slot];
+ base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
+ npages = mem->memory_size >> PAGE_SHIFT;
+
+ r = -EINVAL;
+ if (npages > KVM_MEM_MAX_NR_PAGES)
+ goto out;
+
+ if (!npages)
+ mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
+
+ new = old = *memslot;
+
+ new.id = mem->slot;
+ new.base_gfn = base_gfn;
+ new.npages = npages;
+ new.flags = mem->flags;
+
+ /* Disallow changing a memory slot's size. */
+ r = -EINVAL;
+ if (npages && old.npages && npages != old.npages)
+ goto out_free;
+
+ /* Check for overlaps */
+ r = -EEXIST;
+ for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
+ struct kvm_memory_slot *s = &kvm->memslots->memslots[i];
+
+ if (s == memslot || !s->npages)
+ continue;
+ if (!((base_gfn + npages <= s->base_gfn) ||
+ (base_gfn >= s->base_gfn + s->npages)))
+ goto out_free;
+ }
+
+ /* Free page dirty bitmap if unneeded */
+ if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
+ new.dirty_bitmap = NULL;
+
+ r = -ENOMEM;
+
+ /* Allocate if a slot is being created */
+#ifndef CONFIG_S390
+ if (npages && !new.rmap) {
+ new.rmap = vzalloc(npages * sizeof(*new.rmap));
+
+ if (!new.rmap)
+ goto out_free;
+
+ new.user_alloc = user_alloc;
+ new.userspace_addr = mem->userspace_addr;
+ }
+ if (!npages)
+ goto skip_lpage;
+
+ for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+ unsigned long ugfn;
+ unsigned long j;
+ int lpages;
+ int level = i + 2;
+
+ /* Avoid unused variable warning if no large pages */
+ (void)level;
+
+ if (new.lpage_info[i])
+ continue;
+
+ lpages = 1 + ((base_gfn + npages - 1)
+ >> KVM_HPAGE_GFN_SHIFT(level));
+ lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
+
+ new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i]));
+
+ if (!new.lpage_info[i])
+ goto out_free;
+
+ if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
+ new.lpage_info[i][0].write_count = 1;
+ if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
+ new.lpage_info[i][lpages - 1].write_count = 1;
+ ugfn = new.userspace_addr >> PAGE_SHIFT;
+ /*
+ * If the gfn and userspace address are not aligned wrt each
+ * other, or if explicitly asked to, disable large page
+ * support for this slot
+ */
+ if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
+ !largepages_enabled)
+ for (j = 0; j < lpages; ++j)
+ new.lpage_info[i][j].write_count = 1;
+ }
+
+skip_lpage:
+
+ /* Allocate page dirty bitmap if needed */
+ if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
+ if (kvm_create_dirty_bitmap(&new) < 0)
+ goto out_free;
+ /* destroy any largepage mappings for dirty tracking */
+ if (old.npages)
+ flush_shadow = 1;
+ }
+#else /* not defined CONFIG_S390 */
+ new.user_alloc = user_alloc;
+ if (user_alloc)
+ new.userspace_addr = mem->userspace_addr;
+#endif /* not defined CONFIG_S390 */
+
+ if (!npages) {
+ r = -ENOMEM;
+ slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+ if (!slots)
+ goto out_free;
+ memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
+ if (mem->slot >= slots->nmemslots)
+ slots->nmemslots = mem->slot + 1;
+ slots->generation++;
+ slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;
+
+ old_memslots = kvm->memslots;
+ rcu_assign_pointer(kvm->memslots, slots);
+ kvm_synchronize_srcu_expedited(&kvm->srcu);
+ /* From this point no new shadow pages pointing to a deleted
+ * memslot will be created.
+ *
+ * validation of sp->gfn happens in:
+ * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
+ * - kvm_is_visible_gfn (mmu_check_roots)
+ */
+ kvm_arch_flush_shadow(kvm);
+ kfree(old_memslots);
+ }
+
+ r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc);
+ if (r)
+ goto out_free;
+
+ /* map the pages in iommu page table */
+ if (npages) {
+ r = kvm_iommu_map_pages(kvm, &new);
+ if (r)
+ goto out_free;
+ }
+
+ r = -ENOMEM;
+ slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+ if (!slots)
+ goto out_free;
+ memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
+ if (mem->slot >= slots->nmemslots)
+ slots->nmemslots = mem->slot + 1;
+ slots->generation++;
+
+ /* actual memory is freed via old in kvm_free_physmem_slot below */
+ if (!npages) {
+ new.rmap = NULL;
+ new.dirty_bitmap = NULL;
+ for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i)
+ new.lpage_info[i] = NULL;
+ }
+
+ slots->memslots[mem->slot] = new;
+ old_memslots = kvm->memslots;
+ rcu_assign_pointer(kvm->memslots, slots);
+ kvm_synchronize_srcu_expedited(&kvm->srcu);
+
+ kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
+
+ kvm_free_physmem_slot(&old, &new);
+ kfree(old_memslots);
+
+ if (flush_shadow)
+ kvm_arch_flush_shadow(kvm);
+
+ return 0;
+
+out_free:
+ kvm_free_physmem_slot(&new, &old);
+out:
+ return r;
+
+}
+EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
+
+int kvm_set_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ int user_alloc)
+{
+ int r;
+
+ mutex_lock(&kvm->slots_lock);
+ r = __kvm_set_memory_region(kvm, mem, user_alloc);
+ mutex_unlock(&kvm->slots_lock);
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_set_memory_region);
+
+int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
+ struct
+ kvm_userspace_memory_region *mem,
+ int user_alloc)
+{
+ if (mem->slot >= KVM_MEMORY_SLOTS)
+ return -EINVAL;
+ return kvm_set_memory_region(kvm, mem, user_alloc);
+}
+
+int kvm_get_dirty_log(struct kvm *kvm,
+ struct kvm_dirty_log *log, int *is_dirty)
+{
+ struct kvm_memory_slot *memslot;
+ int r, i;
+ unsigned long n;
+ unsigned long any = 0;
+
+ r = -EINVAL;
+ if (log->slot >= KVM_MEMORY_SLOTS)
+ goto out;
+
+ memslot = &kvm->memslots->memslots[log->slot];
+ r = -ENOENT;
+ if (!memslot->dirty_bitmap)
+ goto out;
+
+ n = kvm_dirty_bitmap_bytes(memslot);
+
+ for (i = 0; !any && i < n/sizeof(long); ++i)
+ any = memslot->dirty_bitmap[i];
+
+ r = -EFAULT;
+ if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
+ goto out;
+
+ if (any)
+ *is_dirty = 1;
+
+ r = 0;
+out:
+ return r;
+}
+
+void kvm_disable_largepages(void)
+{
+ largepages_enabled = false;
+}
+EXPORT_SYMBOL_GPL(kvm_disable_largepages);
+
+int is_error_page(struct page *page)
+{
+ return page == bad_page || page == hwpoison_page || page == fault_page;
+}
+EXPORT_SYMBOL_GPL(is_error_page);
+
+int is_error_pfn(pfn_t pfn)
+{
+ return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
+}
+EXPORT_SYMBOL_GPL(is_error_pfn);
+
+int is_hwpoison_pfn(pfn_t pfn)
+{
+ return pfn == hwpoison_pfn;
+}
+EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
+
+int is_fault_pfn(pfn_t pfn)
+{
+ return pfn == fault_pfn;
+}
+EXPORT_SYMBOL_GPL(is_fault_pfn);
+
+static inline unsigned long bad_hva(void)
+{
+ return PAGE_OFFSET;
+}
+
+int kvm_is_error_hva(unsigned long addr)
+{
+ return addr == bad_hva();
+}
+EXPORT_SYMBOL_GPL(kvm_is_error_hva);
+
+static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots,
+ gfn_t gfn)
+{
+ int i;
+
+ for (i = 0; i < slots->nmemslots; ++i) {
+ struct kvm_memory_slot *memslot = &slots->memslots[i];
+
+ if (gfn >= memslot->base_gfn
+ && gfn < memslot->base_gfn + memslot->npages)
+ return memslot;
+ }
+ return NULL;
+}
+
+struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
+{
+ return __gfn_to_memslot(kvm_memslots(kvm), gfn);
+}
+EXPORT_SYMBOL_GPL(gfn_to_memslot);
+
+int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
+{
+ int i;
+ struct kvm_memslots *slots = kvm_memslots(kvm);
+
+ for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
+ struct kvm_memory_slot *memslot = &slots->memslots[i];
+
+ if (memslot->flags & KVM_MEMSLOT_INVALID)
+ continue;
+
+ if (gfn >= memslot->base_gfn
+ && gfn < memslot->base_gfn + memslot->npages)
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
+
+unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
+{
+ struct vm_area_struct *vma;
+ unsigned long addr, size;
+
+ size = PAGE_SIZE;
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return PAGE_SIZE;
+
+ down_read(&current->mm->mmap_sem);
+ vma = find_vma(current->mm, addr);
+ if (!vma)
+ goto out;
+
+ size = kvm_vma_kernel_pagesize(vma);
+
+out:
+ up_read(&current->mm->mmap_sem);
+
+ return size;
+}
+
+int memslot_id(struct kvm *kvm, gfn_t gfn)
+{
+ int i;
+ struct kvm_memslots *slots = kvm_memslots(kvm);
+ struct kvm_memory_slot *memslot = NULL;
+
+ for (i = 0; i < slots->nmemslots; ++i) {
+ memslot = &slots->memslots[i];
+
+ if (gfn >= memslot->base_gfn
+ && gfn < memslot->base_gfn + memslot->npages)
+ break;
+ }
+
+ return memslot - slots->memslots;
+}
+
+static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
+ gfn_t *nr_pages)
+{
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
+ return bad_hva();
+
+ if (nr_pages)
+ *nr_pages = slot->npages - (gfn - slot->base_gfn);
+
+ return gfn_to_hva_memslot(slot, gfn);
+}
+
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+{
+ return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
+}
+EXPORT_SYMBOL_GPL(gfn_to_hva);
+
+static pfn_t get_fault_pfn(void)
+{
+ get_page(fault_page);
+ return fault_pfn;
+}
+
+static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
+ bool *async, bool write_fault, bool *writable)
+{
+ struct page *page[1];
+ int npages = 0;
+ pfn_t pfn;
+
+ /* we can do it either atomically or asynchronously, not both */
+ BUG_ON(atomic && async);
+
+ BUG_ON(!write_fault && !writable);
+
+ if (writable)
+ *writable = true;
+
+ if (atomic || async)
+ npages = kvm___get_user_pages_fast(addr, 1, 1, page);
+
+ if (unlikely(npages != 1) && !atomic) {
+ might_sleep();
+
+ if (writable)
+ *writable = write_fault;
+
+ npages = get_user_pages_fast(addr, 1, write_fault, page);
+
+ /* map read fault as writable if possible */
+ if (unlikely(!write_fault) && npages == 1) {
+ struct page *wpage[1];
+
+ npages = kvm___get_user_pages_fast(addr, 1, 1, wpage);
+ if (npages == 1) {
+ *writable = true;
+ put_page(page[0]);
+ page[0] = wpage[0];
+ }
+ npages = 1;
+ }
+ }
+
+ if (unlikely(npages != 1)) {
+ struct vm_area_struct *vma;
+
+ if (atomic)
+ return get_fault_pfn();
+
+ down_read(&current->mm->mmap_sem);
+ if (is_hwpoison_address(addr)) {
+ up_read(&current->mm->mmap_sem);
+ get_page(hwpoison_page);
+ return page_to_pfn(hwpoison_page);
+ }
+
+ vma = find_vma_intersection(current->mm, addr, addr+1);
+
+ if (vma == NULL)
+ pfn = get_fault_pfn();
+ else if ((vma->vm_flags & VM_PFNMAP)) {
+ pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+ vma->vm_pgoff;
+ BUG_ON(!kvm_is_mmio_pfn(pfn));
+ } else {
+ if (async && (vma->vm_flags & VM_WRITE))
+ *async = true;
+ pfn = get_fault_pfn();
+ }
+ up_read(&current->mm->mmap_sem);
+ } else
+ pfn = page_to_pfn(page[0]);
+
+ return pfn;
+}
+
+pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
+{
+ return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
+}
+EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
+
+static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
+ bool write_fault, bool *writable)
+{
+ unsigned long addr;
+
+ if (async)
+ *async = false;
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr)) {
+ get_page(bad_page);
+ return page_to_pfn(bad_page);
+ }
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
+ async = NULL;
+#endif
+ return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
+}
+
+pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
+{
+ return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
+
+pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
+ bool write_fault, bool *writable)
+{
+ return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
+
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+{
+ return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn);
+
+pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
+ bool *writable)
+{
+ return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
+
+pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ unsigned long addr = gfn_to_hva_memslot(slot, gfn);
+ return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
+}
+
+int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
+ int nr_pages)
+{
+ unsigned long addr;
+ gfn_t entry;
+
+ addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry);
+ if (kvm_is_error_hva(addr))
+ return -1;
+
+ if (entry < nr_pages)
+ return 0;
+
+ return kvm___get_user_pages_fast(addr, nr_pages, 1, pages);
+}
+EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
+
+struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
+{
+ pfn_t pfn;
+
+ pfn = gfn_to_pfn(kvm, gfn);
+ if (!kvm_is_mmio_pfn(pfn))
+ return pfn_to_page(pfn);
+
+ WARN_ON(kvm_is_mmio_pfn(pfn));
+
+ get_page(bad_page);
+ return bad_page;
+}
+
+EXPORT_SYMBOL_GPL(gfn_to_page);
+
+void kvm_release_page_clean(struct page *page)
+{
+ kvm_release_pfn_clean(page_to_pfn(page));
+}
+EXPORT_SYMBOL_GPL(kvm_release_page_clean);
+
+void kvm_release_pfn_clean(pfn_t pfn)
+{
+ if (!kvm_is_mmio_pfn(pfn))
+ put_page(pfn_to_page(pfn));
+}
+EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
+
+void kvm_release_page_dirty(struct page *page)
+{
+ kvm_release_pfn_dirty(page_to_pfn(page));
+}
+EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
+
+void kvm_release_pfn_dirty(pfn_t pfn)
+{
+ kvm_set_pfn_dirty(pfn);
+ kvm_release_pfn_clean(pfn);
+}
+EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
+
+void kvm_set_page_dirty(struct page *page)
+{
+ kvm_set_pfn_dirty(page_to_pfn(page));
+}
+EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
+
+void kvm_set_pfn_dirty(pfn_t pfn)
+{
+ if (!kvm_is_mmio_pfn(pfn)) {
+ struct page *page = pfn_to_page(pfn);
+ if (!PageReserved(page))
+ SetPageDirty(page);
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
+
+void kvm_set_pfn_accessed(pfn_t pfn)
+{
+ if (!kvm_is_mmio_pfn(pfn))
+ mark_page_accessed(pfn_to_page(pfn));
+}
+EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
+
+void kvm_get_pfn(pfn_t pfn)
+{
+ if (!kvm_is_mmio_pfn(pfn))
+ get_page(pfn_to_page(pfn));
+}
+EXPORT_SYMBOL_GPL(kvm_get_pfn);
+
+static int next_segment(unsigned long len, int offset)
+{
+ if (len > PAGE_SIZE - offset)
+ return PAGE_SIZE - offset;
+ else
+ return len;
+}
+
+int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
+ int len)
+{
+ int r;
+ unsigned long addr;
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return -EFAULT;
+ r = copy_from_user(data, (void *)addr + offset, len);
+ if (r)
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_page);
+
+int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
+{
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ int seg;
+ int offset = offset_in_page(gpa);
+ int ret;
+
+ while ((seg = next_segment(len, offset)) != 0) {
+ ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
+ if (ret < 0)
+ return ret;
+ offset = 0;
+ len -= seg;
+ data += seg;
+ ++gfn;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest);
+
+int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
+ unsigned long len)
+{
+ int r;
+ unsigned long addr;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ int offset = offset_in_page(gpa);
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return -EFAULT;
+ pagefault_disable();
+ r = __copy_from_user_inatomic(data, (void *)addr + offset, len);
+ pagefault_enable();
+ if (r)
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL(kvm_read_guest_atomic);
+
+int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
+ int offset, int len)
+{
+ int r;
+ unsigned long addr;
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return -EFAULT;
+ r = copy_to_user((void *)addr + offset, data, len);
+ if (r)
+ return -EFAULT;
+ mark_page_dirty(kvm, gfn);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_write_guest_page);
+
+int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
+ unsigned long len)
+{
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ int seg;
+ int offset = offset_in_page(gpa);
+ int ret;
+
+ while ((seg = next_segment(len, offset)) != 0) {
+ ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
+ if (ret < 0)
+ return ret;
+ offset = 0;
+ len -= seg;
+ data += seg;
+ ++gfn;
+ }
+ return 0;
+}
+
+int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+ gpa_t gpa)
+{
+ struct kvm_memslots *slots = kvm_memslots(kvm);
+ int offset = offset_in_page(gpa);
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+
+ ghc->gpa = gpa;
+ ghc->generation = slots->generation;
+ ghc->memslot = __gfn_to_memslot(slots, gfn);
+ ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
+ if (!kvm_is_error_hva(ghc->hva))
+ ghc->hva += offset;
+ else
+ return -EFAULT;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
+
+int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+ void *data, unsigned long len)
+{
+ struct kvm_memslots *slots = kvm_memslots(kvm);
+ int r;
+
+ if (slots->generation != ghc->generation)
+ kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
+
+ if (kvm_is_error_hva(ghc->hva))
+ return -EFAULT;
+
+ r = copy_to_user((void *)ghc->hva, data, len);
+ if (r)
+ return -EFAULT;
+ mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
+
+int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
+{
+ return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
+ offset, len);
+}
+EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
+
+int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
+{
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ int seg;
+ int offset = offset_in_page(gpa);
+ int ret;
+
+ while ((seg = next_segment(len, offset)) != 0) {
+ ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
+ if (ret < 0)
+ return ret;
+ offset = 0;
+ len -= seg;
+ ++gfn;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_clear_guest);
+
+void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ gfn_t gfn)
+{
+ if (memslot && memslot->dirty_bitmap) {
+ unsigned long rel_gfn = gfn - memslot->base_gfn;
+
+ generic___set_le_bit(rel_gfn, memslot->dirty_bitmap);
+ }
+}
+
+void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *memslot;
+
+ memslot = gfn_to_memslot(kvm, gfn);
+ mark_page_dirty_in_slot(kvm, memslot, gfn);
+}
+
+/*
+ * The vCPU has executed a HLT instruction with in-kernel mode enabled.
+ */
+void kvm_vcpu_block(struct kvm_vcpu *vcpu)
+{
+ DEFINE_WAIT(wait);
+
+ for (;;) {
+ prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
+
+ if (kvm_arch_vcpu_runnable(vcpu)) {
+ kvm_make_request(KVM_REQ_UNHALT, vcpu);
+ break;
+ }
+ if (kvm_cpu_has_pending_timer(vcpu))
+ break;
+ if (signal_pending(current))
+ break;
+
+ schedule();
+ }
+
+ finish_wait(&vcpu->wq, &wait);
+}
+
+void kvm_resched(struct kvm_vcpu *vcpu)
+{
+ if (!need_resched())
+ return;
+ cond_resched();
+}
+EXPORT_SYMBOL_GPL(kvm_resched);
+
+void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu)
+{
+ ktime_t expires;
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
+
+ /* Sleep for 100 us, and hope lock-holder got scheduled */
+ expires = ktime_add_ns(ktime_get(), 100000UL);
+ schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+
+ finish_wait(&vcpu->wq, &wait);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
+
+static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct kvm_vcpu *vcpu = vma->vm_file->private_data;
+ struct page *page;
+
+ if (vmf->pgoff == 0)
+ page = virt_to_page(vcpu->run);
+#ifdef CONFIG_X86
+ else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
+ page = virt_to_page(vcpu->arch.pio_data);
+#endif
+#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+ else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
+ page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
+#endif
+ else
+ return VM_FAULT_SIGBUS;
+ get_page(page);
+ vmf->page = page;
+ return 0;
+}
+
+static struct vm_operations_struct kvm_vcpu_vm_ops = {
+ .fault = kvm_vcpu_fault,
+};
+
+static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ vma->vm_ops = &kvm_vcpu_vm_ops;
+ return 0;
+}
+
+static int kvm_vcpu_release(struct inode *inode, struct file *filp)
+{
+ struct kvm_vcpu *vcpu = filp->private_data;
+
+ kvm_put_kvm(vcpu->kvm);
+ return 0;
+}
+
+static struct file_operations kvm_vcpu_fops = {
+ .release = kvm_vcpu_release,
+ .unlocked_ioctl = kvm_vcpu_ioctl,
+ .compat_ioctl = kvm_vcpu_ioctl,
+ .mmap = kvm_vcpu_mmap,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35)
+ .llseek = noop_llseek,
+#endif
+};
+
+/*
+ * Allocates an inode for the vcpu.
+ */
+static int create_vcpu_fd(struct kvm_vcpu *vcpu)
+{
+ return kvm_anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
+}
+
+/*
+ * Creates some virtual cpus. Good luck creating more than one.
+ */
+static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
+{
+ int r;
+ struct kvm_vcpu *vcpu, *v;
+
+ vcpu = kvm_arch_vcpu_create(kvm, id);
+ if (IS_ERR(vcpu))
+ return PTR_ERR(vcpu);
+
+ preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
+
+ r = kvm_arch_vcpu_setup(vcpu);
+ if (r)
+ return r;
+
+ mutex_lock(&kvm->lock);
+ if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
+ r = -EINVAL;
+ goto vcpu_destroy;
+ }
+
+ kvm_for_each_vcpu(r, v, kvm)
+ if (v->vcpu_id == id) {
+ r = -EEXIST;
+ goto vcpu_destroy;
+ }
+
+ BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
+
+ /* Now it's all set up, let userspace reach it */
+ kvm_get_kvm(kvm);
+ r = create_vcpu_fd(vcpu);
+ if (r < 0) {
+ kvm_put_kvm(kvm);
+ goto vcpu_destroy;
+ }
+
+ kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
+ smp_wmb();
+ atomic_inc(&kvm->online_vcpus);
+
+#ifdef CONFIG_KVM_APIC_ARCHITECTURE
+ if (kvm->bsp_vcpu_id == id)
+ kvm->bsp_vcpu = vcpu;
+#endif
+ mutex_unlock(&kvm->lock);
+ return r;
+
+vcpu_destroy:
+ mutex_unlock(&kvm->lock);
+ kvm_arch_vcpu_destroy(vcpu);
+ return r;
+}
+
+static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
+{
+ if (sigset) {
+ sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
+ vcpu->sigset_active = 1;
+ vcpu->sigset = *sigset;
+ } else
+ vcpu->sigset_active = 0;
+ return 0;
+}
+
+static long kvm_vcpu_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ struct kvm_vcpu *vcpu = filp->private_data;
+ void *argp = (void *)arg;
+ int r;
+ struct kvm_fpu *fpu = NULL;
+ struct kvm_sregs *kvm_sregs = NULL;
+
+ if (vcpu->kvm->mm != current->mm)
+ return -EIO;
+
+#if defined(CONFIG_S390) || defined(CONFIG_PPC)
+ /*
+ * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
+ * so vcpu_load() would break it.
+ */
+ if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
+ return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
+#endif
+
+
+ vcpu_load(vcpu);
+ switch (ioctl) {
+ case KVM_RUN:
+ r = -EINVAL;
+ if (arg)
+ goto out;
+ r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
+ trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
+ break;
+ case KVM_GET_REGS: {
+ struct kvm_regs *kvm_regs;
+
+ r = -ENOMEM;
+ kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
+ if (!kvm_regs)
+ goto out;
+ r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
+ if (r)
+ goto out_free1;
+ r = -EFAULT;
+ if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
+ goto out_free1;
+ r = 0;
+out_free1:
+ kfree(kvm_regs);
+ break;
+ }
+ case KVM_SET_REGS: {
+ struct kvm_regs *kvm_regs;
+
+ r = -ENOMEM;
+ kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
+ if (!kvm_regs)
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs)))
+ goto out_free2;
+ r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
+ if (r)
+ goto out_free2;
+ r = 0;
+out_free2:
+ kfree(kvm_regs);
+ break;
+ }
+ case KVM_GET_SREGS: {
+ kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!kvm_sregs)
+ goto out;
+ r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_SREGS: {
+ kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!kvm_sregs)
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs)))
+ goto out;
+ r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_GET_MP_STATE: {
+ struct kvm_mp_state mp_state;
+
+ r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &mp_state, sizeof mp_state))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_MP_STATE: {
+ struct kvm_mp_state mp_state;
+
+ r = -EFAULT;
+ if (copy_from_user(&mp_state, argp, sizeof mp_state))
+ goto out;
+ r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_TRANSLATE: {
+ struct kvm_translation tr;
+
+ r = -EFAULT;
+ if (copy_from_user(&tr, argp, sizeof tr))
+ goto out;
+ r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &tr, sizeof tr))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_GUEST_DEBUG: {
+ struct kvm_guest_debug dbg;
+
+ r = -EFAULT;
+ if (copy_from_user(&dbg, argp, sizeof dbg))
+ goto out;
+ r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_SIGNAL_MASK: {
+ struct kvm_signal_mask *sigmask_arg = argp;
+ struct kvm_signal_mask kvm_sigmask;
+ sigset_t sigset, *p;
+
+ p = NULL;
+ if (argp) {
+ r = -EFAULT;
+ if (copy_from_user(&kvm_sigmask, argp,
+ sizeof kvm_sigmask))
+ goto out;
+ r = -EINVAL;
+ if (kvm_sigmask.len != sizeof sigset)
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(&sigset, sigmask_arg->sigset,
+ sizeof sigset))
+ goto out;
+ p = &sigset;
+ }
+ r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
+ break;
+ }
+ case KVM_GET_FPU: {
+ fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!fpu)
+ goto out;
+ r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_FPU: {
+ fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!fpu)
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu)))
+ goto out;
+ r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
+ default:
+ r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
+ }
+out:
+ vcpu_put(vcpu);
+ kfree(fpu);
+ kfree(kvm_sregs);
+ return r;
+}
+
+static long kvm_vm_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ struct kvm *kvm = filp->private_data;
+ void *argp = (void *)arg;
+ int r;
+
+ if (kvm->mm != current->mm)
+ return -EIO;
+ switch (ioctl) {
+ case KVM_CREATE_VCPU:
+ r = kvm_vm_ioctl_create_vcpu(kvm, arg);
+ if (r < 0)
+ goto out;
+ break;
+ case KVM_SET_USER_MEMORY_REGION: {
+ struct kvm_userspace_memory_region kvm_userspace_mem;
+
+ r = -EFAULT;
+ if (copy_from_user(&kvm_userspace_mem, argp,
+ sizeof kvm_userspace_mem))
+ goto out;
+
+ r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_GET_DIRTY_LOG: {
+ struct kvm_dirty_log log;
+
+ r = -EFAULT;
+ if (copy_from_user(&log, argp, sizeof log))
+ goto out;
+ r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
+ if (r)
+ goto out;
+ break;
+ }
+#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+ case KVM_REGISTER_COALESCED_MMIO: {
+ struct kvm_coalesced_mmio_zone zone;
+ r = -EFAULT;
+ if (copy_from_user(&zone, argp, sizeof zone))
+ goto out;
+ r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_UNREGISTER_COALESCED_MMIO: {
+ struct kvm_coalesced_mmio_zone zone;
+ r = -EFAULT;
+ if (copy_from_user(&zone, argp, sizeof zone))
+ goto out;
+ r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
+#endif
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,33)
+ case KVM_IRQFD: {
+ struct kvm_irqfd data;
+
+ r = -EFAULT;
+ if (copy_from_user(&data, argp, sizeof data))
+ goto out;
+ r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
+ break;
+ }
+#endif
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,33)
+ case KVM_IOEVENTFD: {
+ struct kvm_ioeventfd data;
+
+ r = -EFAULT;
+ if (copy_from_user(&data, argp, sizeof data))
+ goto out;
+ r = kvm_ioeventfd(kvm, &data);
+ break;
+ }
+#endif
+#ifdef CONFIG_KVM_APIC_ARCHITECTURE
+ case KVM_SET_BOOT_CPU_ID:
+ r = 0;
+ mutex_lock(&kvm->lock);
+ if (atomic_read(&kvm->online_vcpus) != 0)
+ r = -EBUSY;
+ else
+ kvm->bsp_vcpu_id = arg;
+ mutex_unlock(&kvm->lock);
+ break;
+#endif
+ default:
+ r = kvm_arch_vm_ioctl(filp, ioctl, arg);
+ if (r == -ENOTTY)
+ r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
+ }
+out:
+ return r;
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_kvm_dirty_log {
+ __u32 slot;
+ __u32 padding1;
+ union {
+ compat_uptr_t dirty_bitmap; /* one bit per page */
+ __u64 padding2;
+ };
+};
+
+static long kvm_vm_compat_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ struct kvm *kvm = filp->private_data;
+ int r;
+
+ if (kvm->mm != current->mm)
+ return -EIO;
+ switch (ioctl) {
+ case KVM_GET_DIRTY_LOG: {
+ struct compat_kvm_dirty_log compat_log;
+ struct kvm_dirty_log log;
+
+ r = -EFAULT;
+ if (copy_from_user(&compat_log, (void *)arg,
+ sizeof(compat_log)))
+ goto out;
+ log.slot = compat_log.slot;
+ log.padding1 = compat_log.padding1;
+ log.padding2 = compat_log.padding2;
+ log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
+
+ r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
+ if (r)
+ goto out;
+ break;
+ }
+ default:
+ r = kvm_vm_ioctl(filp, ioctl, arg);
+ }
+
+out:
+ return r;
+}
+#endif
+
+static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct page *page[1];
+ unsigned long addr;
+ int npages;
+ gfn_t gfn = vmf->pgoff;
+ struct kvm *kvm = vma->vm_file->private_data;
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return VM_FAULT_SIGBUS;
+
+ npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
+ NULL);
+ if (unlikely(npages != 1))
+ return VM_FAULT_SIGBUS;
+
+ vmf->page = page[0];
+ return 0;
+}
+
+static struct vm_operations_struct kvm_vm_vm_ops = {
+ .fault = kvm_vm_fault,
+};
+
+static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ vma->vm_ops = &kvm_vm_vm_ops;
+ return 0;
+}
+
+static struct file_operations kvm_vm_fops = {
+ .release = kvm_vm_release,
+ .unlocked_ioctl = kvm_vm_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = kvm_vm_compat_ioctl,
+#endif
+ .mmap = kvm_vm_mmap,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35)
+ .llseek = noop_llseek,
+#endif
+};
+
+static int kvm_dev_ioctl_create_vm(void)
+{
+ int r;
+ struct kvm *kvm;
+
+ kvm = kvm_create_vm();
+ if (IS_ERR(kvm))
+ return PTR_ERR(kvm);
+#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+ r = kvm_coalesced_mmio_init(kvm);
+ if (r < 0) {
+ kvm_put_kvm(kvm);
+ return r;
+ }
+#endif
+ r = kvm_anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
+ if (r < 0)
+ kvm_put_kvm(kvm);
+
+ return r;
+}
+
+static long kvm_dev_ioctl_check_extension_generic(long arg)
+{
+ switch (arg) {
+ case KVM_CAP_USER_MEMORY:
+ case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
+ case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
+#ifdef CONFIG_KVM_APIC_ARCHITECTURE
+ case KVM_CAP_SET_BOOT_CPU_ID:
+#endif
+ case KVM_CAP_INTERNAL_ERROR_DATA:
+ return 1;
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
+ case KVM_CAP_IRQ_ROUTING:
+ return KVM_MAX_IRQ_ROUTES;
+#endif
+ default:
+ break;
+ }
+ return kvm_dev_ioctl_check_extension(arg);
+}
+
+static long kvm_dev_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ long r = -EINVAL;
+
+ switch (ioctl) {
+ case KVM_GET_API_VERSION:
+ r = -EINVAL;
+ if (arg)
+ goto out;
+ r = KVM_API_VERSION;
+ break;
+ case KVM_CREATE_VM:
+ r = -EINVAL;
+ if (arg)
+ goto out;
+ r = kvm_dev_ioctl_create_vm();
+ break;
+ case KVM_CHECK_EXTENSION:
+ r = kvm_dev_ioctl_check_extension_generic(arg);
+ break;
+ case KVM_GET_VCPU_MMAP_SIZE:
+ r = -EINVAL;
+ if (arg)
+ goto out;
+ r = PAGE_SIZE; /* struct kvm_run */
+#ifdef CONFIG_X86
+ r += PAGE_SIZE; /* pio data page */
+#endif
+#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+ r += PAGE_SIZE; /* coalesced mmio ring page */
+#endif
+ break;
+ case KVM_TRACE_ENABLE:
+ case KVM_TRACE_PAUSE:
+ case KVM_TRACE_DISABLE:
+ r = -EOPNOTSUPP;
+ break;
+ default:
+ return kvm_arch_dev_ioctl(filp, ioctl, arg);
+ }
+out:
+ return r;
+}
+
+static struct file_operations kvm_chardev_ops = {
+ .unlocked_ioctl = kvm_dev_ioctl,
+ .compat_ioctl = kvm_dev_ioctl,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35)
+ .llseek = noop_llseek,
+#endif
+};
+
+static struct miscdevice kvm_dev = {
+ KVM_MINOR,
+ "kvm",
+ &kvm_chardev_ops,
+};
+
+static void hardware_enable_nolock(void *junk)
+{
+ int cpu = raw_smp_processor_id();
+ int r;
+
+ if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
+ return;
+
+ cpumask_set_cpu(cpu, cpus_hardware_enabled);
+
+ r = kvm_arch_hardware_enable(NULL);
+
+ if (r) {
+ cpumask_clear_cpu(cpu, cpus_hardware_enabled);
+ atomic_inc(&hardware_enable_failed);
+ printk(KERN_INFO "kvm: enabling virtualization on "
+ "CPU%d failed\n", cpu);
+ }
+}
+
+static void hardware_enable(void *junk)
+{
+ spin_lock(&kvm_lock);
+ hardware_enable_nolock(junk);
+ spin_unlock(&kvm_lock);
+}
+
+static void hardware_disable_nolock(void *junk)
+{
+ int cpu = raw_smp_processor_id();
+
+ if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
+ return;
+ cpumask_clear_cpu(cpu, cpus_hardware_enabled);
+ kvm_arch_hardware_disable(NULL);
+}
+
+static void hardware_disable(void *junk)
+{
+ spin_lock(&kvm_lock);
+ hardware_disable_nolock(junk);
+ spin_unlock(&kvm_lock);
+}
+
+static void hardware_disable_all_nolock(void)
+{
+ BUG_ON(!kvm_usage_count);
+
+ kvm_usage_count--;
+ if (!kvm_usage_count)
+ kvm_on_each_cpu(hardware_disable_nolock, NULL, 1);
+}
+
+static void hardware_disable_all(void)
+{
+ spin_lock(&kvm_lock);
+ hardware_disable_all_nolock();
+ spin_unlock(&kvm_lock);
+}
+
+static int hardware_enable_all(void)
+{
+ int r = 0;
+
+ spin_lock(&kvm_lock);
+
+ kvm_usage_count++;
+ if (kvm_usage_count == 1) {
+ atomic_set(&hardware_enable_failed, 0);
+ kvm_on_each_cpu(hardware_enable_nolock, NULL, 1);
+
+ if (atomic_read(&hardware_enable_failed)) {
+ hardware_disable_all_nolock();
+ r = -EBUSY;
+ }
+ }
+
+ spin_unlock(&kvm_lock);
+
+ return r;
+}
+
+static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
+ void *v)
+{
+ int cpu = (long)v;
+
+ if (!kvm_usage_count)
+ return NOTIFY_OK;
+
+ val &= ~CPU_TASKS_FROZEN;
+ switch (val) {
+ case CPU_DYING:
+ printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
+ cpu);
+ hardware_disable(NULL);
+ break;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,28)
+ case CPU_STARTING:
+#else
+ case CPU_ONLINE:
+#endif
+ printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
+ cpu);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,28)
+ hardware_enable(NULL);
+#else
+ smp_call_function_single(cpu, hardware_enable, NULL, 1);
+#endif
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+
+asmlinkage void kvm_spurious_fault(void)
+{
+ /* Fault while not rebooting. We want the trace. */
+ BUG();
+}
+EXPORT_SYMBOL_GPL(kvm_spurious_fault);
+
+static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
+ void *v)
+{
+ /*
+ * Some (well, at least mine) BIOSes hang on reboot if
+ * in vmx root mode.
+ *
+ * And Intel TXT required VMX off for all cpu when system shutdown.
+ */
+ printk(KERN_INFO "kvm: exiting hardware virtualization\n");
+ kvm_rebooting = true;
+ kvm_on_each_cpu(hardware_disable_nolock, NULL, 1);
+ return NOTIFY_OK;
+}
+
+static struct notifier_block kvm_reboot_notifier = {
+ .notifier_call = kvm_reboot,
+ .priority = 0,
+};
+
+static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
+{
+ int i;
+
+ for (i = 0; i < bus->dev_count; i++) {
+ struct kvm_io_device *pos = bus->devs[i];
+
+ kvm_iodevice_destructor(pos);
+ }
+ kfree(bus);
+}
+
+/* kvm_io_bus_write - called under kvm->slots_lock */
+int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+ int len, const void *val)
+{
+ int i;
+ struct kvm_io_bus *bus;
+
+ bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+ for (i = 0; i < bus->dev_count; i++)
+ if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
+ return 0;
+ return -EOPNOTSUPP;
+}
+
+/* kvm_io_bus_read - called under kvm->slots_lock */
+int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+ int len, void *val)
+{
+ int i;
+ struct kvm_io_bus *bus;
+
+ bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+ for (i = 0; i < bus->dev_count; i++)
+ if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
+ return 0;
+ return -EOPNOTSUPP;
+}
+
+/* Caller must hold slots_lock. */
+int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+ struct kvm_io_device *dev)
+{
+ struct kvm_io_bus *new_bus, *bus;
+
+ bus = kvm->buses[bus_idx];
+ if (bus->dev_count > NR_IOBUS_DEVS-1)
+ return -ENOSPC;
+
+ new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
+ if (!new_bus)
+ return -ENOMEM;
+ memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
+ new_bus->devs[new_bus->dev_count++] = dev;
+ rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
+ kvm_synchronize_srcu_expedited(&kvm->srcu);
+ kfree(bus);
+
+ return 0;
+}
+
+/* Caller must hold slots_lock. */
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+ struct kvm_io_device *dev)
+{
+ int i, r;
+ struct kvm_io_bus *new_bus, *bus;
+
+ new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
+ if (!new_bus)
+ return -ENOMEM;
+
+ bus = kvm->buses[bus_idx];
+ memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
+
+ r = -ENOENT;
+ for (i = 0; i < new_bus->dev_count; i++)
+ if (new_bus->devs[i] == dev) {
+ r = 0;
+ new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
+ break;
+ }
+
+ if (r) {
+ kfree(new_bus);
+ return r;
+ }
+
+ rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
+ kvm_synchronize_srcu_expedited(&kvm->srcu);
+ kfree(bus);
+ return r;
+}
+
+static struct notifier_block kvm_cpu_notifier = {
+ .notifier_call = kvm_cpu_hotplug,
+};
+
+static int __vm_stat_get(void *_offset, u64 *val)
+{
+ unsigned offset = (long)_offset;
+ struct kvm *kvm;
+
+ *val = 0;
+ spin_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ *val += *(u32 *)((void *)kvm + offset);
+ spin_unlock(&kvm_lock);
+ return 0;
+}
+
+MAKE_SIMPLE_ATTRIBUTE_GETTER(vm_stat_get)
+DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
+
+static int __vcpu_stat_get(void *_offset, u64 *val)
+{
+ unsigned offset = (long)_offset;
+ struct kvm *kvm;
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ *val = 0;
+ spin_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ *val += *(u32 *)((void *)vcpu + offset);
+
+ spin_unlock(&kvm_lock);
+ return 0;
+}
+
+MAKE_SIMPLE_ATTRIBUTE_GETTER(vcpu_stat_get)
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
+
+static const struct file_operations *stat_fops[] = {
+ [KVM_STAT_VCPU] = &vcpu_stat_fops,
+ [KVM_STAT_VM] = &vm_stat_fops,
+};
+
+static void kvm_init_debug(void)
+{
+ struct kvm_stats_debugfs_item *p;
+
+ kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
+ for (p = debugfs_entries; p->name; ++p)
+ p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
+ (void *)(long)p->offset,
+ stat_fops[p->kind]);
+}
+
+static void kvm_exit_debug(void)
+{
+ struct kvm_stats_debugfs_item *p;
+
+ for (p = debugfs_entries; p->name; ++p)
+ debugfs_remove(p->dentry);
+ debugfs_remove(kvm_debugfs_dir);
+}
+
+static int kvm_suspend(struct sys_device *dev, pm_message_t state)
+{
+ if (kvm_usage_count)
+ hardware_disable_nolock(NULL);
+ return 0;
+}
+
+static int kvm_resume(struct sys_device *dev)
+{
+ if (kvm_usage_count) {
+ WARN_ON(spin_is_locked(&kvm_lock));
+ hardware_enable_nolock(NULL);
+ }
+ return 0;
+}
+
+static struct sysdev_class kvm_sysdev_class = {
+ set_kset_name("kvm"),
+ .suspend = kvm_suspend,
+ .resume = kvm_resume,
+};
+
+static struct sys_device kvm_sysdev = {
+ .id = 0,
+ .cls = &kvm_sysdev_class,
+};
+
+struct page *bad_page;
+pfn_t bad_pfn;
+
+static inline
+struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
+{
+ return container_of(pn, struct kvm_vcpu, preempt_notifier);
+}
+
+static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
+{
+ struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
+
+ kvm_arch_vcpu_load(vcpu, cpu);
+}
+
+static void kvm_sched_out(struct preempt_notifier *pn,
+ struct task_struct *next)
+{
+ struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
+
+ kvm_arch_vcpu_put(vcpu);
+ kvm_fire_urn();
+}
+
+int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
+ struct module *module)
+{
+ int r;
+ int cpu;
+
+ r = kvm_init_srcu();
+ if (r)
+ return r;
+
+ preempt_notifier_sys_init();
+
+ r = kvm_arch_init(opaque);
+ if (r)
+ goto out_fail;
+
+ bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+
+ if (bad_page == NULL) {
+ r = -ENOMEM;
+ goto out;
+ }
+
+ bad_pfn = page_to_pfn(bad_page);
+
+ hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+
+ if (hwpoison_page == NULL) {
+ r = -ENOMEM;
+ goto out_free_0;
+ }
+
+ hwpoison_pfn = page_to_pfn(hwpoison_page);
+
+ fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+
+ if (fault_page == NULL) {
+ r = -ENOMEM;
+ goto out_free_0;
+ }
+
+ fault_pfn = page_to_pfn(fault_page);
+
+ if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
+ r = -ENOMEM;
+ goto out_free_0;
+ }
+
+ r = kvm_arch_hardware_setup();
+ if (r < 0)
+ goto out_free_0a;
+
+ for_each_online_cpu(cpu) {
+ smp_call_function_single(cpu,
+ kvm_arch_check_processor_compat,
+ &r, 1);
+ if (r < 0)
+ goto out_free_1;
+ }
+
+ r = register_cpu_notifier(&kvm_cpu_notifier);
+ if (r)
+ goto out_free_2;
+ register_reboot_notifier(&kvm_reboot_notifier);
+
+ r = sysdev_class_register(&kvm_sysdev_class);
+ if (r)
+ goto out_free_3;
+
+ r = sysdev_register(&kvm_sysdev);
+ if (r)
+ goto out_free_4;
+
+ /* A kmem cache lets us meet the alignment requirements of fx_save. */
+ if (!vcpu_align)
+ vcpu_align = __alignof__(struct kvm_vcpu);
+ kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
+ 0, NULL);
+ if (!kvm_vcpu_cache) {
+ r = -ENOMEM;
+ goto out_free_5;
+ }
+
+ r = kvm_async_pf_init();
+ if (r)
+ goto out_free;
+
+ kvm_chardev_ops.owner = module;
+IF_ANON_INODES_DOES_REFCOUNTS( kvm_vm_fops.owner = module;)
+IF_ANON_INODES_DOES_REFCOUNTS( kvm_vcpu_fops.owner = module;)
+
+ r = misc_register(&kvm_dev);
+ if (r) {
+ printk(KERN_ERR "kvm: misc device register failed\n");
+ goto out_unreg;
+ }
+
+ kvm_preempt_ops.sched_in = kvm_sched_in;
+ kvm_preempt_ops.sched_out = kvm_sched_out;
+
+ kvm_init_debug();
+
+ printk("loaded kvm module (kvm-kmod-2.6.38-rc7)\n");
+
+ kvm_clock_warn_suspend_bug();
+
+ return 0;
+
+out_unreg:
+ kvm_async_pf_deinit();
+out_free:
+ kmem_cache_destroy(kvm_vcpu_cache);
+out_free_5:
+ sysdev_unregister(&kvm_sysdev);
+out_free_4:
+ sysdev_class_unregister(&kvm_sysdev_class);
+out_free_3:
+ unregister_reboot_notifier(&kvm_reboot_notifier);
+ unregister_cpu_notifier(&kvm_cpu_notifier);
+out_free_2:
+out_free_1:
+ kvm_arch_hardware_unsetup();
+out_free_0a:
+ free_cpumask_var(cpus_hardware_enabled);
+out_free_0:
+ if (fault_page)
+ __free_page(fault_page);
+ if (hwpoison_page)
+ __free_page(hwpoison_page);
+ __free_page(bad_page);
+out:
+ kvm_arch_exit();
+out_fail:
+ preempt_notifier_sys_exit();
+ kvm_exit_srcu();
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_init);
+
+void kvm_exit(void)
+{
+ kvm_exit_debug();
+ misc_deregister(&kvm_dev);
+ kmem_cache_destroy(kvm_vcpu_cache);
+ kvm_async_pf_deinit();
+ sysdev_unregister(&kvm_sysdev);
+ sysdev_class_unregister(&kvm_sysdev_class);
+ unregister_reboot_notifier(&kvm_reboot_notifier);
+ unregister_cpu_notifier(&kvm_cpu_notifier);
+ kvm_on_each_cpu(hardware_disable_nolock, NULL, 1);
+ kvm_arch_hardware_unsetup();
+ kvm_arch_exit();
+ free_cpumask_var(cpus_hardware_enabled);
+ __free_page(hwpoison_page);
+ __free_page(bad_page);
+ preempt_notifier_sys_exit();
+ kvm_exit_srcu();
+}
+EXPORT_SYMBOL_GPL(kvm_exit);
diff --git a/linux/x86/kvm_timer.h b/linux/x86/kvm_timer.h
new file mode 100644
index 0000000..64bc6ea
--- /dev/null
+++ b/linux/x86/kvm_timer.h
@@ -0,0 +1,16 @@
+
+struct kvm_timer {
+ struct hrtimer timer;
+ s64 period; /* unit: ns */
+ atomic_t pending; /* accumulated triggered timers */
+ bool reinject;
+ struct kvm_timer_ops *t_ops;
+ struct kvm *kvm;
+ struct kvm_vcpu *vcpu;
+};
+
+struct kvm_timer_ops {
+ bool (*is_periodic)(struct kvm_timer *);
+};
+
+enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
diff --git a/linux/x86/lapic.c b/linux/x86/lapic.c
new file mode 100644
index 0000000..d3ddaa4
--- /dev/null
+++ b/linux/x86/lapic.c
@@ -0,0 +1,1328 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+
+/*
+ * Local APIC virtualization
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2007 Novell
+ * Copyright (C) 2007 Intel
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Dor Laor <dor.laor@qumranet.com>
+ * Gregory Haskins <ghaskins@novell.com>
+ * Yaozu (Eddie) Dong <eddie.dong@intel.com>
+ *
+ * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+#include <linux/hrtimer.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/math64.h>
+#include <linux/slab.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/apicdef.h>
+#include <asm/atomic.h>
+#include "kvm_cache_regs.h"
+#include "irq.h"
+#include "trace.h"
+#include "x86.h"
+
+#ifndef CONFIG_X86_64
+#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
+#else
+#define mod_64(x, y) ((x) % (y))
+#endif
+
+#define PRId64 "d"
+#define PRIx64 "llx"
+#define PRIu64 "u"
+#define PRIo64 "o"
+
+#define APIC_BUS_CYCLE_NS 1
+
+/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
+#define apic_debug(fmt, arg...)
+
+#define APIC_LVT_NUM 6
+/* 14 is the version for Xeon and Pentium 8.4.8*/
+#define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16))
+#define LAPIC_MMIO_LENGTH (1 << 12)
+/* followed define is not in apicdef.h */
+#define APIC_SHORT_MASK 0xc0000
+#define APIC_DEST_NOSHORT 0x0
+#define APIC_DEST_MASK 0x800
+#define MAX_APIC_VECTOR 256
+
+#define VEC_POS(v) ((v) & (32 - 1))
+#define REG_POS(v) (((v) >> 5) << 4)
+
+static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
+{
+ return *((u32 *) (apic->regs + reg_off));
+}
+
+static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
+{
+ *((u32 *) (apic->regs + reg_off)) = val;
+}
+
+static inline int apic_test_and_set_vector(int vec, void *bitmap)
+{
+ return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+static inline int apic_test_and_clear_vector(int vec, void *bitmap)
+{
+ return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+static inline void apic_set_vector(int vec, void *bitmap)
+{
+ set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+static inline void apic_clear_vector(int vec, void *bitmap)
+{
+ clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+static inline int apic_hw_enabled(struct kvm_lapic *apic)
+{
+ return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
+}
+
+static inline int apic_sw_enabled(struct kvm_lapic *apic)
+{
+ return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
+}
+
+static inline int apic_enabled(struct kvm_lapic *apic)
+{
+ return apic_sw_enabled(apic) && apic_hw_enabled(apic);
+}
+
+#define LVT_MASK \
+ (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK)
+
+#define LINT_MASK \
+ (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
+ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
+
+static inline int kvm_apic_id(struct kvm_lapic *apic)
+{
+ return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
+}
+
+static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
+{
+ return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
+}
+
+static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
+{
+ return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
+}
+
+static inline int apic_lvtt_period(struct kvm_lapic *apic)
+{
+ return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
+}
+
+static inline int apic_lvt_nmi_mode(u32 lvt_val)
+{
+ return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
+}
+
+void kvm_apic_set_version(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ struct kvm_cpuid_entry2 *feat;
+ u32 v = APIC_VERSION;
+
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return;
+
+ feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
+ if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))))
+ v |= APIC_LVR_DIRECTED_EOI;
+ apic_set_reg(apic, APIC_LVR, v);
+}
+
+static inline int apic_x2apic_mode(struct kvm_lapic *apic)
+{
+ return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
+}
+
+static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
+ LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */
+ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
+ LVT_MASK | APIC_MODE_MASK, /* LVTPC */
+ LINT_MASK, LINT_MASK, /* LVT0-1 */
+ LVT_MASK /* LVTERR */
+};
+
+static int find_highest_vector(void *bitmap)
+{
+ u32 *word = bitmap;
+ int word_offset = MAX_APIC_VECTOR >> 5;
+
+ while ((word_offset != 0) && (word[(--word_offset) << 2] == 0))
+ continue;
+
+ if (likely(!word_offset && !word[0]))
+ return -1;
+ else
+ return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
+}
+
+static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
+{
+ apic->irr_pending = true;
+ return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
+}
+
+static inline int apic_search_irr(struct kvm_lapic *apic)
+{
+ return find_highest_vector(apic->regs + APIC_IRR);
+}
+
+static inline int apic_find_highest_irr(struct kvm_lapic *apic)
+{
+ int result;
+
+ if (!apic->irr_pending)
+ return -1;
+
+ result = apic_search_irr(apic);
+ ASSERT(result == -1 || result >= 16);
+
+ return result;
+}
+
+static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
+{
+ apic->irr_pending = false;
+ apic_clear_vector(vec, apic->regs + APIC_IRR);
+ if (apic_search_irr(apic) != -1)
+ apic->irr_pending = true;
+}
+
+int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ int highest_irr;
+
+ /* This may race with setting of irr in __apic_accept_irq() and
+ * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq
+ * will cause vmexit immediately and the value will be recalculated
+ * on the next vmentry.
+ */
+ if (!apic)
+ return 0;
+ highest_irr = apic_find_highest_irr(apic);
+
+ return highest_irr;
+}
+
+static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
+ int vector, int level, int trig_mode);
+
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ return __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
+ irq->level, irq->trig_mode);
+}
+
+static inline int apic_find_highest_isr(struct kvm_lapic *apic)
+{
+ int result;
+
+ result = find_highest_vector(apic->regs + APIC_ISR);
+ ASSERT(result == -1 || result >= 16);
+
+ return result;
+}
+
+static void apic_update_ppr(struct kvm_lapic *apic)
+{
+ u32 tpr, isrv, ppr, old_ppr;
+ int isr;
+
+ old_ppr = apic_get_reg(apic, APIC_PROCPRI);
+ tpr = apic_get_reg(apic, APIC_TASKPRI);
+ isr = apic_find_highest_isr(apic);
+ isrv = (isr != -1) ? isr : 0;
+
+ if ((tpr & 0xf0) >= (isrv & 0xf0))
+ ppr = tpr & 0xff;
+ else
+ ppr = isrv & 0xf0;
+
+ apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
+ apic, ppr, isr, isrv);
+
+ if (old_ppr != ppr) {
+ apic_set_reg(apic, APIC_PROCPRI, ppr);
+ if (ppr < old_ppr)
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+ }
+}
+
+static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
+{
+ apic_set_reg(apic, APIC_TASKPRI, tpr);
+ apic_update_ppr(apic);
+}
+
+int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
+{
+ return dest == 0xff || kvm_apic_id(apic) == dest;
+}
+
+int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
+{
+ int result = 0;
+ u32 logical_id;
+
+ if (apic_x2apic_mode(apic)) {
+ logical_id = apic_get_reg(apic, APIC_LDR);
+ return logical_id & mda;
+ }
+
+ logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
+
+ switch (apic_get_reg(apic, APIC_DFR)) {
+ case APIC_DFR_FLAT:
+ if (logical_id & mda)
+ result = 1;
+ break;
+ case APIC_DFR_CLUSTER:
+ if (((logical_id >> 4) == (mda >> 0x4))
+ && (logical_id & mda & 0xf))
+ result = 1;
+ break;
+ default:
+ printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n",
+ apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
+ break;
+ }
+
+ return result;
+}
+
+int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+ int short_hand, int dest, int dest_mode)
+{
+ int result = 0;
+ struct kvm_lapic *target = vcpu->arch.apic;
+
+ apic_debug("target %p, source %p, dest 0x%x, "
+ "dest_mode 0x%x, short_hand 0x%x\n",
+ target, source, dest, dest_mode, short_hand);
+
+ ASSERT(target);
+ switch (short_hand) {
+ case APIC_DEST_NOSHORT:
+ if (dest_mode == 0)
+ /* Physical mode. */
+ result = kvm_apic_match_physical_addr(target, dest);
+ else
+ /* Logical mode. */
+ result = kvm_apic_match_logical_addr(target, dest);
+ break;
+ case APIC_DEST_SELF:
+ result = (target == source);
+ break;
+ case APIC_DEST_ALLINC:
+ result = 1;
+ break;
+ case APIC_DEST_ALLBUT:
+ result = (target != source);
+ break;
+ default:
+ printk(KERN_WARNING "Bad dest shorthand value %x\n",
+ short_hand);
+ break;
+ }
+
+ return result;
+}
+
+/*
+ * Add a pending IRQ into lapic.
+ * Return 1 if successfully added and 0 if discarded.
+ */
+static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
+ int vector, int level, int trig_mode)
+{
+ int result = 0;
+ struct kvm_vcpu *vcpu = apic->vcpu;
+
+ switch (delivery_mode) {
+ case APIC_DM_LOWEST:
+ vcpu->arch.apic_arb_prio++;
+ case APIC_DM_FIXED:
+ /* FIXME add logic for vcpu on reset */
+ if (unlikely(!apic_enabled(apic)))
+ break;
+
+ if (trig_mode) {
+ apic_debug("level trig mode for vector %d", vector);
+ apic_set_vector(vector, apic->regs + APIC_TMR);
+ } else
+ apic_clear_vector(vector, apic->regs + APIC_TMR);
+
+ result = !apic_test_and_set_irr(vector, apic);
+ trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
+ trig_mode, vector, !result);
+ if (!result) {
+ if (trig_mode)
+ apic_debug("level trig mode repeatedly for "
+ "vector %d", vector);
+ break;
+ }
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ break;
+
+ case APIC_DM_REMRD:
+ printk(KERN_DEBUG "Ignoring delivery mode 3\n");
+ break;
+
+ case APIC_DM_SMI:
+ printk(KERN_DEBUG "Ignoring guest SMI\n");
+ break;
+
+ case APIC_DM_NMI:
+ result = 1;
+ kvm_inject_nmi(vcpu);
+ kvm_vcpu_kick(vcpu);
+ break;
+
+ case APIC_DM_INIT:
+ if (level) {
+ result = 1;
+ if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
+ printk(KERN_DEBUG
+ "INIT on a runnable vcpu %d\n",
+ vcpu->vcpu_id);
+ vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ } else {
+ apic_debug("Ignoring de-assert INIT to vcpu %d\n",
+ vcpu->vcpu_id);
+ }
+ break;
+
+ case APIC_DM_STARTUP:
+ apic_debug("SIPI to vcpu %d vector 0x%02x\n",
+ vcpu->vcpu_id, vector);
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+ result = 1;
+ vcpu->arch.sipi_vector = vector;
+ vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ }
+ break;
+
+ case APIC_DM_EXTINT:
+ /*
+ * Should only be called by kvm_apic_local_deliver() with LVT0,
+ * before NMI watchdog was enabled. Already handled by
+ * kvm_apic_accept_pic_intr().
+ */
+ break;
+
+ default:
+ printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
+ delivery_mode);
+ break;
+ }
+ return result;
+}
+
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
+{
+ return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
+}
+
+static void apic_set_eoi(struct kvm_lapic *apic)
+{
+ int vector = apic_find_highest_isr(apic);
+ int trigger_mode;
+ /*
+ * Not every write EOI will has corresponding ISR,
+ * one example is when Kernel check timer on setup_IO_APIC
+ */
+ if (vector == -1)
+ return;
+
+ apic_clear_vector(vector, apic->regs + APIC_ISR);
+ apic_update_ppr(apic);
+
+ if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
+ trigger_mode = IOAPIC_LEVEL_TRIG;
+ else
+ trigger_mode = IOAPIC_EDGE_TRIG;
+ if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
+ kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+}
+
+static void apic_send_ipi(struct kvm_lapic *apic)
+{
+ u32 icr_low = apic_get_reg(apic, APIC_ICR);
+ u32 icr_high = apic_get_reg(apic, APIC_ICR2);
+ struct kvm_lapic_irq irq;
+
+ irq.vector = icr_low & APIC_VECTOR_MASK;
+ irq.delivery_mode = icr_low & APIC_MODE_MASK;
+ irq.dest_mode = icr_low & APIC_DEST_MASK;
+ irq.level = icr_low & APIC_INT_ASSERT;
+ irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
+ irq.shorthand = icr_low & APIC_SHORT_MASK;
+ if (apic_x2apic_mode(apic))
+ irq.dest_id = icr_high;
+ else
+ irq.dest_id = GET_APIC_DEST_FIELD(icr_high);
+
+ trace_kvm_apic_ipi(icr_low, irq.dest_id);
+
+ apic_debug("icr_high 0x%x, icr_low 0x%x, "
+ "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
+ "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
+ icr_high, icr_low, irq.shorthand, irq.dest_id,
+ irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
+ irq.vector);
+
+ kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
+}
+
+static u32 apic_get_tmcct(struct kvm_lapic *apic)
+{
+ ktime_t remaining;
+ s64 ns;
+ u32 tmcct;
+
+ ASSERT(apic != NULL);
+
+ /* if initial count is 0, current count should also be 0 */
+ if (apic_get_reg(apic, APIC_TMICT) == 0)
+ return 0;
+
+ remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);
+ if (ktime_to_ns(remaining) < 0)
+ remaining = ktime_set(0, 0);
+
+ ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
+ tmcct = div64_u64(ns,
+ (APIC_BUS_CYCLE_NS * apic->divide_count));
+
+ return tmcct;
+}
+
+static void __report_tpr_access(struct kvm_lapic *apic, bool write)
+{
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ struct kvm_run *run = vcpu->run;
+
+ kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu);
+ run->tpr_access.rip = kvm_rip_read(vcpu);
+ run->tpr_access.is_write = write;
+}
+
+static inline void report_tpr_access(struct kvm_lapic *apic, bool write)
+{
+ if (apic->vcpu->arch.tpr_access_reporting)
+ __report_tpr_access(apic, write);
+}
+
+static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
+{
+ u32 val = 0;
+
+ if (offset >= LAPIC_MMIO_LENGTH)
+ return 0;
+
+ switch (offset) {
+ case APIC_ID:
+ if (apic_x2apic_mode(apic))
+ val = kvm_apic_id(apic);
+ else
+ val = kvm_apic_id(apic) << 24;
+ break;
+ case APIC_ARBPRI:
+ printk(KERN_WARNING "Access APIC ARBPRI register "
+ "which is for P6\n");
+ break;
+
+ case APIC_TMCCT: /* Timer CCR */
+ val = apic_get_tmcct(apic);
+ break;
+
+ case APIC_TASKPRI:
+ report_tpr_access(apic, false);
+ /* fall thru */
+ default:
+ apic_update_ppr(apic);
+ val = apic_get_reg(apic, offset);
+ break;
+ }
+
+ return val;
+}
+
+static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_lapic, dev);
+}
+
+static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
+ void *data)
+{
+ unsigned char alignment = offset & 0xf;
+ u32 result;
+ /* this bitmask has a bit cleared for each reserver register */
+ static const u64 rmask = 0x43ff01ffffffe70cULL;
+
+ if ((alignment + len) > 4) {
+ apic_debug("KVM_APIC_READ: alignment error %x %d\n",
+ offset, len);
+ return 1;
+ }
+
+ if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) {
+ apic_debug("KVM_APIC_READ: read reserved register %x\n",
+ offset);
+ return 1;
+ }
+
+ result = __apic_read(apic, offset & ~0xf);
+
+ trace_kvm_apic_read(offset, result);
+
+ switch (len) {
+ case 1:
+ case 2:
+ case 4:
+ memcpy(data, (char *)&result + alignment, len);
+ break;
+ default:
+ printk(KERN_ERR "Local APIC read with len = %x, "
+ "should be 1,2, or 4 instead\n", len);
+ break;
+ }
+ return 0;
+}
+
+static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
+{
+ return apic_hw_enabled(apic) &&
+ addr >= apic->base_address &&
+ addr < apic->base_address + LAPIC_MMIO_LENGTH;
+}
+
+static int apic_mmio_read(struct kvm_io_device *this,
+ gpa_t address, int len, void *data)
+{
+ struct kvm_lapic *apic = to_lapic(this);
+ u32 offset = address - apic->base_address;
+
+ if (!apic_mmio_in_range(apic, address))
+ return -EOPNOTSUPP;
+
+ apic_reg_read(apic, offset, len, data);
+
+ return 0;
+}
+
+static void update_divide_count(struct kvm_lapic *apic)
+{
+ u32 tmp1, tmp2, tdcr;
+
+ tdcr = apic_get_reg(apic, APIC_TDCR);
+ tmp1 = tdcr & 0xf;
+ tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
+ apic->divide_count = 0x1 << (tmp2 & 0x7);
+
+ apic_debug("timer divide count is 0x%x\n",
+ apic->divide_count);
+}
+
+static void start_apic_timer(struct kvm_lapic *apic)
+{
+ ktime_t now = apic->lapic_timer.timer.base->get_time();
+
+ apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) *
+ APIC_BUS_CYCLE_NS * apic->divide_count;
+ atomic_set(&apic->lapic_timer.pending, 0);
+
+ if (!apic->lapic_timer.period)
+ return;
+ /*
+ * Do not allow the guest to program periodic timers with small
+ * interval, since the hrtimers are not throttled by the host
+ * scheduler.
+ */
+ if (apic_lvtt_period(apic)) {
+ if (apic->lapic_timer.period < NSEC_PER_MSEC/2)
+ apic->lapic_timer.period = NSEC_PER_MSEC/2;
+ }
+
+ hrtimer_start(&apic->lapic_timer.timer,
+ ktime_add_ns(now, apic->lapic_timer.period),
+ HRTIMER_MODE_ABS);
+
+ apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
+ PRIx64 ", "
+ "timer initial count 0x%x, period %lldns, "
+ "expire @ 0x%016" PRIx64 ".\n", __func__,
+ APIC_BUS_CYCLE_NS, ktime_to_ns(now),
+ apic_get_reg(apic, APIC_TMICT),
+ apic->lapic_timer.period,
+ ktime_to_ns(ktime_add_ns(now,
+ apic->lapic_timer.period)));
+}
+
+static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
+{
+ int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0));
+
+ if (apic_lvt_nmi_mode(lvt0_val)) {
+ if (!nmi_wd_enabled) {
+ apic_debug("Receive NMI setting on APIC_LVT0 "
+ "for cpu %d\n", apic->vcpu->vcpu_id);
+ apic->vcpu->kvm->arch.vapics_in_nmi_mode++;
+ }
+ } else if (nmi_wd_enabled)
+ apic->vcpu->kvm->arch.vapics_in_nmi_mode--;
+}
+
+static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
+{
+ int ret = 0;
+
+ trace_kvm_apic_write(reg, val);
+
+ switch (reg) {
+ case APIC_ID: /* Local APIC ID */
+ if (!apic_x2apic_mode(apic))
+ apic_set_reg(apic, APIC_ID, val);
+ else
+ ret = 1;
+ break;
+
+ case APIC_TASKPRI:
+ report_tpr_access(apic, true);
+ apic_set_tpr(apic, val & 0xff);
+ break;
+
+ case APIC_EOI:
+ apic_set_eoi(apic);
+ break;
+
+ case APIC_LDR:
+ if (!apic_x2apic_mode(apic))
+ apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
+ else
+ ret = 1;
+ break;
+
+ case APIC_DFR:
+ if (!apic_x2apic_mode(apic))
+ apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
+ else
+ ret = 1;
+ break;
+
+ case APIC_SPIV: {
+ u32 mask = 0x3ff;
+ if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
+ mask |= APIC_SPIV_DIRECTED_EOI;
+ apic_set_reg(apic, APIC_SPIV, val & mask);
+ if (!(val & APIC_SPIV_APIC_ENABLED)) {
+ int i;
+ u32 lvt_val;
+
+ for (i = 0; i < APIC_LVT_NUM; i++) {
+ lvt_val = apic_get_reg(apic,
+ APIC_LVTT + 0x10 * i);
+ apic_set_reg(apic, APIC_LVTT + 0x10 * i,
+ lvt_val | APIC_LVT_MASKED);
+ }
+ atomic_set(&apic->lapic_timer.pending, 0);
+
+ }
+ break;
+ }
+ case APIC_ICR:
+ /* No delay here, so we always clear the pending bit */
+ apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
+ apic_send_ipi(apic);
+ break;
+
+ case APIC_ICR2:
+ if (!apic_x2apic_mode(apic))
+ val &= 0xff000000;
+ apic_set_reg(apic, APIC_ICR2, val);
+ break;
+
+ case APIC_LVT0:
+ apic_manage_nmi_watchdog(apic, val);
+ case APIC_LVTT:
+ case APIC_LVTTHMR:
+ case APIC_LVTPC:
+ case APIC_LVT1:
+ case APIC_LVTERR:
+ /* TODO: Check vector */
+ if (!apic_sw_enabled(apic))
+ val |= APIC_LVT_MASKED;
+
+ val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
+ apic_set_reg(apic, reg, val);
+
+ break;
+
+ case APIC_TMICT:
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ apic_set_reg(apic, APIC_TMICT, val);
+ start_apic_timer(apic);
+ break;
+
+ case APIC_TDCR:
+ if (val & 4)
+ printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val);
+ apic_set_reg(apic, APIC_TDCR, val);
+ update_divide_count(apic);
+ break;
+
+ case APIC_ESR:
+ if (apic_x2apic_mode(apic) && val != 0) {
+ printk(KERN_ERR "KVM_WRITE:ESR not zero %x\n", val);
+ ret = 1;
+ }
+ break;
+
+ case APIC_SELF_IPI:
+ if (apic_x2apic_mode(apic)) {
+ apic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff));
+ } else
+ ret = 1;
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+ if (ret)
+ apic_debug("Local APIC Write to read-only register %x\n", reg);
+ return ret;
+}
+
+static int apic_mmio_write(struct kvm_io_device *this,
+ gpa_t address, int len, const void *data)
+{
+ struct kvm_lapic *apic = to_lapic(this);
+ unsigned int offset = address - apic->base_address;
+ u32 val;
+
+ if (!apic_mmio_in_range(apic, address))
+ return -EOPNOTSUPP;
+
+ /*
+ * APIC register must be aligned on 128-bits boundary.
+ * 32/64/128 bits registers must be accessed thru 32 bits.
+ * Refer SDM 8.4.1
+ */
+ if (len != 4 || (offset & 0xf)) {
+ /* Don't shout loud, $infamous_os would cause only noise. */
+ apic_debug("apic write: bad size=%d %lx\n", len, (long)address);
+ return 0;
+ }
+
+ val = *(u32*)data;
+
+ /* too common printing */
+ if (offset != APIC_EOI)
+ apic_debug("%s: offset 0x%x with length 0x%x, and value is "
+ "0x%x\n", __func__, offset, len, val);
+
+ apic_reg_write(apic, offset & 0xff0, val);
+
+ return 0;
+}
+
+void kvm_free_lapic(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu->arch.apic)
+ return;
+
+ hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
+
+ if (vcpu->arch.apic->regs_page)
+ __free_page(vcpu->arch.apic->regs_page);
+
+ kfree(vcpu->arch.apic);
+}
+
+/*
+ *----------------------------------------------------------------------
+ * LAPIC interface
+ *----------------------------------------------------------------------
+ */
+
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!apic)
+ return;
+ apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
+ | (apic_get_reg(apic, APIC_TASKPRI) & 4));
+}
+
+u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u64 tpr;
+
+ if (!apic)
+ return 0;
+ tpr = (u64) apic_get_reg(apic, APIC_TASKPRI);
+
+ return (tpr & 0xf0) >> 4;
+}
+
+void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!apic) {
+ value |= MSR_IA32_APICBASE_BSP;
+ vcpu->arch.apic_base = value;
+ return;
+ }
+
+ if (!kvm_vcpu_is_bsp(apic->vcpu))
+ value &= ~MSR_IA32_APICBASE_BSP;
+
+ vcpu->arch.apic_base = value;
+ if (apic_x2apic_mode(apic)) {
+ u32 id = kvm_apic_id(apic);
+ u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf));
+ apic_set_reg(apic, APIC_LDR, ldr);
+ }
+ apic->base_address = apic->vcpu->arch.apic_base &
+ MSR_IA32_APICBASE_BASE;
+
+ /* with FSB delivery interrupt, we can restart APIC functionality */
+ apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
+ "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
+
+}
+
+void kvm_lapic_reset(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic;
+ int i;
+
+ apic_debug("%s\n", __func__);
+
+ ASSERT(vcpu);
+ apic = vcpu->arch.apic;
+ ASSERT(apic != NULL);
+
+ /* Stop the timer in case it's a reset to an active apic */
+ hrtimer_cancel(&apic->lapic_timer.timer);
+
+ apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
+ kvm_apic_set_version(apic->vcpu);
+
+ for (i = 0; i < APIC_LVT_NUM; i++)
+ apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
+ apic_set_reg(apic, APIC_LVT0,
+ SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
+
+ apic_set_reg(apic, APIC_DFR, 0xffffffffU);
+ apic_set_reg(apic, APIC_SPIV, 0xff);
+ apic_set_reg(apic, APIC_TASKPRI, 0);
+ apic_set_reg(apic, APIC_LDR, 0);
+ apic_set_reg(apic, APIC_ESR, 0);
+ apic_set_reg(apic, APIC_ICR, 0);
+ apic_set_reg(apic, APIC_ICR2, 0);
+ apic_set_reg(apic, APIC_TDCR, 0);
+ apic_set_reg(apic, APIC_TMICT, 0);
+ for (i = 0; i < 8; i++) {
+ apic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
+ apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
+ apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
+ }
+ apic->irr_pending = false;
+ update_divide_count(apic);
+ atomic_set(&apic->lapic_timer.pending, 0);
+ if (kvm_vcpu_is_bsp(vcpu))
+ vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
+ apic_update_ppr(apic);
+
+ vcpu->arch.apic_arb_prio = 0;
+
+ apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
+ "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
+ vcpu, kvm_apic_id(apic),
+ vcpu->arch.apic_base, apic->base_address);
+}
+
+bool kvm_apic_present(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic);
+}
+
+int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
+{
+ return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
+}
+
+/*
+ *----------------------------------------------------------------------
+ * timer interface
+ *----------------------------------------------------------------------
+ */
+
+static bool lapic_is_periodic(struct kvm_timer *ktimer)
+{
+ struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic,
+ lapic_timer);
+ return apic_lvtt_period(apic);
+}
+
+int apic_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *lapic = vcpu->arch.apic;
+
+ if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT))
+ return atomic_read(&lapic->lapic_timer.pending);
+
+ return 0;
+}
+
+static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
+{
+ u32 reg = apic_get_reg(apic, lvt_type);
+ int vector, mode, trig_mode;
+
+ if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
+ vector = reg & APIC_VECTOR_MASK;
+ mode = reg & APIC_MODE_MASK;
+ trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
+ return __apic_accept_irq(apic, mode, vector, 1, trig_mode);
+ }
+ return 0;
+}
+
+void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (apic)
+ kvm_apic_local_deliver(apic, APIC_LVT0);
+}
+
+static struct kvm_timer_ops lapic_timer_ops = {
+ .is_periodic = lapic_is_periodic,
+};
+
+static const struct kvm_io_device_ops apic_mmio_ops = {
+ .read = apic_mmio_read,
+ .write = apic_mmio_write,
+};
+
+int kvm_create_lapic(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic;
+
+ ASSERT(vcpu != NULL);
+ apic_debug("apic_init %d\n", vcpu->vcpu_id);
+
+ apic = kzalloc(sizeof(*apic), GFP_KERNEL);
+ if (!apic)
+ goto nomem;
+
+ vcpu->arch.apic = apic;
+
+ apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
+ if (apic->regs_page == NULL) {
+ printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
+ vcpu->vcpu_id);
+ goto nomem_free_apic;
+ }
+ apic->regs = page_address(apic->regs_page);
+ apic->vcpu = vcpu;
+
+ hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
+ apic->lapic_timer.timer.function = kvm_timer_fn;
+ apic->lapic_timer.t_ops = &lapic_timer_ops;
+ apic->lapic_timer.kvm = vcpu->kvm;
+ apic->lapic_timer.vcpu = vcpu;
+
+ apic->base_address = APIC_DEFAULT_PHYS_BASE;
+ vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
+
+ kvm_lapic_reset(vcpu);
+ kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
+
+ return 0;
+nomem_free_apic:
+ kfree(apic);
+nomem:
+ return -ENOMEM;
+}
+
+int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ int highest_irr;
+
+ if (!apic || !apic_enabled(apic))
+ return -1;
+
+ apic_update_ppr(apic);
+ highest_irr = apic_find_highest_irr(apic);
+ if ((highest_irr == -1) ||
+ ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
+ return -1;
+ return highest_irr;
+}
+
+int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
+{
+ u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
+ int r = 0;
+
+ if (!apic_hw_enabled(vcpu->arch.apic))
+ r = 1;
+ if ((lvt0 & APIC_LVT_MASKED) == 0 &&
+ GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
+ r = 1;
+ return r;
+}
+
+void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (apic && atomic_read(&apic->lapic_timer.pending) > 0) {
+ if (kvm_apic_local_deliver(apic, APIC_LVTT))
+ atomic_dec(&apic->lapic_timer.pending);
+ }
+}
+
+int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
+{
+ int vector = kvm_apic_has_interrupt(vcpu);
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (vector == -1)
+ return -1;
+
+ apic_set_vector(vector, apic->regs + APIC_ISR);
+ apic_update_ppr(apic);
+ apic_clear_irr(vector, apic);
+ return vector;
+}
+
+void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ apic->base_address = vcpu->arch.apic_base &
+ MSR_IA32_APICBASE_BASE;
+ kvm_apic_set_version(vcpu);
+
+ apic_update_ppr(apic);
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ update_divide_count(apic);
+ start_apic_timer(apic);
+ apic->irr_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
+void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ struct hrtimer *timer;
+
+ if (!apic)
+ return;
+
+ timer = &apic->lapic_timer.timer;
+ if (hrtimer_cancel(timer))
+ kvm_hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+}
+
+void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
+{
+ u32 data;
+ void *vapic;
+
+ if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+ return;
+
+ vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
+ data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr));
+ kunmap_atomic(vapic, KM_USER0);
+
+ apic_set_tpr(vcpu->arch.apic, data & 0xff);
+}
+
+void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
+{
+ u32 data, tpr;
+ int max_irr, max_isr;
+ struct kvm_lapic *apic;
+ void *vapic;
+
+ if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+ return;
+
+ apic = vcpu->arch.apic;
+ tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
+ max_irr = apic_find_highest_irr(apic);
+ if (max_irr < 0)
+ max_irr = 0;
+ max_isr = apic_find_highest_isr(apic);
+ if (max_isr < 0)
+ max_isr = 0;
+ data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
+
+ vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
+ *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data;
+ kunmap_atomic(vapic, KM_USER0);
+}
+
+void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
+{
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return;
+
+ vcpu->arch.apic->vapic_addr = vapic_addr;
+}
+
+int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = (msr - APIC_BASE_MSR) << 4;
+
+ if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
+ return 1;
+
+ /* if this is ICR write vector before command */
+ if (msr == 0x830)
+ apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
+ return apic_reg_write(apic, reg, (u32)data);
+}
+
+int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
+
+ if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
+ return 1;
+
+ if (apic_reg_read(apic, reg, 4, &low))
+ return 1;
+ if (msr == 0x830)
+ apic_reg_read(apic, APIC_ICR2, 4, &high);
+
+ *data = (((u64)high) << 32) | low;
+
+ return 0;
+}
+
+int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return 1;
+
+ /* if this is ICR write vector before command */
+ if (reg == APIC_ICR)
+ apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
+ return apic_reg_write(apic, reg, (u32)data);
+}
+
+int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 low, high = 0;
+
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return 1;
+
+ if (apic_reg_read(apic, reg, 4, &low))
+ return 1;
+ if (reg == APIC_ICR)
+ apic_reg_read(apic, APIC_ICR2, 4, &high);
+
+ *data = (((u64)high) << 32) | low;
+
+ return 0;
+}
diff --git a/linux/x86/lapic.h b/linux/x86/lapic.h
new file mode 100644
index 0000000..f5fe32c
--- /dev/null
+++ b/linux/x86/lapic.h
@@ -0,0 +1,59 @@
+#ifndef __KVM_X86_LAPIC_H
+#define __KVM_X86_LAPIC_H
+
+#include "iodev.h"
+#include "kvm_timer.h"
+
+#include <linux/kvm_host.h>
+
+struct kvm_lapic {
+ unsigned long base_address;
+ struct kvm_io_device dev;
+ struct kvm_timer lapic_timer;
+ u32 divide_count;
+ struct kvm_vcpu *vcpu;
+ bool irr_pending;
+ struct page *regs_page;
+ void *regs;
+ gpa_t vapic_addr;
+ struct page *vapic_page;
+};
+int kvm_create_lapic(struct kvm_vcpu *vcpu);
+void kvm_free_lapic(struct kvm_vcpu *vcpu);
+
+int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
+int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
+void kvm_lapic_reset(struct kvm_vcpu *vcpu);
+u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
+void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
+u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
+void kvm_apic_set_version(struct kvm_vcpu *vcpu);
+
+int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
+int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
+
+u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
+void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
+void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
+int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
+bool kvm_apic_present(struct kvm_vcpu *vcpu);
+int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
+
+void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
+void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
+void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
+
+int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+
+int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+
+static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
+}
+#endif
diff --git a/linux/x86/mmu.c b/linux/x86/mmu.c
new file mode 100644
index 0000000..9713914
--- /dev/null
+++ b/linux/x86/mmu.c
@@ -0,0 +1,3887 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * MMU support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "irq.h"
+#include "mmu.h"
+#include "x86.h"
+#include "kvm_cache_regs.h"
+#include "x86.h"
+
+#include <linux/kvm_host.h>
+#include <asm/types.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/hugetlb.h>
+
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include <asm/page.h>
+#include <asm/cmpxchg.h>
+#include <asm/io.h>
+#include <asm/vmx.h>
+
+/*
+ * When setting this variable to true it enables Two-Dimensional-Paging
+ * where the hardware walks 2 page tables:
+ * 1. the guest-virtual to guest-physical
+ * 2. while doing 1. it walks guest-physical to host-physical
+ * If the hardware supports that we don't need to do shadow paging.
+ */
+bool tdp_enabled = false;
+
+enum {
+ AUDIT_PRE_PAGE_FAULT,
+ AUDIT_POST_PAGE_FAULT,
+ AUDIT_PRE_PTE_WRITE,
+ AUDIT_POST_PTE_WRITE,
+ AUDIT_PRE_SYNC,
+ AUDIT_POST_SYNC
+};
+
+char *audit_point_name[] = {
+ "pre page fault",
+ "post page fault",
+ "pre pte write",
+ "post pte write",
+ "pre sync",
+ "post sync"
+};
+
+#undef MMU_DEBUG
+
+#ifdef MMU_DEBUG
+
+#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
+#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
+
+#else
+
+#define pgprintk(x...) do { } while (0)
+#define rmap_printk(x...) do { } while (0)
+
+#endif
+
+#ifdef MMU_DEBUG
+static int dbg = 0;
+module_param(dbg, bool, 0644);
+#endif
+
+static int oos_shadow = 1;
+module_param(oos_shadow, bool, 0644);
+
+#ifndef MMU_DEBUG
+#define ASSERT(x) do { } while (0)
+#else
+#define ASSERT(x) \
+ if (!(x)) { \
+ printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
+ __FILE__, __LINE__, #x); \
+ }
+#endif
+
+#define PTE_PREFETCH_NUM 8
+
+#define PT_FIRST_AVAIL_BITS_SHIFT 9
+#define PT64_SECOND_AVAIL_BITS_SHIFT 52
+
+#define PT64_LEVEL_BITS 9
+
+#define PT64_LEVEL_SHIFT(level) \
+ (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
+
+#define PT64_LEVEL_MASK(level) \
+ (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
+
+#define PT64_INDEX(address, level)\
+ (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
+
+
+#define PT32_LEVEL_BITS 10
+
+#define PT32_LEVEL_SHIFT(level) \
+ (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
+
+#define PT32_LEVEL_MASK(level) \
+ (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
+#define PT32_LVL_OFFSET_MASK(level) \
+ (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
+ * PT32_LEVEL_BITS))) - 1))
+
+#define PT32_INDEX(address, level)\
+ (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
+
+
+#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
+#define PT64_DIR_BASE_ADDR_MASK \
+ (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
+#define PT64_LVL_ADDR_MASK(level) \
+ (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
+ * PT64_LEVEL_BITS))) - 1))
+#define PT64_LVL_OFFSET_MASK(level) \
+ (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
+ * PT64_LEVEL_BITS))) - 1))
+
+#define PT32_BASE_ADDR_MASK PAGE_MASK
+#define PT32_DIR_BASE_ADDR_MASK \
+ (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
+#define PT32_LVL_ADDR_MASK(level) \
+ (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
+ * PT32_LEVEL_BITS))) - 1))
+
+#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
+ | PT64_NX_MASK)
+
+#define RMAP_EXT 4
+
+#define ACC_EXEC_MASK 1
+#define ACC_WRITE_MASK PT_WRITABLE_MASK
+#define ACC_USER_MASK PT_USER_MASK
+#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+
+#include <trace/events/kvm.h>
+
+#define CREATE_TRACE_POINTS
+#include "mmutrace.h"
+
+#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+
+#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
+
+struct kvm_rmap_desc {
+ u64 *sptes[RMAP_EXT];
+ struct kvm_rmap_desc *more;
+};
+
+struct kvm_shadow_walk_iterator {
+ u64 addr;
+ hpa_t shadow_addr;
+ int level;
+ u64 *sptep;
+ unsigned index;
+};
+
+#define for_each_shadow_entry(_vcpu, _addr, _walker) \
+ for (shadow_walk_init(&(_walker), _vcpu, _addr); \
+ shadow_walk_okay(&(_walker)); \
+ shadow_walk_next(&(_walker)))
+
+typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
+
+static struct kmem_cache *pte_chain_cache;
+static struct kmem_cache *rmap_desc_cache;
+static struct kmem_cache *mmu_page_header_cache;
+static struct percpu_counter kvm_total_used_mmu_pages;
+
+static u64 __read_mostly shadow_trap_nonpresent_pte;
+static u64 __read_mostly shadow_notrap_nonpresent_pte;
+static u64 __read_mostly shadow_nx_mask;
+static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
+static u64 __read_mostly shadow_user_mask;
+static u64 __read_mostly shadow_accessed_mask;
+static u64 __read_mostly shadow_dirty_mask;
+
+static inline u64 rsvd_bits(int s, int e)
+{
+ return ((1ULL << (e - s + 1)) - 1) << s;
+}
+
+void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
+{
+ shadow_trap_nonpresent_pte = trap_pte;
+ shadow_notrap_nonpresent_pte = notrap_pte;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
+
+void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
+ u64 dirty_mask, u64 nx_mask, u64 x_mask)
+{
+ shadow_user_mask = user_mask;
+ shadow_accessed_mask = accessed_mask;
+ shadow_dirty_mask = dirty_mask;
+ shadow_nx_mask = nx_mask;
+ shadow_x_mask = x_mask;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
+
+static bool is_write_protection(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
+}
+
+static int is_cpuid_PSE36(void)
+{
+ return 1;
+}
+
+static int is_nx(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.efer & EFER_NX;
+}
+
+static int is_shadow_present_pte(u64 pte)
+{
+ return pte != shadow_trap_nonpresent_pte
+ && pte != shadow_notrap_nonpresent_pte;
+}
+
+static int is_large_pte(u64 pte)
+{
+ return pte & PT_PAGE_SIZE_MASK;
+}
+
+static int is_writable_pte(unsigned long pte)
+{
+ return pte & PT_WRITABLE_MASK;
+}
+
+static int is_dirty_gpte(unsigned long pte)
+{
+ return pte & PT_DIRTY_MASK;
+}
+
+static int is_rmap_spte(u64 pte)
+{
+ return is_shadow_present_pte(pte);
+}
+
+static int is_last_spte(u64 pte, int level)
+{
+ if (level == PT_PAGE_TABLE_LEVEL)
+ return 1;
+ if (is_large_pte(pte))
+ return 1;
+ return 0;
+}
+
+static pfn_t spte_to_pfn(u64 pte)
+{
+ return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+}
+
+static gfn_t pse36_gfn_delta(u32 gpte)
+{
+ int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
+
+ return (gpte & PT32_DIR_PSE36_MASK) << shift;
+}
+
+static void __set_spte(u64 *sptep, u64 spte)
+{
+ kvm_set_64bit(sptep, spte);
+}
+
+static u64 __xchg_spte(u64 *sptep, u64 new_spte)
+{
+#ifdef CONFIG_X86_64
+ return xchg(sptep, new_spte);
+#else
+ u64 old_spte;
+
+ do {
+ old_spte = *sptep;
+ } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
+
+ return old_spte;
+#endif
+}
+
+static bool spte_has_volatile_bits(u64 spte)
+{
+ if (!shadow_accessed_mask)
+ return false;
+
+ if (!is_shadow_present_pte(spte))
+ return false;
+
+ if ((spte & shadow_accessed_mask) &&
+ (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
+ return false;
+
+ return true;
+}
+
+static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
+{
+ return (old_spte & bit_mask) && !(new_spte & bit_mask);
+}
+
+static void update_spte(u64 *sptep, u64 new_spte)
+{
+ u64 mask, old_spte = *sptep;
+
+ WARN_ON(!is_rmap_spte(new_spte));
+
+ new_spte |= old_spte & shadow_dirty_mask;
+
+ mask = shadow_accessed_mask;
+ if (is_writable_pte(old_spte))
+ mask |= shadow_dirty_mask;
+
+ if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
+ __set_spte(sptep, new_spte);
+ else
+ old_spte = __xchg_spte(sptep, new_spte);
+
+ if (!shadow_accessed_mask)
+ return;
+
+ if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
+ kvm_set_pfn_accessed(spte_to_pfn(old_spte));
+ if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
+ kvm_set_pfn_dirty(spte_to_pfn(old_spte));
+}
+
+static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+ struct kmem_cache *base_cache, int min)
+{
+ void *obj;
+
+ if (cache->nobjs >= min)
+ return 0;
+ while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
+ obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+ cache->objects[cache->nobjs++] = obj;
+ }
+ return 0;
+}
+
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
+ struct kmem_cache *cache)
+{
+ while (mc->nobjs)
+ kmem_cache_free(cache, mc->objects[--mc->nobjs]);
+}
+
+static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
+ int min)
+{
+ struct page *page;
+
+ if (cache->nobjs >= min)
+ return 0;
+ while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ cache->objects[cache->nobjs++] = page_address(page);
+ }
+ return 0;
+}
+
+static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
+{
+ while (mc->nobjs)
+ free_page((unsigned long)mc->objects[--mc->nobjs]);
+}
+
+static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
+ pte_chain_cache, 4);
+ if (r)
+ goto out;
+ r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
+ rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
+ if (r)
+ goto out;
+ r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
+ if (r)
+ goto out;
+ r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
+ mmu_page_header_cache, 4);
+out:
+ return r;
+}
+
+static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
+{
+ mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
+ mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
+ mmu_page_header_cache);
+}
+
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
+ size_t size)
+{
+ void *p;
+
+ BUG_ON(!mc->nobjs);
+ p = mc->objects[--mc->nobjs];
+ return p;
+}
+
+static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
+{
+ return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
+ sizeof(struct kvm_pte_chain));
+}
+
+static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
+{
+ kmem_cache_free(pte_chain_cache, pc);
+}
+
+static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
+{
+ return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
+ sizeof(struct kvm_rmap_desc));
+}
+
+static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
+{
+ kmem_cache_free(rmap_desc_cache, rd);
+}
+
+static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
+{
+ if (!sp->role.direct)
+ return sp->gfns[index];
+
+ return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
+}
+
+static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
+{
+ if (sp->role.direct)
+ BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
+ else
+ sp->gfns[index] = gfn;
+}
+
+/*
+ * Return the pointer to the large page information for a given gfn,
+ * handling slots that are not large page aligned.
+ */
+static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
+ struct kvm_memory_slot *slot,
+ int level)
+{
+ unsigned long idx;
+
+ idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+ (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
+ return &slot->lpage_info[level - 2][idx];
+}
+
+static void account_shadowed(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot;
+ struct kvm_lpage_info *linfo;
+ int i;
+
+ slot = gfn_to_memslot(kvm, gfn);
+ for (i = PT_DIRECTORY_LEVEL;
+ i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ linfo = lpage_info_slot(gfn, slot, i);
+ linfo->write_count += 1;
+ }
+}
+
+static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot;
+ struct kvm_lpage_info *linfo;
+ int i;
+
+ slot = gfn_to_memslot(kvm, gfn);
+ for (i = PT_DIRECTORY_LEVEL;
+ i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ linfo = lpage_info_slot(gfn, slot, i);
+ linfo->write_count -= 1;
+ WARN_ON(linfo->write_count < 0);
+ }
+}
+
+static int has_wrprotected_page(struct kvm *kvm,
+ gfn_t gfn,
+ int level)
+{
+ struct kvm_memory_slot *slot;
+ struct kvm_lpage_info *linfo;
+
+ slot = gfn_to_memslot(kvm, gfn);
+ if (slot) {
+ linfo = lpage_info_slot(gfn, slot, level);
+ return linfo->write_count;
+ }
+
+ return 1;
+}
+
+static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
+{
+ unsigned long page_size;
+ int i, ret = 0;
+
+ page_size = kvm_host_page_size(kvm, gfn);
+
+ for (i = PT_PAGE_TABLE_LEVEL;
+ i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
+ if (page_size >= KVM_HPAGE_SIZE(i))
+ ret = i;
+ else
+ break;
+ }
+
+ return ret;
+}
+
+static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+{
+ struct kvm_memory_slot *slot;
+ slot = gfn_to_memslot(vcpu->kvm, large_gfn);
+ if (slot && slot->dirty_bitmap)
+ return true;
+ return false;
+}
+
+static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+{
+ int host_level, level, max_level;
+
+ host_level = host_mapping_level(vcpu->kvm, large_gfn);
+
+ if (host_level == PT_PAGE_TABLE_LEVEL)
+ return host_level;
+
+ max_level = kvm_x86_ops->get_lpage_level() < host_level ?
+ kvm_x86_ops->get_lpage_level() : host_level;
+
+ for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
+ if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
+ break;
+
+ return level - 1;
+}
+
+/*
+ * Take gfn and return the reverse mapping to it.
+ */
+
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
+{
+ struct kvm_memory_slot *slot;
+ struct kvm_lpage_info *linfo;
+
+ slot = gfn_to_memslot(kvm, gfn);
+ if (likely(level == PT_PAGE_TABLE_LEVEL))
+ return &slot->rmap[gfn - slot->base_gfn];
+
+ linfo = lpage_info_slot(gfn, slot, level);
+
+ return &linfo->rmap_pde;
+}
+
+/*
+ * Reverse mapping data structures:
+ *
+ * If rmapp bit zero is zero, then rmapp point to the shadw page table entry
+ * that points to page_address(page).
+ *
+ * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
+ * containing more mappings.
+ *
+ * Returns the number of rmap entries before the spte was added or zero if
+ * the spte was not added.
+ *
+ */
+static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+{
+ struct kvm_mmu_page *sp;
+ struct kvm_rmap_desc *desc;
+ unsigned long *rmapp;
+ int i, count = 0;
+
+ if (!is_rmap_spte(*spte))
+ return count;
+ sp = page_header(__pa(spte));
+ kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
+ rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
+ if (!*rmapp) {
+ rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
+ *rmapp = (unsigned long)spte;
+ } else if (!(*rmapp & 1)) {
+ rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
+ desc = mmu_alloc_rmap_desc(vcpu);
+ desc->sptes[0] = (u64 *)*rmapp;
+ desc->sptes[1] = spte;
+ *rmapp = (unsigned long)desc | 1;
+ ++count;
+ } else {
+ rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
+ desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+ while (desc->sptes[RMAP_EXT-1] && desc->more) {
+ desc = desc->more;
+ count += RMAP_EXT;
+ }
+ if (desc->sptes[RMAP_EXT-1]) {
+ desc->more = mmu_alloc_rmap_desc(vcpu);
+ desc = desc->more;
+ }
+ for (i = 0; desc->sptes[i]; ++i)
+ ++count;
+ desc->sptes[i] = spte;
+ }
+ return count;
+}
+
+static void rmap_desc_remove_entry(unsigned long *rmapp,
+ struct kvm_rmap_desc *desc,
+ int i,
+ struct kvm_rmap_desc *prev_desc)
+{
+ int j;
+
+ for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
+ ;
+ desc->sptes[i] = desc->sptes[j];
+ desc->sptes[j] = NULL;
+ if (j != 0)
+ return;
+ if (!prev_desc && !desc->more)
+ *rmapp = (unsigned long)desc->sptes[0];
+ else
+ if (prev_desc)
+ prev_desc->more = desc->more;
+ else
+ *rmapp = (unsigned long)desc->more | 1;
+ mmu_free_rmap_desc(desc);
+}
+
+static void rmap_remove(struct kvm *kvm, u64 *spte)
+{
+ struct kvm_rmap_desc *desc;
+ struct kvm_rmap_desc *prev_desc;
+ struct kvm_mmu_page *sp;
+ gfn_t gfn;
+ unsigned long *rmapp;
+ int i;
+
+ sp = page_header(__pa(spte));
+ gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
+ rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
+ if (!*rmapp) {
+ printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
+ BUG();
+ } else if (!(*rmapp & 1)) {
+ rmap_printk("rmap_remove: %p 1->0\n", spte);
+ if ((u64 *)*rmapp != spte) {
+ printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte);
+ BUG();
+ }
+ *rmapp = 0;
+ } else {
+ rmap_printk("rmap_remove: %p many->many\n", spte);
+ desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+ prev_desc = NULL;
+ while (desc) {
+ for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
+ if (desc->sptes[i] == spte) {
+ rmap_desc_remove_entry(rmapp,
+ desc, i,
+ prev_desc);
+ return;
+ }
+ prev_desc = desc;
+ desc = desc->more;
+ }
+ pr_err("rmap_remove: %p many->many\n", spte);
+ BUG();
+ }
+}
+
+static int set_spte_track_bits(u64 *sptep, u64 new_spte)
+{
+ pfn_t pfn;
+ u64 old_spte = *sptep;
+
+ if (!spte_has_volatile_bits(old_spte))
+ __set_spte(sptep, new_spte);
+ else
+ old_spte = __xchg_spte(sptep, new_spte);
+
+ if (!is_rmap_spte(old_spte))
+ return 0;
+
+ pfn = spte_to_pfn(old_spte);
+ if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
+ kvm_set_pfn_accessed(pfn);
+ if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
+ kvm_set_pfn_dirty(pfn);
+ return 1;
+}
+
+static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
+{
+ if (set_spte_track_bits(sptep, new_spte))
+ rmap_remove(kvm, sptep);
+}
+
+static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
+{
+ struct kvm_rmap_desc *desc;
+ u64 *prev_spte;
+ int i;
+
+ if (!*rmapp)
+ return NULL;
+ else if (!(*rmapp & 1)) {
+ if (!spte)
+ return (u64 *)*rmapp;
+ return NULL;
+ }
+ desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+ prev_spte = NULL;
+ while (desc) {
+ for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
+ if (prev_spte == spte)
+ return desc->sptes[i];
+ prev_spte = desc->sptes[i];
+ }
+ desc = desc->more;
+ }
+ return NULL;
+}
+
+static int rmap_write_protect(struct kvm *kvm, u64 gfn)
+{
+ unsigned long *rmapp;
+ u64 *spte;
+ int i, write_protected = 0;
+
+ rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
+
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ BUG_ON(!spte);
+ BUG_ON(!(*spte & PT_PRESENT_MASK));
+ rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
+ if (is_writable_pte(*spte)) {
+ update_spte(spte, *spte & ~PT_WRITABLE_MASK);
+ write_protected = 1;
+ }
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+
+ /* check for huge page mappings */
+ for (i = PT_DIRECTORY_LEVEL;
+ i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ rmapp = gfn_to_rmap(kvm, gfn, i);
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ BUG_ON(!spte);
+ BUG_ON(!(*spte & PT_PRESENT_MASK));
+ BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
+ pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
+ if (is_writable_pte(*spte)) {
+ drop_spte(kvm, spte,
+ shadow_trap_nonpresent_pte);
+ --kvm->stat.lpages;
+ spte = NULL;
+ write_protected = 1;
+ }
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+ }
+
+ return write_protected;
+}
+
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data)
+{
+ u64 *spte;
+ int need_tlb_flush = 0;
+
+ while ((spte = rmap_next(kvm, rmapp, NULL))) {
+ BUG_ON(!(*spte & PT_PRESENT_MASK));
+ rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
+ drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+ need_tlb_flush = 1;
+ }
+ return need_tlb_flush;
+}
+
+static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data)
+{
+ int need_flush = 0;
+ u64 *spte, new_spte;
+ pte_t *ptep = (pte_t *)data;
+ pfn_t new_pfn;
+
+ WARN_ON(pte_huge(*ptep));
+ new_pfn = pte_pfn(*ptep);
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ BUG_ON(!is_shadow_present_pte(*spte));
+ rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
+ need_flush = 1;
+ if (pte_write(*ptep)) {
+ drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+ spte = rmap_next(kvm, rmapp, NULL);
+ } else {
+ new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
+ new_spte |= (u64)new_pfn << PAGE_SHIFT;
+
+ new_spte &= ~PT_WRITABLE_MASK;
+ new_spte &= ~SPTE_HOST_WRITEABLE;
+ new_spte &= ~shadow_accessed_mask;
+ set_spte_track_bits(spte, new_spte);
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+ }
+ if (need_flush)
+ kvm_flush_remote_tlbs(kvm);
+
+ return 0;
+}
+
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+ unsigned long data,
+ int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data))
+{
+ int i, j;
+ int ret;
+ int retval = 0;
+ struct kvm_memslots *slots;
+
+ slots = kvm_memslots(kvm);
+
+ for (i = 0; i < slots->nmemslots; i++) {
+ struct kvm_memory_slot *memslot = &slots->memslots[i];
+ unsigned long start = memslot->userspace_addr;
+ unsigned long end;
+
+ end = start + (memslot->npages << PAGE_SHIFT);
+ if (hva >= start && hva < end) {
+ gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+ gfn_t gfn = memslot->base_gfn + gfn_offset;
+
+ ret = handler(kvm, &memslot->rmap[gfn_offset], data);
+
+ for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
+ struct kvm_lpage_info *linfo;
+
+ linfo = lpage_info_slot(gfn, memslot,
+ PT_DIRECTORY_LEVEL + j);
+ ret |= handler(kvm, &linfo->rmap_pde, data);
+ }
+ trace_kvm_age_page(hva, memslot, ret);
+ retval |= ret;
+ }
+ }
+
+ return retval;
+}
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+ return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+ kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data)
+{
+ u64 *spte;
+ int young = 0;
+
+ /*
+ * Emulate the accessed bit for EPT, by checking if this page has
+ * an EPT mapping, and clearing it if it does. On the next access,
+ * a new EPT mapping will be established.
+ * This has some overhead, but not as much as the cost of swapping
+ * out actively used pages or breaking up actively used hugepages.
+ */
+ if (!shadow_accessed_mask)
+ return kvm_unmap_rmapp(kvm, rmapp, data);
+
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ int _young;
+ u64 _spte = *spte;
+ BUG_ON(!(_spte & PT_PRESENT_MASK));
+ _young = _spte & PT_ACCESSED_MASK;
+ if (_young) {
+ young = 1;
+ clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
+ }
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+ return young;
+}
+
+static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data)
+{
+ u64 *spte;
+ int young = 0;
+
+ /*
+ * If there's no access bit in the secondary pte set by the
+ * hardware it's up to gup-fast/gup to set the access bit in
+ * the primary pte or in the page structure.
+ */
+ if (!shadow_accessed_mask)
+ goto out;
+
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ u64 _spte = *spte;
+ BUG_ON(!(_spte & PT_PRESENT_MASK));
+ young = _spte & PT_ACCESSED_MASK;
+ if (young) {
+ young = 1;
+ break;
+ }
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+out:
+ return young;
+}
+
+#define RMAP_RECYCLE_THRESHOLD 1000
+
+static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+{
+ unsigned long *rmapp;
+ struct kvm_mmu_page *sp;
+
+ sp = page_header(__pa(spte));
+
+ rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
+
+ kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+ return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+ return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
+}
+
+#ifdef MMU_DEBUG
+static int is_empty_shadow_page(u64 *spt)
+{
+ u64 *pos;
+ u64 *end;
+
+ for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
+ if (is_shadow_present_pte(*pos)) {
+ printk(KERN_ERR "%s: %p %llx\n", __func__,
+ pos, *pos);
+ return 0;
+ }
+ return 1;
+}
+#endif
+
+/*
+ * This value is the sum of all of the kvm instances's
+ * kvm->arch.n_used_mmu_pages values. We need a global,
+ * aggregate version in order to make the slab shrinker
+ * faster
+ */
+static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
+{
+ kvm->arch.n_used_mmu_pages += nr;
+ percpu_counter_add(&kvm_total_used_mmu_pages, nr);
+}
+
+static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ ASSERT(is_empty_shadow_page(sp->spt));
+ hlist_del(&sp->hash_link);
+ list_del(&sp->link);
+ __free_page(virt_to_page(sp->spt));
+ if (!sp->role.direct)
+ __free_page(virt_to_page(sp->gfns));
+ kmem_cache_free(mmu_page_header_cache, sp);
+ kvm_mod_used_mmu_pages(kvm, -1);
+}
+
+static unsigned kvm_page_table_hashfn(gfn_t gfn)
+{
+ return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
+}
+
+static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
+ u64 *parent_pte, int direct)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
+ sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+ if (!direct)
+ sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
+ PAGE_SIZE);
+ set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+ list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
+ bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
+ sp->multimapped = 0;
+ sp->parent_pte = parent_pte;
+ kvm_mod_used_mmu_pages(vcpu->kvm, +1);
+ return sp;
+}
+
+static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *parent_pte)
+{
+ struct kvm_pte_chain *pte_chain;
+ struct hlist_node *node;
+ int i;
+
+ if (!parent_pte)
+ return;
+ if (!sp->multimapped) {
+ u64 *old = sp->parent_pte;
+
+ if (!old) {
+ sp->parent_pte = parent_pte;
+ return;
+ }
+ sp->multimapped = 1;
+ pte_chain = mmu_alloc_pte_chain(vcpu);
+ INIT_HLIST_HEAD(&sp->parent_ptes);
+ hlist_add_head(&pte_chain->link, &sp->parent_ptes);
+ pte_chain->parent_ptes[0] = old;
+ }
+ hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
+ if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
+ continue;
+ for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
+ if (!pte_chain->parent_ptes[i]) {
+ pte_chain->parent_ptes[i] = parent_pte;
+ return;
+ }
+ }
+ pte_chain = mmu_alloc_pte_chain(vcpu);
+ BUG_ON(!pte_chain);
+ hlist_add_head(&pte_chain->link, &sp->parent_ptes);
+ pte_chain->parent_ptes[0] = parent_pte;
+}
+
+static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
+ u64 *parent_pte)
+{
+ struct kvm_pte_chain *pte_chain;
+ struct hlist_node *node;
+ int i;
+
+ if (!sp->multimapped) {
+ BUG_ON(sp->parent_pte != parent_pte);
+ sp->parent_pte = NULL;
+ return;
+ }
+ hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+ for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+ if (!pte_chain->parent_ptes[i])
+ break;
+ if (pte_chain->parent_ptes[i] != parent_pte)
+ continue;
+ while (i + 1 < NR_PTE_CHAIN_ENTRIES
+ && pte_chain->parent_ptes[i + 1]) {
+ pte_chain->parent_ptes[i]
+ = pte_chain->parent_ptes[i + 1];
+ ++i;
+ }
+ pte_chain->parent_ptes[i] = NULL;
+ if (i == 0) {
+ hlist_del(&pte_chain->link);
+ mmu_free_pte_chain(pte_chain);
+ if (hlist_empty(&sp->parent_ptes)) {
+ sp->multimapped = 0;
+ sp->parent_pte = NULL;
+ }
+ }
+ return;
+ }
+ BUG();
+}
+
+static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
+{
+ struct kvm_pte_chain *pte_chain;
+ struct hlist_node *node;
+ struct kvm_mmu_page *parent_sp;
+ int i;
+
+ if (!sp->multimapped && sp->parent_pte) {
+ parent_sp = page_header(__pa(sp->parent_pte));
+ fn(parent_sp, sp->parent_pte);
+ return;
+ }
+
+ hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+ for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+ u64 *spte = pte_chain->parent_ptes[i];
+
+ if (!spte)
+ break;
+ parent_sp = page_header(__pa(spte));
+ fn(parent_sp, spte);
+ }
+}
+
+static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
+static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
+{
+ mmu_parent_walk(sp, mark_unsync);
+}
+
+static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
+{
+ unsigned int index;
+
+ index = spte - sp->spt;
+ if (__test_and_set_bit(index, sp->unsync_child_bitmap))
+ return;
+ if (sp->unsync_children++)
+ return;
+ kvm_mmu_mark_parents_unsync(sp);
+}
+
+static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ int i;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+ sp->spt[i] = shadow_trap_nonpresent_pte;
+}
+
+static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ return 1;
+}
+
+static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+}
+
+#define KVM_PAGE_ARRAY_NR 16
+
+struct kvm_mmu_pages {
+ struct mmu_page_and_offset {
+ struct kvm_mmu_page *sp;
+ unsigned int idx;
+ } page[KVM_PAGE_ARRAY_NR];
+ unsigned int nr;
+};
+
+#define for_each_unsync_children(bitmap, idx) \
+ for (idx = find_first_bit(bitmap, 512); \
+ idx < 512; \
+ idx = find_next_bit(bitmap, 512, idx+1))
+
+static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
+ int idx)
+{
+ int i;
+
+ if (sp->unsync)
+ for (i=0; i < pvec->nr; i++)
+ if (pvec->page[i].sp == sp)
+ return 0;
+
+ pvec->page[pvec->nr].sp = sp;
+ pvec->page[pvec->nr].idx = idx;
+ pvec->nr++;
+ return (pvec->nr == KVM_PAGE_ARRAY_NR);
+}
+
+static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
+ struct kvm_mmu_pages *pvec)
+{
+ int i, ret, nr_unsync_leaf = 0;
+
+ for_each_unsync_children(sp->unsync_child_bitmap, i) {
+ struct kvm_mmu_page *child;
+ u64 ent = sp->spt[i];
+
+ if (!is_shadow_present_pte(ent) || is_large_pte(ent))
+ goto clear_child_bitmap;
+
+ child = page_header(ent & PT64_BASE_ADDR_MASK);
+
+ if (child->unsync_children) {
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
+
+ ret = __mmu_unsync_walk(child, pvec);
+ if (!ret)
+ goto clear_child_bitmap;
+ else if (ret > 0)
+ nr_unsync_leaf += ret;
+ else
+ return ret;
+ } else if (child->unsync) {
+ nr_unsync_leaf++;
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
+ } else
+ goto clear_child_bitmap;
+
+ continue;
+
+clear_child_bitmap:
+ __clear_bit(i, sp->unsync_child_bitmap);
+ sp->unsync_children--;
+ WARN_ON((int)sp->unsync_children < 0);
+ }
+
+
+ return nr_unsync_leaf;
+}
+
+static int mmu_unsync_walk(struct kvm_mmu_page *sp,
+ struct kvm_mmu_pages *pvec)
+{
+ if (!sp->unsync_children)
+ return 0;
+
+ mmu_pages_add(pvec, sp, 0);
+ return __mmu_unsync_walk(sp, pvec);
+}
+
+static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ WARN_ON(!sp->unsync);
+ trace_kvm_mmu_sync_page(sp);
+ sp->unsync = 0;
+ --kvm->stat.mmu_unsync;
+}
+
+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list);
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+ struct list_head *invalid_list);
+
+#define for_each_gfn_sp(kvm, sp, gfn, pos) \
+ hlist_for_each_entry(sp, pos, \
+ &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
+ if ((sp)->gfn != (gfn)) {} else
+
+#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \
+ hlist_for_each_entry(sp, pos, \
+ &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
+ if ((sp)->gfn != (gfn) || (sp)->role.direct || \
+ (sp)->role.invalid) {} else
+
+/* @sp->gfn should be write-protected at the call site */
+static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list, bool clear_unsync)
+{
+ if (sp->role.cr4_pae != !!is_pae(vcpu)) {
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
+ return 1;
+ }
+
+ if (clear_unsync)
+ kvm_unlink_unsync_page(vcpu->kvm, sp);
+
+ if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
+ return 1;
+ }
+
+ kvm_mmu_flush_tlb(vcpu);
+ return 0;
+}
+
+static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ LIST_HEAD(invalid_list);
+ int ret;
+
+ ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
+ if (ret)
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+
+ return ret;
+}
+
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
+{
+ return __kvm_sync_page(vcpu, sp, invalid_list, true);
+}
+
+/* @gfn should be write-protected at the call site */
+static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ struct kvm_mmu_page *s;
+ struct hlist_node *node;
+ LIST_HEAD(invalid_list);
+ bool flush = false;
+
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+ if (!s->unsync)
+ continue;
+
+ WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+ kvm_unlink_unsync_page(vcpu->kvm, s);
+ if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
+ (vcpu->arch.mmu.sync_page(vcpu, s))) {
+ kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
+ continue;
+ }
+ flush = true;
+ }
+
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ if (flush)
+ kvm_mmu_flush_tlb(vcpu);
+}
+
+struct mmu_page_path {
+ struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
+ unsigned int idx[PT64_ROOT_LEVEL-1];
+};
+
+#define for_each_sp(pvec, sp, parents, i) \
+ for (i = mmu_pages_next(&pvec, &parents, -1), \
+ sp = pvec.page[i].sp; \
+ i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
+ i = mmu_pages_next(&pvec, &parents, i))
+
+static int mmu_pages_next(struct kvm_mmu_pages *pvec,
+ struct mmu_page_path *parents,
+ int i)
+{
+ int n;
+
+ for (n = i+1; n < pvec->nr; n++) {
+ struct kvm_mmu_page *sp = pvec->page[n].sp;
+
+ if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
+ parents->idx[0] = pvec->page[n].idx;
+ return n;
+ }
+
+ parents->parent[sp->role.level-2] = sp;
+ parents->idx[sp->role.level-1] = pvec->page[n].idx;
+ }
+
+ return n;
+}
+
+static void mmu_pages_clear_parents(struct mmu_page_path *parents)
+{
+ struct kvm_mmu_page *sp;
+ unsigned int level = 0;
+
+ do {
+ unsigned int idx = parents->idx[level];
+
+ sp = parents->parent[level];
+ if (!sp)
+ return;
+
+ --sp->unsync_children;
+ WARN_ON((int)sp->unsync_children < 0);
+ __clear_bit(idx, sp->unsync_child_bitmap);
+ level++;
+ } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
+}
+
+static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
+ struct mmu_page_path *parents,
+ struct kvm_mmu_pages *pvec)
+{
+ parents->parent[parent->role.level-1] = NULL;
+ pvec->nr = 0;
+}
+
+static void mmu_sync_children(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *parent)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+ struct mmu_page_path parents;
+ struct kvm_mmu_pages pages;
+ LIST_HEAD(invalid_list);
+
+ kvm_mmu_pages_init(parent, &parents, &pages);
+ while (mmu_unsync_walk(parent, &pages)) {
+ int protected = 0;
+
+ for_each_sp(pages, sp, parents, i)
+ protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
+
+ if (protected)
+ kvm_flush_remote_tlbs(vcpu->kvm);
+
+ for_each_sp(pages, sp, parents, i) {
+ kvm_sync_page(vcpu, sp, &invalid_list);
+ mmu_pages_clear_parents(&parents);
+ }
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ cond_resched_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_pages_init(parent, &parents, &pages);
+ }
+}
+
+static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
+ gfn_t gfn,
+ gva_t gaddr,
+ unsigned level,
+ int direct,
+ unsigned access,
+ u64 *parent_pte)
+{
+ union kvm_mmu_page_role role;
+ unsigned quadrant;
+ struct kvm_mmu_page *sp;
+ struct hlist_node *node;
+ bool need_sync = false;
+
+ role = vcpu->arch.mmu.base_role;
+ role.level = level;
+ role.direct = direct;
+ if (role.direct)
+ role.cr4_pae = 0;
+ role.access = access;
+ if (!vcpu->arch.mmu.direct_map
+ && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+ quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
+ quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
+ role.quadrant = quadrant;
+ }
+ for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
+ if (!need_sync && sp->unsync)
+ need_sync = true;
+
+ if (sp->role.word != role.word)
+ continue;
+
+ if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
+ break;
+
+ mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+ if (sp->unsync_children) {
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ kvm_mmu_mark_parents_unsync(sp);
+ } else if (sp->unsync)
+ kvm_mmu_mark_parents_unsync(sp);
+
+ trace_kvm_mmu_get_page(sp, false);
+ return sp;
+ }
+ ++vcpu->kvm->stat.mmu_cache_miss;
+ sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
+ if (!sp)
+ return sp;
+ sp->gfn = gfn;
+ sp->role = role;
+ hlist_add_head(&sp->hash_link,
+ &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
+ if (!direct) {
+ if (rmap_write_protect(vcpu->kvm, gfn))
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ if (level > PT_PAGE_TABLE_LEVEL && need_sync)
+ kvm_sync_pages(vcpu, gfn);
+
+ account_shadowed(vcpu->kvm, gfn);
+ }
+ if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
+ vcpu->arch.mmu.prefetch_page(vcpu, sp);
+ else
+ nonpaging_prefetch_page(vcpu, sp);
+ trace_kvm_mmu_get_page(sp, true);
+ return sp;
+}
+
+static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
+ struct kvm_vcpu *vcpu, u64 addr)
+{
+ iterator->addr = addr;
+ iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
+ iterator->level = vcpu->arch.mmu.shadow_root_level;
+
+ if (iterator->level == PT64_ROOT_LEVEL &&
+ vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
+ !vcpu->arch.mmu.direct_map)
+ --iterator->level;
+
+ if (iterator->level == PT32E_ROOT_LEVEL) {
+ iterator->shadow_addr
+ = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+ iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
+ --iterator->level;
+ if (!iterator->shadow_addr)
+ iterator->level = 0;
+ }
+}
+
+static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
+{
+ if (iterator->level < PT_PAGE_TABLE_LEVEL)
+ return false;
+
+ if (iterator->level == PT_PAGE_TABLE_LEVEL)
+ if (is_large_pte(*iterator->sptep))
+ return false;
+
+ iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
+ iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
+ return true;
+}
+
+static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+{
+ iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
+ --iterator->level;
+}
+
+static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
+{
+ u64 spte;
+
+ spte = __pa(sp->spt)
+ | PT_PRESENT_MASK | PT_ACCESSED_MASK
+ | PT_WRITABLE_MASK | PT_USER_MASK;
+ __set_spte(sptep, spte);
+}
+
+static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+ if (is_large_pte(*sptep)) {
+ drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ }
+}
+
+static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
+ unsigned direct_access)
+{
+ if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
+ struct kvm_mmu_page *child;
+
+ /*
+ * For the direct sp, if the guest pte's dirty bit
+ * changed form clean to dirty, it will corrupt the
+ * sp's access: allow writable in the read-only sp,
+ * so we should update the spte at this point to get
+ * a new sp with the correct access.
+ */
+ child = page_header(*sptep & PT64_BASE_ADDR_MASK);
+ if (child->role.access == direct_access)
+ return;
+
+ mmu_page_remove_parent_pte(child, sptep);
+ __set_spte(sptep, shadow_trap_nonpresent_pte);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ }
+}
+
+static void kvm_mmu_page_unlink_children(struct kvm *kvm,
+ struct kvm_mmu_page *sp)
+{
+ unsigned i;
+ u64 *pt;
+ u64 ent;
+
+ pt = sp->spt;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ ent = pt[i];
+
+ if (is_shadow_present_pte(ent)) {
+ if (!is_last_spte(ent, sp->role.level)) {
+ ent &= PT64_BASE_ADDR_MASK;
+ mmu_page_remove_parent_pte(page_header(ent),
+ &pt[i]);
+ } else {
+ if (is_large_pte(ent))
+ --kvm->stat.lpages;
+ drop_spte(kvm, &pt[i],
+ shadow_trap_nonpresent_pte);
+ }
+ }
+ pt[i] = shadow_trap_nonpresent_pte;
+ }
+}
+
+static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
+{
+ mmu_page_remove_parent_pte(sp, parent_pte);
+}
+
+static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
+{
+ int i;
+ struct kvm_vcpu *vcpu;
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ vcpu->arch.last_pte_updated = NULL;
+}
+
+static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ u64 *parent_pte;
+
+ while (sp->multimapped || sp->parent_pte) {
+ if (!sp->multimapped)
+ parent_pte = sp->parent_pte;
+ else {
+ struct kvm_pte_chain *chain;
+
+ chain = container_of(sp->parent_ptes.first,
+ struct kvm_pte_chain, link);
+ parent_pte = chain->parent_ptes[0];
+ }
+ BUG_ON(!parent_pte);
+ kvm_mmu_put_page(sp, parent_pte);
+ __set_spte(parent_pte, shadow_trap_nonpresent_pte);
+ }
+}
+
+static int mmu_zap_unsync_children(struct kvm *kvm,
+ struct kvm_mmu_page *parent,
+ struct list_head *invalid_list)
+{
+ int i, zapped = 0;
+ struct mmu_page_path parents;
+ struct kvm_mmu_pages pages;
+
+ if (parent->role.level == PT_PAGE_TABLE_LEVEL)
+ return 0;
+
+ kvm_mmu_pages_init(parent, &parents, &pages);
+ while (mmu_unsync_walk(parent, &pages)) {
+ struct kvm_mmu_page *sp;
+
+ for_each_sp(pages, sp, parents, i) {
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+ mmu_pages_clear_parents(&parents);
+ zapped++;
+ }
+ kvm_mmu_pages_init(parent, &parents, &pages);
+ }
+
+ return zapped;
+}
+
+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
+{
+ int ret;
+
+ trace_kvm_mmu_prepare_zap_page(sp);
+ ++kvm->stat.mmu_shadow_zapped;
+ ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
+ kvm_mmu_page_unlink_children(kvm, sp);
+ kvm_mmu_unlink_parents(kvm, sp);
+ if (!sp->role.invalid && !sp->role.direct)
+ unaccount_shadowed(kvm, sp->gfn);
+ if (sp->unsync)
+ kvm_unlink_unsync_page(kvm, sp);
+ if (!sp->root_count) {
+ /* Count self */
+ ret++;
+ list_move(&sp->link, invalid_list);
+ } else {
+ list_move(&sp->link, &kvm->arch.active_mmu_pages);
+ kvm_reload_remote_mmus(kvm);
+ }
+
+ sp->role.invalid = 1;
+ kvm_mmu_reset_last_pte_updated(kvm);
+ return ret;
+}
+
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+ struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *sp;
+
+ if (list_empty(invalid_list))
+ return;
+
+ kvm_flush_remote_tlbs(kvm);
+
+ do {
+ sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+ WARN_ON(!sp->role.invalid || sp->root_count);
+ kvm_mmu_free_page(kvm, sp);
+ } while (!list_empty(invalid_list));
+
+}
+
+/*
+ * Changing the number of mmu pages allocated to the vm
+ * Note: if goal_nr_mmu_pages is too small, you will get dead lock
+ */
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
+{
+ LIST_HEAD(invalid_list);
+ /*
+ * If we set the number of mmu pages to be smaller be than the
+ * number of actived pages , we must to free some mmu pages before we
+ * change the value
+ */
+
+ if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
+ while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
+ !list_empty(&kvm->arch.active_mmu_pages)) {
+ struct kvm_mmu_page *page;
+
+ page = container_of(kvm->arch.active_mmu_pages.prev,
+ struct kvm_mmu_page, link);
+ kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ }
+ goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
+ }
+
+ kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
+}
+
+static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_mmu_page *sp;
+ struct hlist_node *node;
+ LIST_HEAD(invalid_list);
+ int r;
+
+ pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
+ r = 0;
+
+ for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
+ pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
+ sp->role.word);
+ r = 1;
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ }
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ return r;
+}
+
+static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_mmu_page *sp;
+ struct hlist_node *node;
+ LIST_HEAD(invalid_list);
+
+ for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
+ pgprintk("%s: zap %llx %x\n",
+ __func__, gfn, sp->role.word);
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ }
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+}
+
+static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
+{
+ int slot = memslot_id(kvm, gfn);
+ struct kvm_mmu_page *sp = page_header(__pa(pte));
+
+ __set_bit(slot, sp->slot_bitmap);
+}
+
+static void mmu_convert_notrap(struct kvm_mmu_page *sp)
+{
+ int i;
+ u64 *pt = sp->spt;
+
+ if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
+ return;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ if (pt[i] == shadow_notrap_nonpresent_pte)
+ __set_spte(&pt[i], shadow_trap_nonpresent_pte);
+ }
+}
+
+/*
+ * The function is based on mtrr_type_lookup() in
+ * arch/x86/kernel/cpu/mtrr/generic.c
+ */
+static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
+ u64 start, u64 end)
+{
+ int i;
+ u64 base, mask;
+ u8 prev_match, curr_match;
+ int num_var_ranges = KVM_NR_VAR_MTRR;
+
+ if (!mtrr_state->enabled)
+ return 0xFF;
+
+ /* Make end inclusive end, instead of exclusive */
+ end--;
+
+ /* Look in fixed ranges. Just return the type as per start */
+ if (mtrr_state->have_fixed && (start < 0x100000)) {
+ int idx;
+
+ if (start < 0x80000) {
+ idx = 0;
+ idx += (start >> 16);
+ return mtrr_state->fixed_ranges[idx];
+ } else if (start < 0xC0000) {
+ idx = 1 * 8;
+ idx += ((start - 0x80000) >> 14);
+ return mtrr_state->fixed_ranges[idx];
+ } else if (start < 0x1000000) {
+ idx = 3 * 8;
+ idx += ((start - 0xC0000) >> 12);
+ return mtrr_state->fixed_ranges[idx];
+ }
+ }
+
+ /*
+ * Look in variable ranges
+ * Look of multiple ranges matching this address and pick type
+ * as per MTRR precedence
+ */
+ if (!(mtrr_state->enabled & 2))
+ return mtrr_state->def_type;
+
+ prev_match = 0xFF;
+ for (i = 0; i < num_var_ranges; ++i) {
+ unsigned short start_state, end_state;
+
+ if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
+ continue;
+
+ base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
+ (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
+ mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
+ (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
+
+ start_state = ((start & mask) == (base & mask));
+ end_state = ((end & mask) == (base & mask));
+ if (start_state != end_state)
+ return 0xFE;
+
+ if ((start & mask) != (base & mask))
+ continue;
+
+ curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
+ if (prev_match == 0xFF) {
+ prev_match = curr_match;
+ continue;
+ }
+
+ if (prev_match == MTRR_TYPE_UNCACHABLE ||
+ curr_match == MTRR_TYPE_UNCACHABLE)
+ return MTRR_TYPE_UNCACHABLE;
+
+ if ((prev_match == MTRR_TYPE_WRBACK &&
+ curr_match == MTRR_TYPE_WRTHROUGH) ||
+ (prev_match == MTRR_TYPE_WRTHROUGH &&
+ curr_match == MTRR_TYPE_WRBACK)) {
+ prev_match = MTRR_TYPE_WRTHROUGH;
+ curr_match = MTRR_TYPE_WRTHROUGH;
+ }
+
+ if (prev_match != curr_match)
+ return MTRR_TYPE_UNCACHABLE;
+ }
+
+ if (prev_match != 0xFF)
+ return prev_match;
+
+ return mtrr_state->def_type;
+}
+
+u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ u8 mtrr;
+
+ mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
+ (gfn << PAGE_SHIFT) + PAGE_SIZE);
+ if (mtrr == 0xfe || mtrr == 0xff)
+ mtrr = MTRR_TYPE_WRBACK;
+ return mtrr;
+}
+EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
+
+static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ trace_kvm_mmu_unsync_page(sp);
+ ++vcpu->kvm->stat.mmu_unsync;
+ sp->unsync = 1;
+
+ kvm_mmu_mark_parents_unsync(sp);
+ mmu_convert_notrap(sp);
+}
+
+static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ struct kvm_mmu_page *s;
+ struct hlist_node *node;
+
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+ if (s->unsync)
+ continue;
+ WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+ __kvm_unsync_page(vcpu, s);
+ }
+}
+
+static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+ bool can_unsync)
+{
+ struct kvm_mmu_page *s;
+ struct hlist_node *node;
+ bool need_unsync = false;
+
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+ if (!can_unsync)
+ return 1;
+
+ if (s->role.level != PT_PAGE_TABLE_LEVEL)
+ return 1;
+
+ if (!need_unsync && !s->unsync) {
+ if (!oos_shadow)
+ return 1;
+ need_unsync = true;
+ }
+ }
+ if (need_unsync)
+ kvm_unsync_pages(vcpu, gfn);
+ return 0;
+}
+
+static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
+ unsigned pte_access, int user_fault,
+ int write_fault, int dirty, int level,
+ gfn_t gfn, pfn_t pfn, bool speculative,
+ bool can_unsync, bool host_writable)
+{
+ u64 spte, entry = *sptep;
+ int ret = 0;
+
+ /*
+ * We don't set the accessed bit, since we sometimes want to see
+ * whether the guest actually used the pte (in order to detect
+ * demand paging).
+ */
+ spte = PT_PRESENT_MASK;
+ if (!speculative)
+ spte |= shadow_accessed_mask;
+ if (!dirty)
+ pte_access &= ~ACC_WRITE_MASK;
+ if (pte_access & ACC_EXEC_MASK)
+ spte |= shadow_x_mask;
+ else
+ spte |= shadow_nx_mask;
+ if (pte_access & ACC_USER_MASK)
+ spte |= shadow_user_mask;
+ if (level > PT_PAGE_TABLE_LEVEL)
+ spte |= PT_PAGE_SIZE_MASK;
+ if (tdp_enabled)
+ spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
+ kvm_is_mmio_pfn(pfn));
+
+ if (host_writable)
+ spte |= SPTE_HOST_WRITEABLE;
+ else
+ pte_access &= ~ACC_WRITE_MASK;
+
+ spte |= (u64)pfn << PAGE_SHIFT;
+
+ if ((pte_access & ACC_WRITE_MASK)
+ || (!vcpu->arch.mmu.direct_map && write_fault
+ && !is_write_protection(vcpu) && !user_fault)) {
+
+ if (level > PT_PAGE_TABLE_LEVEL &&
+ has_wrprotected_page(vcpu->kvm, gfn, level)) {
+ ret = 1;
+ drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ goto done;
+ }
+
+ spte |= PT_WRITABLE_MASK;
+
+ if (!vcpu->arch.mmu.direct_map
+ && !(pte_access & ACC_WRITE_MASK))
+ spte &= ~PT_USER_MASK;
+
+ /*
+ * Optimization: for pte sync, if spte was writable the hash
+ * lookup is unnecessary (and expensive). Write protection
+ * is responsibility of mmu_get_page / kvm_sync_page.
+ * Same reasoning can be applied to dirty page accounting.
+ */
+ if (!can_unsync && is_writable_pte(*sptep))
+ goto set_pte;
+
+ if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
+ pgprintk("%s: found shadow page for %llx, marking ro\n",
+ __func__, gfn);
+ ret = 1;
+ pte_access &= ~ACC_WRITE_MASK;
+ if (is_writable_pte(spte))
+ spte &= ~PT_WRITABLE_MASK;
+ }
+ }
+
+ if (pte_access & ACC_WRITE_MASK)
+ mark_page_dirty(vcpu->kvm, gfn);
+
+set_pte:
+ update_spte(sptep, spte);
+ /*
+ * If we overwrite a writable spte with a read-only one we
+ * should flush remote TLBs. Otherwise rmap_write_protect
+ * will find a read-only spte, even though the writable spte
+ * might be cached on a CPU's TLB.
+ */
+ if (is_writable_pte(entry) && !is_writable_pte(*sptep))
+ kvm_flush_remote_tlbs(vcpu->kvm);
+done:
+ return ret;
+}
+
+static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
+ unsigned pt_access, unsigned pte_access,
+ int user_fault, int write_fault, int dirty,
+ int *ptwrite, int level, gfn_t gfn,
+ pfn_t pfn, bool speculative,
+ bool host_writable)
+{
+ int was_rmapped = 0;
+ int rmap_count;
+
+ pgprintk("%s: spte %llx access %x write_fault %d"
+ " user_fault %d gfn %llx\n",
+ __func__, *sptep, pt_access,
+ write_fault, user_fault, gfn);
+
+ if (is_rmap_spte(*sptep)) {
+ /*
+ * If we overwrite a PTE page pointer with a 2MB PMD, unlink
+ * the parent of the now unreachable PTE.
+ */
+ if (level > PT_PAGE_TABLE_LEVEL &&
+ !is_large_pte(*sptep)) {
+ struct kvm_mmu_page *child;
+ u64 pte = *sptep;
+
+ child = page_header(pte & PT64_BASE_ADDR_MASK);
+ mmu_page_remove_parent_pte(child, sptep);
+ __set_spte(sptep, shadow_trap_nonpresent_pte);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ } else if (pfn != spte_to_pfn(*sptep)) {
+ pgprintk("hfn old %llx new %llx\n",
+ spte_to_pfn(*sptep), pfn);
+ drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ } else
+ was_rmapped = 1;
+ }
+
+ if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
+ dirty, level, gfn, pfn, speculative, true,
+ host_writable)) {
+ if (write_fault)
+ *ptwrite = 1;
+ kvm_mmu_flush_tlb(vcpu);
+ }
+
+ pgprintk("%s: setting spte %llx\n", __func__, *sptep);
+ pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
+ is_large_pte(*sptep)? "2MB" : "4kB",
+ *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
+ *sptep, sptep);
+ if (!was_rmapped && is_large_pte(*sptep))
+ ++vcpu->kvm->stat.lpages;
+
+ page_header_update_slot(vcpu->kvm, sptep, gfn);
+ if (!was_rmapped) {
+ rmap_count = rmap_add(vcpu, sptep, gfn);
+ if (rmap_count > RMAP_RECYCLE_THRESHOLD)
+ rmap_recycle(vcpu, sptep, gfn);
+ }
+ kvm_release_pfn_clean(pfn);
+ if (speculative) {
+ vcpu->arch.last_pte_updated = sptep;
+ vcpu->arch.last_pte_gfn = gfn;
+ }
+}
+
+static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
+{
+}
+
+static struct kvm_memory_slot *
+pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = gfn_to_memslot(vcpu->kvm, gfn);
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
+ (no_dirty_log && slot->dirty_bitmap))
+ slot = NULL;
+
+ return slot;
+}
+
+static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
+ bool no_dirty_log)
+{
+ struct kvm_memory_slot *slot;
+ unsigned long hva;
+
+ slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log);
+ if (!slot) {
+ get_page(bad_page);
+ return page_to_pfn(bad_page);
+ }
+
+ hva = gfn_to_hva_memslot(slot, gfn);
+
+ return hva_to_pfn_atomic(vcpu->kvm, hva);
+}
+
+static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp,
+ u64 *start, u64 *end)
+{
+ struct page *pages[PTE_PREFETCH_NUM];
+ unsigned access = sp->role.access;
+ int i, ret;
+ gfn_t gfn;
+
+ gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
+ if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK))
+ return -1;
+
+ ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
+ if (ret <= 0)
+ return -1;
+
+ for (i = 0; i < ret; i++, gfn++, start++)
+ mmu_set_spte(vcpu, start, ACC_ALL,
+ access, 0, 0, 1, NULL,
+ sp->role.level, gfn,
+ page_to_pfn(pages[i]), true, true);
+
+ return 0;
+}
+
+static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *sptep)
+{
+ u64 *spte, *start = NULL;
+ int i;
+
+ WARN_ON(!sp->role.direct);
+
+ i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
+ spte = sp->spt + i;
+
+ for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+ if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
+ if (!start)
+ continue;
+ if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
+ break;
+ start = NULL;
+ } else if (!start)
+ start = spte;
+ }
+}
+
+static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+ struct kvm_mmu_page *sp;
+
+ /*
+ * Since it's no accessed bit on EPT, it's no way to
+ * distinguish between actually accessed translations
+ * and prefetched, so disable pte prefetch if EPT is
+ * enabled.
+ */
+ if (!shadow_accessed_mask)
+ return;
+
+ sp = page_header(__pa(sptep));
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return;
+
+ __direct_pte_prefetch(vcpu, sp, sptep);
+}
+
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
+ int map_writable, int level, gfn_t gfn, pfn_t pfn,
+ bool prefault)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ struct kvm_mmu_page *sp;
+ int pt_write = 0;
+ gfn_t pseudo_gfn;
+
+ for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
+ if (iterator.level == level) {
+ unsigned pte_access = ACC_ALL;
+
+ mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
+ 0, write, 1, &pt_write,
+ level, gfn, pfn, prefault, map_writable);
+ direct_pte_prefetch(vcpu, iterator.sptep);
+ ++vcpu->stat.pf_fixed;
+ break;
+ }
+
+ if (*iterator.sptep == shadow_trap_nonpresent_pte) {
+ u64 base_addr = iterator.addr;
+
+ base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
+ pseudo_gfn = base_addr >> PAGE_SHIFT;
+ sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
+ iterator.level - 1,
+ 1, ACC_ALL, iterator.sptep);
+ if (!sp) {
+ pgprintk("nonpaging_map: ENOMEM\n");
+ kvm_release_pfn_clean(pfn);
+ return -ENOMEM;
+ }
+
+ __set_spte(iterator.sptep,
+ __pa(sp->spt)
+ | PT_PRESENT_MASK | PT_WRITABLE_MASK
+ | shadow_user_mask | shadow_x_mask
+ | shadow_accessed_mask);
+ }
+ }
+ return pt_write;
+}
+
+static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
+{
+ kvm_siginfo_t info;
+
+ info.si_signo = SIGBUS;
+ info.si_errno = 0;
+ info.si_code = BUS_MCEERR_AR;
+ info.si_addr = (void *)address;
+ info.si_addr_lsb = PAGE_SHIFT;
+
+ send_sig_info(SIGBUS, (siginfo_t *)&info, tsk);
+}
+
+static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
+{
+ kvm_release_pfn_clean(pfn);
+ if (is_hwpoison_pfn(pfn)) {
+ kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
+ return 0;
+ } else if (is_fault_pfn(pfn))
+ return -EFAULT;
+
+ return 1;
+}
+
+static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
+ gfn_t *gfnp, pfn_t *pfnp, int *levelp)
+{
+ pfn_t pfn = *pfnp;
+ gfn_t gfn = *gfnp;
+ int level = *levelp;
+
+ /*
+ * Check if it's a transparent hugepage. If this would be an
+ * hugetlbfs page, level wouldn't be set to
+ * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
+ * here.
+ */
+ if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
+ level == PT_PAGE_TABLE_LEVEL &&
+ PageTransCompound(pfn_to_page(pfn)) &&
+ !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
+ unsigned long mask;
+ /*
+ * mmu_notifier_retry was successful and we hold the
+ * mmu_lock here, so the pmd can't become splitting
+ * from under us, and in turn
+ * __split_huge_page_refcount() can't run from under
+ * us and we can safely transfer the refcount from
+ * PG_tail to PG_head as we switch the pfn to tail to
+ * head.
+ */
+ *levelp = level = PT_DIRECTORY_LEVEL;
+ mask = KVM_PAGES_PER_HPAGE(level) - 1;
+ VM_BUG_ON((gfn & mask) != (pfn & mask));
+ if (pfn & mask) {
+ gfn &= ~mask;
+ *gfnp = gfn;
+ kvm_release_pfn_clean(pfn);
+ pfn &= ~mask;
+ if (!get_page_unless_zero(pfn_to_page(pfn)))
+ BUG();
+ *pfnp = pfn;
+ }
+ }
+}
+
+static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
+ gva_t gva, pfn_t *pfn, bool write, bool *writable);
+
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
+ bool prefault)
+{
+ int r;
+ int level;
+ int force_pt_level;
+ pfn_t pfn;
+ unsigned long mmu_seq;
+ bool map_writable;
+
+ force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+ if (likely(!force_pt_level)) {
+ level = mapping_level(vcpu, gfn);
+ /*
+ * This path builds a PAE pagetable - so we can map
+ * 2mb pages at maximum. Therefore check if the level
+ * is larger than that.
+ */
+ if (level > PT_DIRECTORY_LEVEL)
+ level = PT_DIRECTORY_LEVEL;
+
+ gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ } else
+ level = PT_PAGE_TABLE_LEVEL;
+
+ mmu_seq = vcpu->kvm->mmu_notifier_seq;
+ smp_rmb();
+
+ if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
+ return 0;
+
+ /* mmio */
+ if (is_error_pfn(pfn))
+ return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ if (mmu_notifier_retry(vcpu, mmu_seq))
+ goto out_unlock;
+ kvm_mmu_free_some_pages(vcpu);
+ if (likely(!force_pt_level))
+ transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
+ r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
+ prefault);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+
+
+ return r;
+
+out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ return 0;
+}
+
+
+static void mmu_free_roots(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+ LIST_HEAD(invalid_list);
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+ spin_lock(&vcpu->kvm->mmu_lock);
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
+ (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
+ vcpu->arch.mmu.direct_map)) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+
+ sp = page_header(root);
+ --sp->root_count;
+ if (!sp->root_count && sp->role.invalid) {
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ }
+ vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return;
+ }
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ if (root) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ --sp->root_count;
+ if (!sp->root_count && sp->role.invalid)
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+ &invalid_list);
+ }
+ vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+ }
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+}
+
+static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
+{
+ int ret = 0;
+
+ if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ ret = 1;
+ }
+
+ return ret;
+}
+
+static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *sp;
+ unsigned i;
+
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
+ sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
+ 1, ACC_ALL, NULL);
+ ++sp->root_count;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.mmu.root_hpa = __pa(sp->spt);
+ } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ ASSERT(!VALID_PAGE(root));
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
+ sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
+ i << 30,
+ PT32_ROOT_LEVEL, 1, ACC_ALL,
+ NULL);
+ root = __pa(sp->spt);
+ ++sp->root_count;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+ }
+ vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+ } else
+ BUG();
+
+ return 0;
+}
+
+static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *sp;
+ u64 pdptr, pm_mask;
+ gfn_t root_gfn;
+ int i;
+
+ root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
+
+ if (mmu_check_root(vcpu, root_gfn))
+ return 1;
+
+ /*
+ * Do we shadow a long mode page table? If so we need to
+ * write-protect the guests page table root.
+ */
+ if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+
+ ASSERT(!VALID_PAGE(root));
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
+ sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
+ 0, ACC_ALL, NULL);
+ root = __pa(sp->spt);
+ ++sp->root_count;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.mmu.root_hpa = root;
+ return 0;
+ }
+
+ /*
+ * We shadow a 32 bit page table. This may be a legacy 2-level
+ * or a PAE 3-level page table. In either case we need to be aware that
+ * the shadow page table may be a PAE or a long mode page table.
+ */
+ pm_mask = PT_PRESENT_MASK;
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
+ pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
+
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ ASSERT(!VALID_PAGE(root));
+ if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
+ pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i);
+ if (!is_present_gpte(pdptr)) {
+ vcpu->arch.mmu.pae_root[i] = 0;
+ continue;
+ }
+ root_gfn = pdptr >> PAGE_SHIFT;
+ if (mmu_check_root(vcpu, root_gfn))
+ return 1;
+ }
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
+ sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
+ PT32_ROOT_LEVEL, 0,
+ ACC_ALL, NULL);
+ root = __pa(sp->spt);
+ ++sp->root_count;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+
+ vcpu->arch.mmu.pae_root[i] = root | pm_mask;
+ }
+ vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+
+ /*
+ * If we shadow a 32 bit page table with a long mode page
+ * table we enter this path.
+ */
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ if (vcpu->arch.mmu.lm_root == NULL) {
+ /*
+ * The additional page necessary for this is only
+ * allocated on demand.
+ */
+
+ u64 *lm_root;
+
+ lm_root = (void*)get_zeroed_page(GFP_KERNEL);
+ if (lm_root == NULL)
+ return 1;
+
+ lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
+
+ vcpu->arch.mmu.lm_root = lm_root;
+ }
+
+ vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
+ }
+
+ return 0;
+}
+
+static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.mmu.direct_map)
+ return mmu_alloc_direct_roots(vcpu);
+ else
+ return mmu_alloc_shadow_roots(vcpu);
+}
+
+static void mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+
+ if (vcpu->arch.mmu.direct_map)
+ return;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+
+ trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+ if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+ sp = page_header(root);
+ mmu_sync_children(vcpu, sp);
+ trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+ return;
+ }
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ if (root && VALID_PAGE(root)) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ mmu_sync_children(vcpu, sp);
+ }
+ }
+ trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+}
+
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+ spin_lock(&vcpu->kvm->mmu_lock);
+ mmu_sync_roots(vcpu);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+}
+
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
+ u32 access, struct x86_exception *exception)
+{
+ if (exception)
+ exception->error_code = 0;
+ return vaddr;
+}
+
+static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
+ u32 access,
+ struct x86_exception *exception)
+{
+ if (exception)
+ exception->error_code = 0;
+ return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
+}
+
+static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
+ u32 error_code, bool prefault)
+{
+ gfn_t gfn;
+ int r;
+
+ pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
+
+ ASSERT(vcpu);
+ ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ gfn = gva >> PAGE_SHIFT;
+
+ return nonpaging_map(vcpu, gva & PAGE_MASK,
+ error_code & PFERR_WRITE_MASK, gfn, prefault);
+}
+
+static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
+{
+ struct kvm_arch_async_pf arch;
+
+ arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
+ arch.gfn = gfn;
+ arch.direct_map = vcpu->arch.mmu.direct_map;
+ arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
+
+ return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
+}
+
+static bool can_do_async_pf(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
+ kvm_event_needs_reinjection(vcpu)))
+ return false;
+
+ return kvm_x86_ops->interrupt_allowed(vcpu);
+}
+
+static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
+ gva_t gva, pfn_t *pfn, bool write, bool *writable)
+{
+ bool async;
+
+ *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
+
+ if (!async)
+ return false; /* *pfn has correct page already */
+
+ put_page(pfn_to_page(*pfn));
+
+ if (!prefault && can_do_async_pf(vcpu)) {
+ trace_kvm_try_async_get_page(gva, gfn);
+ if (kvm_find_async_pf_gfn(vcpu, gfn)) {
+ trace_kvm_async_pf_doublefault(gva, gfn);
+ kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+ return true;
+ } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
+ return true;
+ }
+
+ *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
+
+ return false;
+}
+
+static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
+ bool prefault)
+{
+ pfn_t pfn;
+ int r;
+ int level;
+ int force_pt_level;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ unsigned long mmu_seq;
+ int write = error_code & PFERR_WRITE_MASK;
+ bool map_writable;
+
+ ASSERT(vcpu);
+ ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
+
+ force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+ if (likely(!force_pt_level)) {
+ level = mapping_level(vcpu, gfn);
+ gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ } else
+ level = PT_PAGE_TABLE_LEVEL;
+
+ mmu_seq = vcpu->kvm->mmu_notifier_seq;
+ smp_rmb();
+
+ if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+ return 0;
+
+ /* mmio */
+ if (is_error_pfn(pfn))
+ return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
+ spin_lock(&vcpu->kvm->mmu_lock);
+ if (mmu_notifier_retry(vcpu, mmu_seq))
+ goto out_unlock;
+ kvm_mmu_free_some_pages(vcpu);
+ if (likely(!force_pt_level))
+ transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
+ r = __direct_map(vcpu, gpa, write, map_writable,
+ level, gfn, pfn, prefault);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+
+ return r;
+
+out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ return 0;
+}
+
+static void nonpaging_free(struct kvm_vcpu *vcpu)
+{
+ mmu_free_roots(vcpu);
+}
+
+static int nonpaging_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+ context->new_cr3 = nonpaging_new_cr3;
+ context->page_fault = nonpaging_page_fault;
+ context->gva_to_gpa = nonpaging_gva_to_gpa;
+ context->free = nonpaging_free;
+ context->prefetch_page = nonpaging_prefetch_page;
+ context->sync_page = nonpaging_sync_page;
+ context->invlpg = nonpaging_invlpg;
+ context->root_level = 0;
+ context->shadow_root_level = PT32E_ROOT_LEVEL;
+ context->root_hpa = INVALID_PAGE;
+ context->direct_map = true;
+ context->nx = false;
+ return 0;
+}
+
+void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.tlb_flush;
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+}
+
+static void paging_new_cr3(struct kvm_vcpu *vcpu)
+{
+ pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
+ mmu_free_roots(vcpu);
+}
+
+static unsigned long get_cr3(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr3(vcpu);
+}
+
+static void inject_page_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
+{
+ vcpu->arch.mmu.inject_page_fault(vcpu, fault);
+}
+
+static void paging_free(struct kvm_vcpu *vcpu)
+{
+ nonpaging_free(vcpu);
+}
+
+static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+ int bit7;
+
+ bit7 = (gpte >> 7) & 1;
+ return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
+}
+
+#define PTTYPE 64
+#include "paging_tmpl.h"
+#undef PTTYPE
+
+#define PTTYPE 32
+#include "paging_tmpl.h"
+#undef PTTYPE
+
+static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context,
+ int level)
+{
+ int maxphyaddr = cpuid_maxphyaddr(vcpu);
+ u64 exb_bit_rsvd = 0;
+
+ if (!context->nx)
+ exb_bit_rsvd = rsvd_bits(63, 63);
+ switch (level) {
+ case PT32_ROOT_LEVEL:
+ /* no rsvd bits for 2 level 4K page table entries */
+ context->rsvd_bits_mask[0][1] = 0;
+ context->rsvd_bits_mask[0][0] = 0;
+ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+
+ if (!is_pse(vcpu)) {
+ context->rsvd_bits_mask[1][1] = 0;
+ break;
+ }
+
+ if (is_cpuid_PSE36())
+ /* 36bits PSE 4MB page */
+ context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
+ else
+ /* 32 bits PSE 4MB page */
+ context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
+ break;
+ case PT32E_ROOT_LEVEL:
+ context->rsvd_bits_mask[0][2] =
+ rsvd_bits(maxphyaddr, 63) |
+ rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */
+ context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 62); /* PDE */
+ context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 62); /* PTE */
+ context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 62) |
+ rsvd_bits(13, 20); /* large page */
+ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+ break;
+ case PT64_ROOT_LEVEL:
+ context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+ context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+ context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51);
+ context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51);
+ context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
+ context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51) |
+ rsvd_bits(13, 29);
+ context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51) |
+ rsvd_bits(13, 20); /* large page */
+ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+ break;
+ }
+}
+
+static int paging64_init_context_common(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context,
+ int level)
+{
+ context->nx = is_nx(vcpu);
+
+ reset_rsvds_bits_mask(vcpu, context, level);
+
+ ASSERT(is_pae(vcpu));
+ context->new_cr3 = paging_new_cr3;
+ context->page_fault = paging64_page_fault;
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ context->prefetch_page = paging64_prefetch_page;
+ context->sync_page = paging64_sync_page;
+ context->invlpg = paging64_invlpg;
+ context->free = paging_free;
+ context->root_level = level;
+ context->shadow_root_level = level;
+ context->root_hpa = INVALID_PAGE;
+ context->direct_map = false;
+ return 0;
+}
+
+static int paging64_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+ return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
+}
+
+static int paging32_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+ context->nx = false;
+
+ reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
+
+ context->new_cr3 = paging_new_cr3;
+ context->page_fault = paging32_page_fault;
+ context->gva_to_gpa = paging32_gva_to_gpa;
+ context->free = paging_free;
+ context->prefetch_page = paging32_prefetch_page;
+ context->sync_page = paging32_sync_page;
+ context->invlpg = paging32_invlpg;
+ context->root_level = PT32_ROOT_LEVEL;
+ context->shadow_root_level = PT32E_ROOT_LEVEL;
+ context->root_hpa = INVALID_PAGE;
+ context->direct_map = false;
+ return 0;
+}
+
+static int paging32E_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+ return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
+}
+
+static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *context = vcpu->arch.walk_mmu;
+
+ context->base_role.word = 0;
+ context->new_cr3 = nonpaging_new_cr3;
+ context->page_fault = tdp_page_fault;
+ context->free = nonpaging_free;
+ context->prefetch_page = nonpaging_prefetch_page;
+ context->sync_page = nonpaging_sync_page;
+ context->invlpg = nonpaging_invlpg;
+ context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+ context->root_hpa = INVALID_PAGE;
+ context->direct_map = true;
+ context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
+ context->get_cr3 = get_cr3;
+ context->inject_page_fault = kvm_inject_page_fault;
+ context->nx = is_nx(vcpu);
+
+ if (!is_paging(vcpu)) {
+ context->nx = false;
+ context->gva_to_gpa = nonpaging_gva_to_gpa;
+ context->root_level = 0;
+ } else if (is_long_mode(vcpu)) {
+ context->nx = is_nx(vcpu);
+ reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ context->root_level = PT64_ROOT_LEVEL;
+ } else if (is_pae(vcpu)) {
+ context->nx = is_nx(vcpu);
+ reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ context->root_level = PT32E_ROOT_LEVEL;
+ } else {
+ context->nx = false;
+ reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
+ context->gva_to_gpa = paging32_gva_to_gpa;
+ context->root_level = PT32_ROOT_LEVEL;
+ }
+
+ return 0;
+}
+
+int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+{
+ int r;
+ ASSERT(vcpu);
+ ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ if (!is_paging(vcpu))
+ r = nonpaging_init_context(vcpu, context);
+ else if (is_long_mode(vcpu))
+ r = paging64_init_context(vcpu, context);
+ else if (is_pae(vcpu))
+ r = paging32E_init_context(vcpu, context);
+ else
+ r = paging32_init_context(vcpu, context);
+
+ vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
+ vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
+
+static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
+{
+ int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
+
+ vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3;
+ vcpu->arch.walk_mmu->get_cr3 = get_cr3;
+ vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
+
+ return r;
+}
+
+static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
+
+ g_context->get_cr3 = get_cr3;
+ g_context->inject_page_fault = kvm_inject_page_fault;
+
+ /*
+ * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
+ * translation of l2_gpa to l1_gpa addresses is done using the
+ * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
+ * functions between mmu and nested_mmu are swapped.
+ */
+ if (!is_paging(vcpu)) {
+ g_context->nx = false;
+ g_context->root_level = 0;
+ g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
+ } else if (is_long_mode(vcpu)) {
+ g_context->nx = is_nx(vcpu);
+ reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
+ g_context->root_level = PT64_ROOT_LEVEL;
+ g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
+ } else if (is_pae(vcpu)) {
+ g_context->nx = is_nx(vcpu);
+ reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
+ g_context->root_level = PT32E_ROOT_LEVEL;
+ g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
+ } else {
+ g_context->nx = false;
+ reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
+ g_context->root_level = PT32_ROOT_LEVEL;
+ g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
+ }
+
+ return 0;
+}
+
+static int init_kvm_mmu(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.update_pte.pfn = bad_pfn;
+
+ if (mmu_is_nested(vcpu))
+ return init_kvm_nested_mmu(vcpu);
+ else if (tdp_enabled)
+ return init_kvm_tdp_mmu(vcpu);
+ else
+ return init_kvm_softmmu(vcpu);
+}
+
+static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
+{
+ ASSERT(vcpu);
+ if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ /* mmu.free() should set root_hpa = INVALID_PAGE */
+ vcpu->arch.mmu.free(vcpu);
+}
+
+int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
+{
+ destroy_kvm_mmu(vcpu);
+ return init_kvm_mmu(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
+
+int kvm_mmu_load(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ goto out;
+ r = mmu_alloc_roots(vcpu);
+ spin_lock(&vcpu->kvm->mmu_lock);
+ mmu_sync_roots(vcpu);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ if (r)
+ goto out;
+ /* set_cr3() should ensure TLB has been flushed */
+ vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
+out:
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_load);
+
+void kvm_mmu_unload(struct kvm_vcpu *vcpu)
+{
+ mmu_free_roots(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_unload);
+
+static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp,
+ u64 *spte)
+{
+ u64 pte;
+ struct kvm_mmu_page *child;
+
+ pte = *spte;
+ if (is_shadow_present_pte(pte)) {
+ if (is_last_spte(pte, sp->role.level))
+ drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
+ else {
+ child = page_header(pte & PT64_BASE_ADDR_MASK);
+ mmu_page_remove_parent_pte(child, spte);
+ }
+ }
+ __set_spte(spte, shadow_trap_nonpresent_pte);
+ if (is_large_pte(pte))
+ --vcpu->kvm->stat.lpages;
+}
+
+static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp,
+ u64 *spte,
+ const void *new)
+{
+ if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+ ++vcpu->kvm->stat.mmu_pde_zapped;
+ return;
+ }
+
+ ++vcpu->kvm->stat.mmu_pte_updated;
+ if (!sp->role.cr4_pae)
+ paging32_update_pte(vcpu, sp, spte, new);
+ else
+ paging64_update_pte(vcpu, sp, spte, new);
+}
+
+static bool need_remote_flush(u64 old, u64 new)
+{
+ if (!is_shadow_present_pte(old))
+ return false;
+ if (!is_shadow_present_pte(new))
+ return true;
+ if ((old ^ new) & PT64_BASE_ADDR_MASK)
+ return true;
+ old ^= PT64_NX_MASK;
+ new ^= PT64_NX_MASK;
+ return (old & ~new & PT64_PERM_MASK) != 0;
+}
+
+static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
+ bool remote_flush, bool local_flush)
+{
+ if (zap_page)
+ return;
+
+ if (remote_flush)
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ else if (local_flush)
+ kvm_mmu_flush_tlb(vcpu);
+}
+
+static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
+{
+ u64 *spte = vcpu->arch.last_pte_updated;
+
+ return !!(spte && (*spte & shadow_accessed_mask));
+}
+
+static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+ u64 gpte)
+{
+ gfn_t gfn;
+ pfn_t pfn;
+
+ if (!is_present_gpte(gpte))
+ return;
+ gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+
+ vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
+ smp_rmb();
+ pfn = gfn_to_pfn(vcpu->kvm, gfn);
+
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ return;
+ }
+ vcpu->arch.update_pte.gfn = gfn;
+ vcpu->arch.update_pte.pfn = pfn;
+}
+
+static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ u64 *spte = vcpu->arch.last_pte_updated;
+
+ if (spte
+ && vcpu->arch.last_pte_gfn == gfn
+ && shadow_accessed_mask
+ && !(*spte & shadow_accessed_mask)
+ && is_shadow_present_pte(*spte))
+ set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
+}
+
+void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const u8 *new, int bytes,
+ bool guest_initiated)
+{
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ union kvm_mmu_page_role mask = { .word = 0 };
+ struct kvm_mmu_page *sp;
+ struct hlist_node *node;
+ LIST_HEAD(invalid_list);
+ u64 entry, gentry;
+ u64 *spte;
+ unsigned offset = offset_in_page(gpa);
+ unsigned pte_size;
+ unsigned page_offset;
+ unsigned misaligned;
+ unsigned quadrant;
+ int level;
+ int flooded = 0;
+ int npte;
+ int r;
+ int invlpg_counter;
+ bool remote_flush, local_flush, zap_page;
+
+ zap_page = remote_flush = local_flush = false;
+
+ pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
+
+ invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
+
+ /*
+ * Assume that the pte write on a page table of the same type
+ * as the current vcpu paging mode. This is nearly always true
+ * (might be false while changing modes). Note it is verified later
+ * by update_pte().
+ */
+ if ((is_pae(vcpu) && bytes == 4) || !new) {
+ /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
+ if (is_pae(vcpu)) {
+ gpa &= ~(gpa_t)7;
+ bytes = 8;
+ }
+ r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
+ if (r)
+ gentry = 0;
+ new = (const u8 *)&gentry;
+ }
+
+ switch (bytes) {
+ case 4:
+ gentry = *(const u32 *)new;
+ break;
+ case 8:
+ gentry = *(const u64 *)new;
+ break;
+ default:
+ gentry = 0;
+ break;
+ }
+
+ mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
+ spin_lock(&vcpu->kvm->mmu_lock);
+ if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
+ gentry = 0;
+ kvm_mmu_access_page(vcpu, gfn);
+ kvm_mmu_free_some_pages(vcpu);
+ ++vcpu->kvm->stat.mmu_pte_write;
+ trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
+ if (guest_initiated) {
+ if (gfn == vcpu->arch.last_pt_write_gfn
+ && !last_updated_pte_accessed(vcpu)) {
+ ++vcpu->arch.last_pt_write_count;
+ if (vcpu->arch.last_pt_write_count >= 3)
+ flooded = 1;
+ } else {
+ vcpu->arch.last_pt_write_gfn = gfn;
+ vcpu->arch.last_pt_write_count = 1;
+ vcpu->arch.last_pte_updated = NULL;
+ }
+ }
+
+ mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
+ pte_size = sp->role.cr4_pae ? 8 : 4;
+ misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
+ misaligned |= bytes < 4;
+ if (misaligned || flooded) {
+ /*
+ * Misaligned accesses are too much trouble to fix
+ * up; also, they usually indicate a page is not used
+ * as a page table.
+ *
+ * If we're seeing too many writes to a page,
+ * it may no longer be a page table, or we may be
+ * forking, in which case it is better to unmap the
+ * page.
+ */
+ pgprintk("misaligned: gpa %llx bytes %d role %x\n",
+ gpa, bytes, sp->role.word);
+ zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+ &invalid_list);
+ ++vcpu->kvm->stat.mmu_flooded;
+ continue;
+ }
+ page_offset = offset;
+ level = sp->role.level;
+ npte = 1;
+ if (!sp->role.cr4_pae) {
+ page_offset <<= 1; /* 32->64 */
+ /*
+ * A 32-bit pde maps 4MB while the shadow pdes map
+ * only 2MB. So we need to double the offset again
+ * and zap two pdes instead of one.
+ */
+ if (level == PT32_ROOT_LEVEL) {
+ page_offset &= ~7; /* kill rounding error */
+ page_offset <<= 1;
+ npte = 2;
+ }
+ quadrant = page_offset >> PAGE_SHIFT;
+ page_offset &= ~PAGE_MASK;
+ if (quadrant != sp->role.quadrant)
+ continue;
+ }
+ local_flush = true;
+ spte = &sp->spt[page_offset / sizeof(*spte)];
+ while (npte--) {
+ entry = *spte;
+ mmu_pte_write_zap_pte(vcpu, sp, spte);
+ if (gentry &&
+ !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
+ & mask.word))
+ mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
+ if (!remote_flush && need_remote_flush(entry, *spte))
+ remote_flush = true;
+ ++spte;
+ }
+ }
+ mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
+ kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
+ vcpu->arch.update_pte.pfn = bad_pfn;
+ }
+}
+
+int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ gpa_t gpa;
+ int r;
+
+ if (vcpu->arch.mmu.direct_map)
+ return 0;
+
+ gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
+
+void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+{
+ LIST_HEAD(invalid_list);
+
+ while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
+ !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
+ struct kvm_mmu_page *sp;
+
+ sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
+ struct kvm_mmu_page, link);
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ ++vcpu->kvm->stat.mmu_recycled;
+ }
+}
+
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
+ void *insn, int insn_len)
+{
+ int r;
+ enum emulation_result er;
+
+ r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
+ if (r < 0)
+ goto out;
+
+ if (!r) {
+ r = 1;
+ goto out;
+ }
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ goto out;
+
+ er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);
+
+ switch (er) {
+ case EMULATE_DONE:
+ return 1;
+ case EMULATE_DO_MMIO:
+ ++vcpu->stat.mmio_exits;
+ /* fall through */
+ case EMULATE_FAIL:
+ return 0;
+ default:
+ BUG();
+ }
+out:
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
+
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ vcpu->arch.mmu.invlpg(vcpu, gva);
+ kvm_mmu_flush_tlb(vcpu);
+ ++vcpu->stat.invlpg;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
+
+void kvm_enable_tdp(void)
+{
+ tdp_enabled = true;
+}
+EXPORT_SYMBOL_GPL(kvm_enable_tdp);
+
+void kvm_disable_tdp(void)
+{
+ tdp_enabled = false;
+}
+EXPORT_SYMBOL_GPL(kvm_disable_tdp);
+
+static void free_mmu_pages(struct kvm_vcpu *vcpu)
+{
+ free_page((unsigned long)vcpu->arch.mmu.pae_root);
+ if (vcpu->arch.mmu.lm_root != NULL)
+ free_page((unsigned long)vcpu->arch.mmu.lm_root);
+}
+
+static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
+{
+ struct page *page;
+ int i;
+
+ ASSERT(vcpu);
+
+ /*
+ * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
+ * Therefore we need to allocate shadow page tables in the first
+ * 4GB of memory, which happens to fit the DMA32 zone.
+ */
+ page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+ if (!page)
+ return -ENOMEM;
+
+ vcpu->arch.mmu.pae_root = page_address(page);
+ for (i = 0; i < 4; ++i)
+ vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+
+ return 0;
+}
+
+int kvm_mmu_create(struct kvm_vcpu *vcpu)
+{
+ ASSERT(vcpu);
+ ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ return alloc_mmu_pages(vcpu);
+}
+
+int kvm_mmu_setup(struct kvm_vcpu *vcpu)
+{
+ ASSERT(vcpu);
+ ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ return init_kvm_mmu(vcpu);
+}
+
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
+{
+ struct kvm_mmu_page *sp;
+
+ list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
+ int i;
+ u64 *pt;
+
+ if (!test_bit(slot, sp->slot_bitmap))
+ continue;
+
+ if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+ continue;
+
+ pt = sp->spt;
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+ /* avoid RMW */
+ if (is_writable_pte(pt[i]))
+ update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
+ }
+ kvm_flush_remote_tlbs(kvm);
+}
+
+void kvm_mmu_zap_all(struct kvm *kvm)
+{
+ struct kvm_mmu_page *sp, *node;
+ LIST_HEAD(invalid_list);
+
+ spin_lock(&kvm->mmu_lock);
+restart:
+ list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
+ if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
+ goto restart;
+
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ spin_unlock(&kvm->mmu_lock);
+}
+
+static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
+ struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *page;
+
+ page = container_of(kvm->arch.active_mmu_pages.prev,
+ struct kvm_mmu_page, link);
+ return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35)
+static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
+#else
+static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
+#endif
+{
+ struct kvm *kvm;
+ struct kvm *kvm_freed = NULL;
+
+ if (nr_to_scan == 0)
+ goto out;
+
+ spin_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ int idx, freed_pages;
+ LIST_HEAD(invalid_list);
+
+ idx = srcu_read_lock(&kvm->srcu);
+ spin_lock(&kvm->mmu_lock);
+ if (!kvm_freed && nr_to_scan > 0 &&
+ kvm->arch.n_used_mmu_pages > 0) {
+ freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
+ &invalid_list);
+ kvm_freed = kvm;
+ }
+ nr_to_scan--;
+
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
+ }
+ if (kvm_freed)
+ list_move_tail(&kvm_freed->vm_list, &vm_list);
+
+ spin_unlock(&kvm_lock);
+
+out:
+ return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
+}
+
+static struct shrinker mmu_shrinker = {
+ .shrink = mmu_shrink,
+ .seeks = DEFAULT_SEEKS * 10,
+};
+
+static void mmu_destroy_caches(void)
+{
+ if (pte_chain_cache)
+ kmem_cache_destroy(pte_chain_cache);
+ if (rmap_desc_cache)
+ kmem_cache_destroy(rmap_desc_cache);
+ if (mmu_page_header_cache)
+ kmem_cache_destroy(mmu_page_header_cache);
+}
+
+int kvm_mmu_module_init(void)
+{
+ pte_chain_cache = kmem_cache_create("kvm_pte_chain",
+ sizeof(struct kvm_pte_chain),
+ 0, 0, NULL);
+ if (!pte_chain_cache)
+ goto nomem;
+ rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
+ sizeof(struct kvm_rmap_desc),
+ 0, 0, NULL);
+ if (!rmap_desc_cache)
+ goto nomem;
+
+ mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
+ sizeof(struct kvm_mmu_page),
+ 0, 0, NULL);
+ if (!mmu_page_header_cache)
+ goto nomem;
+
+ if (percpu_counter_init(&kvm_total_used_mmu_pages, 0))
+ goto nomem;
+
+ register_shrinker(&mmu_shrinker);
+
+ return 0;
+
+nomem:
+ mmu_destroy_caches();
+ return -ENOMEM;
+}
+
+/*
+ * Caculate mmu pages needed for kvm.
+ */
+unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
+{
+ int i;
+ unsigned int nr_mmu_pages;
+ unsigned int nr_pages = 0;
+ struct kvm_memslots *slots;
+
+ slots = kvm_memslots(kvm);
+
+ for (i = 0; i < slots->nmemslots; i++)
+ nr_pages += slots->memslots[i].npages;
+
+ nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
+ nr_mmu_pages = max(nr_mmu_pages,
+ (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
+
+ return nr_mmu_pages;
+}
+
+static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
+ unsigned len)
+{
+ if (len > buffer->len)
+ return NULL;
+ return buffer->ptr;
+}
+
+static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
+ unsigned len)
+{
+ void *ret;
+
+ ret = pv_mmu_peek_buffer(buffer, len);
+ if (!ret)
+ return ret;
+ buffer->ptr += len;
+ buffer->len -= len;
+ buffer->processed += len;
+ return ret;
+}
+
+static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
+ gpa_t addr, gpa_t value)
+{
+ int bytes = 8;
+ int r;
+
+ if (!is_long_mode(vcpu) && !is_pae(vcpu))
+ bytes = 4;
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
+
+ if (!emulator_write_phys(vcpu, addr, &value, bytes))
+ return -EFAULT;
+
+ return 1;
+}
+
+static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
+ return 1;
+}
+
+static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
+{
+ spin_lock(&vcpu->kvm->mmu_lock);
+ mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return 1;
+}
+
+static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
+ struct kvm_pv_mmu_op_buffer *buffer)
+{
+ struct kvm_mmu_op_header *header;
+
+ header = pv_mmu_peek_buffer(buffer, sizeof *header);
+ if (!header)
+ return 0;
+ switch (header->op) {
+ case KVM_MMU_OP_WRITE_PTE: {
+ struct kvm_mmu_op_write_pte *wpte;
+
+ wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
+ if (!wpte)
+ return 0;
+ return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
+ wpte->pte_val);
+ }
+ case KVM_MMU_OP_FLUSH_TLB: {
+ struct kvm_mmu_op_flush_tlb *ftlb;
+
+ ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
+ if (!ftlb)
+ return 0;
+ return kvm_pv_mmu_flush_tlb(vcpu);
+ }
+ case KVM_MMU_OP_RELEASE_PT: {
+ struct kvm_mmu_op_release_pt *rpt;
+
+ rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
+ if (!rpt)
+ return 0;
+ return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
+ }
+ default: return 0;
+ }
+}
+
+int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
+ gpa_t addr, unsigned long *ret)
+{
+ int r;
+ struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
+
+ buffer->ptr = buffer->buf;
+ buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
+ buffer->processed = 0;
+
+ r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
+ if (r)
+ goto out;
+
+ while (buffer->len) {
+ r = kvm_pv_mmu_op_one(vcpu, buffer);
+ if (r < 0)
+ goto out;
+ if (r == 0)
+ break;
+ }
+
+ r = 1;
+out:
+ *ret = buffer->processed;
+ return r;
+}
+
+int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
+{
+ struct kvm_shadow_walk_iterator iterator;
+ int nr_sptes = 0;
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ for_each_shadow_entry(vcpu, addr, iterator) {
+ sptes[iterator.level-1] = *iterator.sptep;
+ nr_sptes++;
+ if (!is_shadow_present_pte(*iterator.sptep))
+ break;
+ }
+ spin_unlock(&vcpu->kvm->mmu_lock);
+
+ return nr_sptes;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
+
+void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+ ASSERT(vcpu);
+
+ destroy_kvm_mmu(vcpu);
+ free_mmu_pages(vcpu);
+ mmu_free_memory_caches(vcpu);
+}
+
+#ifdef CONFIG_KVM_MMU_AUDIT
+#include "mmu_audit.c"
+#else
+static void mmu_audit_disable(void) { }
+#endif
+
+void kvm_mmu_module_exit(void)
+{
+ mmu_destroy_caches();
+ percpu_counter_destroy(&kvm_total_used_mmu_pages);
+ unregister_shrinker(&mmu_shrinker);
+ mmu_audit_disable();
+}
diff --git a/linux/x86/mmu.h b/linux/x86/mmu.h
new file mode 100644
index 0000000..7086ca8
--- /dev/null
+++ b/linux/x86/mmu.h
@@ -0,0 +1,79 @@
+#ifndef __KVM_X86_MMU_H
+#define __KVM_X86_MMU_H
+
+#include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
+
+#define PT64_PT_BITS 9
+#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
+#define PT32_PT_BITS 10
+#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
+
+#define PT_WRITABLE_SHIFT 1
+
+#define PT_PRESENT_MASK (1ULL << 0)
+#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
+#define PT_USER_MASK (1ULL << 2)
+#define PT_PWT_MASK (1ULL << 3)
+#define PT_PCD_MASK (1ULL << 4)
+#define PT_ACCESSED_SHIFT 5
+#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
+#define PT_DIRTY_MASK (1ULL << 6)
+#define PT_PAGE_SIZE_MASK (1ULL << 7)
+#define PT_PAT_MASK (1ULL << 7)
+#define PT_GLOBAL_MASK (1ULL << 8)
+#define PT64_NX_SHIFT 63
+#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
+
+#define PT_PAT_SHIFT 7
+#define PT_DIR_PAT_SHIFT 12
+#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
+
+#define PT32_DIR_PSE36_SIZE 4
+#define PT32_DIR_PSE36_SHIFT 13
+#define PT32_DIR_PSE36_MASK \
+ (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
+
+#define PT64_ROOT_LEVEL 4
+#define PT32_ROOT_LEVEL 2
+#define PT32E_ROOT_LEVEL 3
+
+#define PT_PDPE_LEVEL 3
+#define PT_DIRECTORY_LEVEL 2
+#define PT_PAGE_TABLE_LEVEL 1
+
+#define PFERR_PRESENT_MASK (1U << 0)
+#define PFERR_WRITE_MASK (1U << 1)
+#define PFERR_USER_MASK (1U << 2)
+#define PFERR_RSVD_MASK (1U << 3)
+#define PFERR_FETCH_MASK (1U << 4)
+
+int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
+int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+
+static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
+{
+ return kvm->arch.n_max_mmu_pages -
+ kvm->arch.n_used_mmu_pages;
+}
+
+static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES))
+ __kvm_mmu_free_some_pages(vcpu);
+}
+
+static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
+{
+ if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
+ return 0;
+
+ return kvm_mmu_load(vcpu);
+}
+
+static inline int is_present_gpte(unsigned long pte)
+{
+ return pte & PT_PRESENT_MASK;
+}
+
+#endif
diff --git a/linux/x86/mmu_audit.c b/linux/x86/mmu_audit.c
new file mode 100644
index 0000000..3b2d201
--- /dev/null
+++ b/linux/x86/mmu_audit.c
@@ -0,0 +1,344 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+/*
+ * mmu_audit.c:
+ *
+ * Audit code for KVM MMU
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ * Marcelo Tosatti <mtosatti@redhat.com>
+ * Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/ratelimit.h>
+
+#define audit_printk(kvm, fmt, args...) \
+ printk(KERN_ERR "audit: (%s) error: " \
+ fmt, audit_point_name[kvm->arch.audit_point], ##args)
+
+typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
+
+static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ inspect_spte_fn fn, int level)
+{
+ int i;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ u64 *ent = sp->spt;
+
+ fn(vcpu, ent + i, level);
+
+ if (is_shadow_present_pte(ent[i]) &&
+ !is_last_spte(ent[i], level)) {
+ struct kvm_mmu_page *child;
+
+ child = page_header(ent[i] & PT64_BASE_ADDR_MASK);
+ __mmu_spte_walk(vcpu, child, fn, level - 1);
+ }
+ }
+}
+
+static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+
+ if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+
+ sp = page_header(root);
+ __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL);
+ return;
+ }
+
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ if (root && VALID_PAGE(root)) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ __mmu_spte_walk(vcpu, sp, fn, 2);
+ }
+ }
+
+ return;
+}
+
+typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp);
+
+static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
+{
+ struct kvm_mmu_page *sp;
+
+ list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
+ fn(kvm, sp);
+}
+
+static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+ struct kvm_mmu_page *sp;
+ gfn_t gfn;
+ pfn_t pfn;
+ hpa_t hpa;
+
+ sp = page_header(__pa(sptep));
+
+ if (sp->unsync) {
+ if (level != PT_PAGE_TABLE_LEVEL) {
+ audit_printk(vcpu->kvm, "unsync sp: %p "
+ "level = %d\n", sp, level);
+ return;
+ }
+
+ if (*sptep == shadow_notrap_nonpresent_pte) {
+ audit_printk(vcpu->kvm, "notrap spte in unsync "
+ "sp: %p\n", sp);
+ return;
+ }
+ }
+
+ if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
+ audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n",
+ sp);
+ return;
+ }
+
+ if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
+ return;
+
+ gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+ pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
+
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ return;
+ }
+
+ hpa = pfn << PAGE_SHIFT;
+ if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
+ audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
+ "ent %llxn", vcpu->arch.mmu.root_level, pfn,
+ hpa, *sptep);
+}
+
+static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
+{
+ unsigned long *rmapp;
+ struct kvm_mmu_page *rev_sp;
+ gfn_t gfn;
+
+
+ rev_sp = page_header(__pa(sptep));
+ gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
+
+ if (!gfn_to_memslot(kvm, gfn)) {
+ if (!printk_ratelimit())
+ return;
+ audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
+ audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
+ (long int)(sptep - rev_sp->spt), rev_sp->gfn);
+ dump_stack();
+ return;
+ }
+
+ rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
+ if (!*rmapp) {
+ if (!printk_ratelimit())
+ return;
+ audit_printk(kvm, "no rmap for writable spte %llx\n",
+ *sptep);
+ dump_stack();
+ }
+}
+
+static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+ if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level))
+ inspect_spte_has_rmap(vcpu->kvm, sptep);
+}
+
+static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+ if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync)
+ audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync "
+ "root.\n", sp);
+}
+
+static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ int i;
+
+ if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+ return;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ if (!is_rmap_spte(sp->spt[i]))
+ continue;
+
+ inspect_spte_has_rmap(kvm, sp->spt + i);
+ }
+}
+
+static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ struct kvm_memory_slot *slot;
+ unsigned long *rmapp;
+ u64 *spte;
+
+ if (sp->role.direct || sp->unsync || sp->role.invalid)
+ return;
+
+ slot = gfn_to_memslot(kvm, sp->gfn);
+ rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
+
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ if (is_writable_pte(*spte))
+ audit_printk(kvm, "shadow page has writable "
+ "mappings: gfn %llx role %x\n",
+ sp->gfn, sp->role.word);
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+}
+
+static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ check_mappings_rmap(kvm, sp);
+ audit_write_protection(kvm, sp);
+}
+
+static void audit_all_active_sps(struct kvm *kvm)
+{
+ walk_all_active_sps(kvm, audit_sp);
+}
+
+static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+ audit_sptes_have_rmaps(vcpu, sptep, level);
+ audit_mappings(vcpu, sptep, level);
+ audit_spte_after_sync(vcpu, sptep, level);
+}
+
+static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
+{
+ mmu_spte_walk(vcpu, audit_spte);
+}
+
+static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
+{
+ static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
+
+ if (!__ratelimit(&ratelimit_state))
+ return;
+
+ vcpu->kvm->arch.audit_point = point;
+ audit_all_active_sps(vcpu->kvm);
+ audit_vcpu_spte(vcpu);
+}
+
+static bool mmu_audit;
+
+static void mmu_audit_enable(void)
+{
+ int ret;
+
+ if (mmu_audit)
+ return;
+
+ ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+ WARN_ON(ret);
+
+ mmu_audit = true;
+}
+
+static void mmu_audit_disable(void)
+{
+ if (!mmu_audit)
+ return;
+
+ unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+ tracepoint_synchronize_unregister();
+ mmu_audit = false;
+}
+
+static int mmu_audit_set(const char *val, const struct kernel_param *kp)
+{
+ int ret;
+ unsigned long enable;
+
+ ret = strict_strtoul(val, 10, &enable);
+ if (ret < 0)
+ return -EINVAL;
+
+ switch (enable) {
+ case 0:
+ mmu_audit_disable();
+ break;
+ case 1:
+ mmu_audit_enable();
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct kernel_param_ops audit_param_ops = {
+ .set = mmu_audit_set,
+ .get = param_get_bool,
+};
+
+module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
diff --git a/linux/x86/mmutrace.h b/linux/x86/mmutrace.h
new file mode 100644
index 0000000..b60b4fd
--- /dev/null
+++ b/linux/x86/mmutrace.h
@@ -0,0 +1,225 @@
+#if !defined(_TRACE_KVMMMU_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVMMMU_H
+
+#include <linux/tracepoint.h>
+#include <linux/ftrace_event.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvmmmu
+
+#define KVM_MMU_PAGE_FIELDS \
+ __field(__u64, gfn) \
+ __field(__u32, role) \
+ __field(__u32, root_count) \
+ __field(bool, unsync)
+
+#define KVM_MMU_PAGE_ASSIGN(sp) \
+ __entry->gfn = sp->gfn; \
+ __entry->role = sp->role.word; \
+ __entry->root_count = sp->root_count; \
+ __entry->unsync = sp->unsync;
+
+#define KVM_MMU_PAGE_PRINTK() ({ \
+ const char *ret = p->buffer + p->len; \
+ static const char *access_str[] = { \
+ "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \
+ }; \
+ union kvm_mmu_page_role role; \
+ \
+ role.word = __entry->role; \
+ \
+ trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" \
+ " %snxe root %u %s%c", \
+ __entry->gfn, role.level, \
+ role.cr4_pae ? " pae" : "", \
+ role.quadrant, \
+ role.direct ? " direct" : "", \
+ access_str[role.access], \
+ role.invalid ? " invalid" : "", \
+ role.nxe ? "" : "!", \
+ __entry->root_count, \
+ __entry->unsync ? "unsync" : "sync", 0); \
+ ret; \
+ })
+
+#define kvm_mmu_trace_pferr_flags \
+ { PFERR_PRESENT_MASK, "P" }, \
+ { PFERR_WRITE_MASK, "W" }, \
+ { PFERR_USER_MASK, "U" }, \
+ { PFERR_RSVD_MASK, "RSVD" }, \
+ { PFERR_FETCH_MASK, "F" }
+
+/*
+ * A pagetable walk has started
+ */
+TRACE_EVENT(
+ kvm_mmu_pagetable_walk,
+ TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault),
+ TP_ARGS(addr, write_fault, user_fault, fetch_fault),
+
+ TP_STRUCT__entry(
+ __field(__u64, addr)
+ __field(__u32, pferr)
+ ),
+
+ TP_fast_assign(
+ __entry->addr = addr;
+ __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2)
+ | (!!fetch_fault << 4);
+ ),
+
+ TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
+ __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
+);
+
+
+/* We just walked a paging element */
+TRACE_EVENT(
+ kvm_mmu_paging_element,
+ TP_PROTO(u64 pte, int level),
+ TP_ARGS(pte, level),
+
+ TP_STRUCT__entry(
+ __field(__u64, pte)
+ __field(__u32, level)
+ ),
+
+ TP_fast_assign(
+ __entry->pte = pte;
+ __entry->level = level;
+ ),
+
+ TP_printk("pte %llx level %u", __entry->pte, __entry->level)
+);
+
+DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class,
+
+ TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
+ TP_ARGS(table_gfn, index, size),
+
+ TP_STRUCT__entry(
+ __field(__u64, gpa)
+ ),
+
+ TP_fast_assign(
+ __entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
+ + index * size;
+ ),
+
+ TP_printk("gpa %llx", __entry->gpa)
+);
+
+/* We set a pte accessed bit */
+DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_accessed_bit,
+
+ TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
+ TP_ARGS(table_gfn, index, size)
+);
+
+/* We set a pte dirty bit */
+DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_dirty_bit,
+
+ TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
+ TP_ARGS(table_gfn, index, size)
+);
+
+TRACE_EVENT(
+ kvm_mmu_walker_error,
+ TP_PROTO(u32 pferr),
+ TP_ARGS(pferr),
+
+ TP_STRUCT__entry(
+ __field(__u32, pferr)
+ ),
+
+ TP_fast_assign(
+ __entry->pferr = pferr;
+ ),
+
+ TP_printk("pferr %x %s", __entry->pferr,
+ __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
+);
+
+TRACE_EVENT(
+ kvm_mmu_get_page,
+ TP_PROTO(struct kvm_mmu_page *sp, bool created),
+ TP_ARGS(sp, created),
+
+ TP_STRUCT__entry(
+ KVM_MMU_PAGE_FIELDS
+ __field(bool, created)
+ ),
+
+ TP_fast_assign(
+ KVM_MMU_PAGE_ASSIGN(sp)
+ __entry->created = created;
+ ),
+
+ TP_printk("%s %s", KVM_MMU_PAGE_PRINTK(),
+ __entry->created ? "new" : "existing")
+);
+
+DECLARE_EVENT_CLASS(kvm_mmu_page_class,
+
+ TP_PROTO(struct kvm_mmu_page *sp),
+ TP_ARGS(sp),
+
+ TP_STRUCT__entry(
+ KVM_MMU_PAGE_FIELDS
+ ),
+
+ TP_fast_assign(
+ KVM_MMU_PAGE_ASSIGN(sp)
+ ),
+
+ TP_printk("%s", KVM_MMU_PAGE_PRINTK())
+);
+
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_sync_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+
+ TP_ARGS(sp)
+);
+
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+
+ TP_ARGS(sp)
+);
+
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+
+ TP_ARGS(sp)
+);
+
+TRACE_EVENT(
+ kvm_mmu_audit,
+ TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
+ TP_ARGS(vcpu, audit_point),
+
+ TP_STRUCT__entry(
+ __field(struct kvm_vcpu *, vcpu)
+ __field(int, audit_point)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu = vcpu;
+ __entry->audit_point = audit_point;
+ ),
+
+ TP_printk("vcpu:%d %s", __entry->vcpu->cpu,
+ audit_point_name[__entry->audit_point])
+);
+#endif /* _TRACE_KVMMMU_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE mmutrace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/linux/x86/paging_tmpl.h b/linux/x86/paging_tmpl.h
new file mode 100644
index 0000000..6bccc24
--- /dev/null
+++ b/linux/x86/paging_tmpl.h
@@ -0,0 +1,839 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * MMU support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+/*
+ * We need the mmu code to access both 32-bit and 64-bit guest ptes,
+ * so the code in this file is compiled twice, once per pte size.
+ */
+
+#if PTTYPE == 64
+ #define pt_element_t u64
+ #define guest_walker guest_walker64
+ #define FNAME(name) paging##64_##name
+ #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+ #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+ #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+ #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+ #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
+ #define PT_LEVEL_BITS PT64_LEVEL_BITS
+ #ifdef CONFIG_X86_64
+ #define PT_MAX_FULL_LEVELS 4
+ #define CMPXCHG cmpxchg
+ #else
+ #define CMPXCHG cmpxchg64
+ #define PT_MAX_FULL_LEVELS 2
+ #endif
+#elif PTTYPE == 32
+ #define pt_element_t u32
+ #define guest_walker guest_walker32
+ #define FNAME(name) paging##32_##name
+ #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
+ #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
+ #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
+ #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
+ #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
+ #define PT_LEVEL_BITS PT32_LEVEL_BITS
+ #define PT_MAX_FULL_LEVELS 2
+ #define CMPXCHG cmpxchg
+#else
+ #error Invalid PTTYPE value
+#endif
+
+#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
+#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)
+
+/*
+ * The guest_walker structure emulates the behavior of the hardware page
+ * table walker.
+ */
+struct guest_walker {
+ int level;
+ gfn_t table_gfn[PT_MAX_FULL_LEVELS];
+ pt_element_t ptes[PT_MAX_FULL_LEVELS];
+ pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
+ gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
+ unsigned pt_access;
+ unsigned pte_access;
+ gfn_t gfn;
+ struct x86_exception fault;
+};
+
+static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
+{
+ return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
+}
+
+static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
+ gfn_t table_gfn, unsigned index,
+ pt_element_t orig_pte, pt_element_t new_pte)
+{
+ pt_element_t ret;
+ pt_element_t *table;
+ struct page *page;
+
+ page = gfn_to_page(kvm, table_gfn);
+
+ table = kmap_atomic(page, KM_USER0);
+ ret = CMPXCHG(&table[index], orig_pte, new_pte);
+ kunmap_atomic(table, KM_USER0);
+
+ kvm_release_page_dirty(page);
+
+ return (ret != orig_pte);
+}
+
+static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
+{
+ unsigned access;
+
+ access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+#if PTTYPE == 64
+ if (vcpu->arch.mmu.nx)
+ access &= ~(gpte >> PT64_NX_SHIFT);
+#endif
+ return access;
+}
+
+/*
+ * Fetch a guest pte for a guest virtual address
+ */
+static int FNAME(walk_addr_generic)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gva_t addr, u32 access)
+{
+ pt_element_t pte;
+ gfn_t table_gfn;
+ unsigned index, pt_access, uninitialized_var(pte_access);
+ gpa_t pte_gpa;
+ bool eperm, present, rsvd_fault;
+ int offset, write_fault, user_fault, fetch_fault;
+
+ write_fault = access & PFERR_WRITE_MASK;
+ user_fault = access & PFERR_USER_MASK;
+ fetch_fault = access & PFERR_FETCH_MASK;
+
+ trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
+ fetch_fault);
+walk:
+ present = true;
+ eperm = rsvd_fault = false;
+ walker->level = mmu->root_level;
+ pte = mmu->get_cr3(vcpu);
+
+#if PTTYPE == 64
+ if (walker->level == PT32E_ROOT_LEVEL) {
+ pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3);
+ trace_kvm_mmu_paging_element(pte, walker->level);
+ if (!is_present_gpte(pte)) {
+ present = false;
+ goto error;
+ }
+ --walker->level;
+ }
+#endif
+ ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
+ (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
+
+ pt_access = ACC_ALL;
+
+ for (;;) {
+ index = PT_INDEX(addr, walker->level);
+
+ table_gfn = gpte_to_gfn(pte);
+ offset = index * sizeof(pt_element_t);
+ pte_gpa = gfn_to_gpa(table_gfn) + offset;
+ walker->table_gfn[walker->level - 1] = table_gfn;
+ walker->pte_gpa[walker->level - 1] = pte_gpa;
+
+ if (kvm_read_guest_page_mmu(vcpu, mmu, table_gfn, &pte,
+ offset, sizeof(pte),
+ PFERR_USER_MASK|PFERR_WRITE_MASK)) {
+ present = false;
+ break;
+ }
+
+ trace_kvm_mmu_paging_element(pte, walker->level);
+
+ if (!is_present_gpte(pte)) {
+ present = false;
+ break;
+ }
+
+ if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) {
+ rsvd_fault = true;
+ break;
+ }
+
+ if (write_fault && !is_writable_pte(pte))
+ if (user_fault || is_write_protection(vcpu))
+ eperm = true;
+
+ if (user_fault && !(pte & PT_USER_MASK))
+ eperm = true;
+
+#if PTTYPE == 64
+ if (fetch_fault && (pte & PT64_NX_MASK))
+ eperm = true;
+#endif
+
+ if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) {
+ trace_kvm_mmu_set_accessed_bit(table_gfn, index,
+ sizeof(pte));
+ if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
+ index, pte, pte|PT_ACCESSED_MASK))
+ goto walk;
+ mark_page_dirty(vcpu->kvm, table_gfn);
+ pte |= PT_ACCESSED_MASK;
+ }
+
+ pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
+
+ walker->ptes[walker->level - 1] = pte;
+
+ if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
+ ((walker->level == PT_DIRECTORY_LEVEL) &&
+ is_large_pte(pte) &&
+ (PTTYPE == 64 || is_pse(vcpu))) ||
+ ((walker->level == PT_PDPE_LEVEL) &&
+ is_large_pte(pte) &&
+ mmu->root_level == PT64_ROOT_LEVEL)) {
+ int lvl = walker->level;
+ gpa_t real_gpa;
+ gfn_t gfn;
+ u32 ac;
+
+ gfn = gpte_to_gfn_lvl(pte, lvl);
+ gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
+
+ if (PTTYPE == 32 &&
+ walker->level == PT_DIRECTORY_LEVEL &&
+ is_cpuid_PSE36())
+ gfn += pse36_gfn_delta(pte);
+
+ ac = write_fault | fetch_fault | user_fault;
+
+ real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn),
+ ac);
+ if (real_gpa == UNMAPPED_GVA)
+ return 0;
+
+ walker->gfn = real_gpa >> PAGE_SHIFT;
+
+ break;
+ }
+
+ pt_access = pte_access;
+ --walker->level;
+ }
+
+ if (!present || eperm || rsvd_fault)
+ goto error;
+
+ if (write_fault && !is_dirty_gpte(pte)) {
+ bool ret;
+
+ trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
+ ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
+ pte|PT_DIRTY_MASK);
+ if (ret)
+ goto walk;
+ mark_page_dirty(vcpu->kvm, table_gfn);
+ pte |= PT_DIRTY_MASK;
+ walker->ptes[walker->level - 1] = pte;
+ }
+
+ walker->pt_access = pt_access;
+ walker->pte_access = pte_access;
+ pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
+ __func__, (u64)pte, pte_access, pt_access);
+ return 1;
+
+error:
+ walker->fault.vector = PF_VECTOR;
+ walker->fault.error_code_valid = true;
+ walker->fault.error_code = 0;
+ if (present)
+ walker->fault.error_code |= PFERR_PRESENT_MASK;
+
+ walker->fault.error_code |= write_fault | user_fault;
+
+ if (fetch_fault && mmu->nx)
+ walker->fault.error_code |= PFERR_FETCH_MASK;
+ if (rsvd_fault)
+ walker->fault.error_code |= PFERR_RSVD_MASK;
+
+ walker->fault.address = addr;
+ walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
+
+ trace_kvm_mmu_walker_error(walker->fault.error_code);
+ return 0;
+}
+
+static int FNAME(walk_addr)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, gva_t addr, u32 access)
+{
+ return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
+ access);
+}
+
+static int FNAME(walk_addr_nested)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, gva_t addr,
+ u32 access)
+{
+ return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
+ addr, access);
+}
+
+static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ pt_element_t gpte)
+{
+ u64 nonpresent = shadow_trap_nonpresent_pte;
+
+ if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+ goto no_present;
+
+ if (!is_present_gpte(gpte)) {
+ if (!sp->unsync)
+ nonpresent = shadow_notrap_nonpresent_pte;
+ goto no_present;
+ }
+
+ if (!(gpte & PT_ACCESSED_MASK))
+ goto no_present;
+
+ return false;
+
+no_present:
+ drop_spte(vcpu->kvm, spte, nonpresent);
+ return true;
+}
+
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ u64 *spte, const void *pte)
+{
+ pt_element_t gpte;
+ unsigned pte_access;
+ pfn_t pfn;
+
+ gpte = *(const pt_element_t *)pte;
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
+ return;
+
+ pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
+ return;
+ pfn = vcpu->arch.update_pte.pfn;
+ if (is_error_pfn(pfn))
+ return;
+ if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
+ return;
+ kvm_get_pfn(pfn);
+ /*
+ * we call mmu_set_spte() with host_writable = true beacuse that
+ * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
+ */
+ mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
+ is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL,
+ gpte_to_gfn(gpte), pfn, true, true);
+}
+
+static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
+ struct guest_walker *gw, int level)
+{
+ pt_element_t curr_pte;
+ gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
+ u64 mask;
+ int r, index;
+
+ if (level == PT_PAGE_TABLE_LEVEL) {
+ mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
+ base_gpa = pte_gpa & ~mask;
+ index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
+
+ r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
+ gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
+ curr_pte = gw->prefetch_ptes[index];
+ } else
+ r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa,
+ &curr_pte, sizeof(curr_pte));
+
+ return r || curr_pte != gw->ptes[level - 1];
+}
+
+static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
+ u64 *sptep)
+{
+ struct kvm_mmu_page *sp;
+ pt_element_t *gptep = gw->prefetch_ptes;
+ u64 *spte;
+ int i;
+
+ sp = page_header(__pa(sptep));
+
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return;
+
+ if (sp->role.direct)
+ return __direct_pte_prefetch(vcpu, sp, sptep);
+
+ i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
+ spte = sp->spt + i;
+
+ for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+ pt_element_t gpte;
+ unsigned pte_access;
+ gfn_t gfn;
+ pfn_t pfn;
+ bool dirty;
+
+ if (spte == sptep)
+ continue;
+
+ if (*spte != shadow_trap_nonpresent_pte)
+ continue;
+
+ gpte = gptep[i];
+
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
+ continue;
+
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ gfn = gpte_to_gfn(gpte);
+ dirty = is_dirty_gpte(gpte);
+ pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
+ (pte_access & ACC_WRITE_MASK) && dirty);
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ break;
+ }
+
+ mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
+ dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
+ pfn, true, true);
+ }
+}
+
+/*
+ * Fetch a shadow pte for a specific level in the paging hierarchy.
+ */
+static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+ struct guest_walker *gw,
+ int user_fault, int write_fault, int hlevel,
+ int *ptwrite, pfn_t pfn, bool map_writable,
+ bool prefault)
+{
+ unsigned access = gw->pt_access;
+ struct kvm_mmu_page *sp = NULL;
+ bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
+ int top_level;
+ unsigned direct_access;
+ struct kvm_shadow_walk_iterator it;
+
+ if (!is_present_gpte(gw->ptes[gw->level - 1]))
+ return NULL;
+
+ direct_access = gw->pt_access & gw->pte_access;
+ if (!dirty)
+ direct_access &= ~ACC_WRITE_MASK;
+
+ top_level = vcpu->arch.mmu.root_level;
+ if (top_level == PT32E_ROOT_LEVEL)
+ top_level = PT32_ROOT_LEVEL;
+ /*
+ * Verify that the top-level gpte is still there. Since the page
+ * is a root page, it is either write protected (and cannot be
+ * changed from now on) or it is invalid (in which case, we don't
+ * really care if it changes underneath us after this point).
+ */
+ if (FNAME(gpte_changed)(vcpu, gw, top_level))
+ goto out_gpte_changed;
+
+ for (shadow_walk_init(&it, vcpu, addr);
+ shadow_walk_okay(&it) && it.level > gw->level;
+ shadow_walk_next(&it)) {
+ gfn_t table_gfn;
+
+ drop_large_spte(vcpu, it.sptep);
+
+ sp = NULL;
+ if (!is_shadow_present_pte(*it.sptep)) {
+ table_gfn = gw->table_gfn[it.level - 2];
+ sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
+ false, access, it.sptep);
+ }
+
+ /*
+ * Verify that the gpte in the page we've just write
+ * protected is still there.
+ */
+ if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
+ goto out_gpte_changed;
+
+ if (sp)
+ link_shadow_page(it.sptep, sp);
+ }
+
+ for (;
+ shadow_walk_okay(&it) && it.level > hlevel;
+ shadow_walk_next(&it)) {
+ gfn_t direct_gfn;
+
+ validate_direct_spte(vcpu, it.sptep, direct_access);
+
+ drop_large_spte(vcpu, it.sptep);
+
+ if (is_shadow_present_pte(*it.sptep))
+ continue;
+
+ direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+
+ sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
+ true, direct_access, it.sptep);
+ link_shadow_page(it.sptep, sp);
+ }
+
+ mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
+ user_fault, write_fault, dirty, ptwrite, it.level,
+ gw->gfn, pfn, prefault, map_writable);
+ FNAME(pte_prefetch)(vcpu, gw, it.sptep);
+
+ return it.sptep;
+
+out_gpte_changed:
+ if (sp)
+ kvm_mmu_put_page(sp, it.sptep);
+ kvm_release_pfn_clean(pfn);
+ return NULL;
+}
+
+/*
+ * Page fault handler. There are several causes for a page fault:
+ * - there is no shadow pte for the guest pte
+ * - write access through a shadow pte marked read only so that we can set
+ * the dirty bit
+ * - write access to a shadow pte marked read only so we can update the page
+ * dirty bitmap, when userspace requests it
+ * - mmio access; in this case we will never install a present shadow pte
+ * - normal guest page fault due to the guest pte marked not present, not
+ * writable, or not executable
+ *
+ * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
+ * a negative value on error.
+ */
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
+ bool prefault)
+{
+ int write_fault = error_code & PFERR_WRITE_MASK;
+ int user_fault = error_code & PFERR_USER_MASK;
+ struct guest_walker walker;
+ u64 *sptep;
+ int write_pt = 0;
+ int r;
+ pfn_t pfn;
+ int level = PT_PAGE_TABLE_LEVEL;
+ int force_pt_level;
+ unsigned long mmu_seq;
+ bool map_writable;
+
+ pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
+
+ /*
+ * Look up the guest pte for the faulting address.
+ */
+ r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);
+
+ /*
+ * The page is not mapped by the guest. Let the guest handle it.
+ */
+ if (!r) {
+ pgprintk("%s: guest page fault\n", __func__);
+ if (!prefault) {
+ inject_page_fault(vcpu, &walker.fault);
+ /* reset fork detector */
+ vcpu->arch.last_pt_write_count = 0;
+ }
+ return 0;
+ }
+
+ if (walker.level >= PT_DIRECTORY_LEVEL)
+ force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
+ else
+ force_pt_level = 1;
+ if (!force_pt_level) {
+ level = min(walker.level, mapping_level(vcpu, walker.gfn));
+ walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ }
+
+ mmu_seq = vcpu->kvm->mmu_notifier_seq;
+ smp_rmb();
+
+ if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
+ &map_writable))
+ return 0;
+
+ /* mmio */
+ if (is_error_pfn(pfn))
+ return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ if (mmu_notifier_retry(vcpu, mmu_seq))
+ goto out_unlock;
+
+ trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
+ kvm_mmu_free_some_pages(vcpu);
+ if (!force_pt_level)
+ transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
+ sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
+ level, &write_pt, pfn, map_writable, prefault);
+ (void)sptep;
+ pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
+ sptep, *sptep, write_pt);
+
+ if (!write_pt)
+ vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
+
+ ++vcpu->stat.pf_fixed;
+ trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+
+ return write_pt;
+
+out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ return 0;
+}
+
+static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ struct kvm_mmu_page *sp;
+ gpa_t pte_gpa = -1;
+ int level;
+ u64 *sptep;
+ int need_flush = 0;
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+
+ for_each_shadow_entry(vcpu, gva, iterator) {
+ level = iterator.level;
+ sptep = iterator.sptep;
+
+ sp = page_header(__pa(sptep));
+ if (is_last_spte(*sptep, level)) {
+ int offset, shift;
+
+ if (!sp->unsync)
+ break;
+
+ shift = PAGE_SHIFT -
+ (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
+ offset = sp->role.quadrant << shift;
+
+ pte_gpa = (sp->gfn << PAGE_SHIFT) + offset;
+ pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
+
+ if (is_shadow_present_pte(*sptep)) {
+ if (is_large_pte(*sptep))
+ --vcpu->kvm->stat.lpages;
+ drop_spte(vcpu->kvm, sptep,
+ shadow_trap_nonpresent_pte);
+ need_flush = 1;
+ } else
+ __set_spte(sptep, shadow_trap_nonpresent_pte);
+ break;
+ }
+
+ if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
+ break;
+ }
+
+ if (need_flush)
+ kvm_flush_remote_tlbs(vcpu->kvm);
+
+ atomic_inc(&vcpu->kvm->arch.invlpg_counter);
+
+ spin_unlock(&vcpu->kvm->mmu_lock);
+
+ if (pte_gpa == -1)
+ return;
+
+ if (mmu_topup_memory_caches(vcpu))
+ return;
+ kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0);
+}
+
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
+ struct x86_exception *exception)
+{
+ struct guest_walker walker;
+ gpa_t gpa = UNMAPPED_GVA;
+ int r;
+
+ r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);
+
+ if (r) {
+ gpa = gfn_to_gpa(walker.gfn);
+ gpa |= vaddr & ~PAGE_MASK;
+ } else if (exception)
+ *exception = walker.fault;
+
+ return gpa;
+}
+
+static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
+ u32 access,
+ struct x86_exception *exception)
+{
+ struct guest_walker walker;
+ gpa_t gpa = UNMAPPED_GVA;
+ int r;
+
+ r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);
+
+ if (r) {
+ gpa = gfn_to_gpa(walker.gfn);
+ gpa |= vaddr & ~PAGE_MASK;
+ } else if (exception)
+ *exception = walker.fault;
+
+ return gpa;
+}
+
+static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ int i, j, offset, r;
+ pt_element_t pt[256 / sizeof(pt_element_t)];
+ gpa_t pte_gpa;
+
+ if (sp->role.direct
+ || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
+ nonpaging_prefetch_page(vcpu, sp);
+ return;
+ }
+
+ pte_gpa = gfn_to_gpa(sp->gfn);
+ if (PTTYPE == 32) {
+ offset = sp->role.quadrant << PT64_LEVEL_BITS;
+ pte_gpa += offset * sizeof(pt_element_t);
+ }
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) {
+ r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
+ pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
+ for (j = 0; j < ARRAY_SIZE(pt); ++j)
+ if (r || is_present_gpte(pt[j]))
+ sp->spt[i+j] = shadow_trap_nonpresent_pte;
+ else
+ sp->spt[i+j] = shadow_notrap_nonpresent_pte;
+ }
+}
+
+/*
+ * Using the cached information from sp->gfns is safe because:
+ * - The spte has a reference to the struct page, so the pfn for a given gfn
+ * can't change unless all sptes pointing to it are nuked first.
+ *
+ * Note:
+ * We should flush all tlbs if spte is dropped even though guest is
+ * responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page
+ * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
+ * used by guest then tlbs are not flushed, so guest is allowed to access the
+ * freed pages.
+ * And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
+ */
+static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ int i, offset, nr_present;
+ bool host_writable;
+ gpa_t first_pte_gpa;
+
+ offset = nr_present = 0;
+
+ /* direct kvm_mmu_page can not be unsync. */
+ BUG_ON(sp->role.direct);
+
+ if (PTTYPE == 32)
+ offset = sp->role.quadrant << PT64_LEVEL_BITS;
+
+ first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
+ unsigned pte_access;
+ pt_element_t gpte;
+ gpa_t pte_gpa;
+ gfn_t gfn;
+
+ if (!is_shadow_present_pte(sp->spt[i]))
+ continue;
+
+ pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
+
+ if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
+ sizeof(pt_element_t)))
+ return -EINVAL;
+
+ gfn = gpte_to_gfn(gpte);
+
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
+ vcpu->kvm->tlbs_dirty++;
+ continue;
+ }
+
+ if (gfn != sp->gfns[i]) {
+ drop_spte(vcpu->kvm, &sp->spt[i],
+ shadow_trap_nonpresent_pte);
+ vcpu->kvm->tlbs_dirty++;
+ continue;
+ }
+
+ nr_present++;
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
+
+ set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
+ is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
+ spte_to_pfn(sp->spt[i]), true, false,
+ host_writable);
+ }
+
+ return !nr_present;
+}
+
+#undef pt_element_t
+#undef guest_walker
+#undef FNAME
+#undef PT_BASE_ADDR_MASK
+#undef PT_INDEX
+#undef PT_LEVEL_MASK
+#undef PT_LVL_ADDR_MASK
+#undef PT_LVL_OFFSET_MASK
+#undef PT_LEVEL_BITS
+#undef PT_MAX_FULL_LEVELS
+#undef gpte_to_gfn
+#undef gpte_to_gfn_lvl
+#undef CMPXCHG
diff --git a/linux/x86/svm.c b/linux/x86/svm.c
new file mode 100644
index 0000000..bd6a099
--- /dev/null
+++ b/linux/x86/svm.c
@@ -0,0 +1,3993 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+#include <linux/kvm_host.h>
+
+#include "irq.h"
+#include "mmu.h"
+#include "kvm_cache_regs.h"
+#include "x86.h"
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/sched.h>
+#include <linux/ftrace_event.h>
+#include <linux/slab.h>
+
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+#include <asm/kvm_para.h>
+
+#include <asm/virtext.h>
+#include "trace.h"
+
+#define __ex(x) __kvm_handle_fault_on_reboot(x)
+
+MODULE_INFO(version, "kvm-kmod-2.6.38-rc7");
+MODULE_AUTHOR("Qumranet");
+MODULE_LICENSE("GPL");
+
+#define IOPM_ALLOC_ORDER 2
+#define MSRPM_ALLOC_ORDER 1
+
+#define SEG_TYPE_LDT 2
+#define SEG_TYPE_BUSY_TSS16 3
+
+#define SVM_FEATURE_NPT (1 << 0)
+#define SVM_FEATURE_LBRV (1 << 1)
+#define SVM_FEATURE_SVML (1 << 2)
+#define SVM_FEATURE_NRIP (1 << 3)
+#define SVM_FEATURE_TSC_RATE (1 << 4)
+#define SVM_FEATURE_VMCB_CLEAN (1 << 5)
+#define SVM_FEATURE_FLUSH_ASID (1 << 6)
+#define SVM_FEATURE_DECODE_ASSIST (1 << 7)
+#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
+
+#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
+#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
+#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */
+
+#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
+
+static bool erratum_383_found __read_mostly;
+
+static const u32 host_save_user_msrs[] = {
+#ifdef CONFIG_X86_64
+ MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
+ MSR_FS_BASE,
+#endif
+ MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+};
+
+#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
+
+struct kvm_vcpu;
+
+struct nested_state {
+ struct vmcb *hsave;
+ u64 hsave_msr;
+ u64 vm_cr_msr;
+ u64 vmcb;
+
+ /* These are the merged vectors */
+ u32 *msrpm;
+
+ /* gpa pointers to the real vectors */
+ u64 vmcb_msrpm;
+ u64 vmcb_iopm;
+
+ /* A VMEXIT is required but not yet emulated */
+ bool exit_required;
+
+ /*
+ * If we vmexit during an instruction emulation we need this to restore
+ * the l1 guest rip after the emulation
+ */
+ unsigned long vmexit_rip;
+ unsigned long vmexit_rsp;
+ unsigned long vmexit_rax;
+
+ /* cache for intercepts of the guest */
+ u32 intercept_cr;
+ u32 intercept_dr;
+ u32 intercept_exceptions;
+ u64 intercept;
+
+ /* Nested Paging related state */
+ u64 nested_cr3;
+};
+
+#define MSRPM_OFFSETS 16
+static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
+
+struct vcpu_svm {
+ struct kvm_vcpu vcpu;
+ struct vmcb *vmcb;
+ unsigned long vmcb_pa;
+ struct svm_cpu_data *svm_data;
+ uint64_t asid_generation;
+ uint64_t sysenter_esp;
+ uint64_t sysenter_eip;
+
+ u64 next_rip;
+
+ u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
+ struct {
+ u16 fs;
+ u16 gs;
+ u16 ldt;
+ u64 gs_base;
+ } host;
+
+ u32 *msrpm;
+
+ struct nested_state nested;
+
+ bool nmi_singlestep;
+
+ unsigned int3_injected;
+ unsigned long int3_rip;
+ u32 apf_reason;
+};
+
+#define MSR_INVALID 0xffffffffU
+
+static struct svm_direct_access_msrs {
+ u32 index; /* Index of the MSR */
+ bool always; /* True if intercept is always on */
+} direct_access_msrs[] = {
+ { .index = MSR_STAR, .always = true },
+ { .index = MSR_IA32_SYSENTER_CS, .always = true },
+#ifdef CONFIG_X86_64
+ { .index = MSR_GS_BASE, .always = true },
+ { .index = MSR_FS_BASE, .always = true },
+ { .index = MSR_KERNEL_GS_BASE, .always = true },
+ { .index = MSR_LSTAR, .always = true },
+ { .index = MSR_CSTAR, .always = true },
+ { .index = MSR_SYSCALL_MASK, .always = true },
+#endif
+ { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
+ { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
+ { .index = MSR_IA32_LASTINTFROMIP, .always = false },
+ { .index = MSR_IA32_LASTINTTOIP, .always = false },
+ { .index = MSR_INVALID, .always = false },
+};
+
+/* enable NPT for AMD64 and X86 with PAE */
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+static bool npt_enabled = true;
+#else
+static bool npt_enabled;
+#endif
+static int npt = 1;
+
+module_param(npt, int, S_IRUGO);
+
+static int nested = 1;
+module_param(nested, int, S_IRUGO);
+
+static void svm_flush_tlb(struct kvm_vcpu *vcpu);
+static void svm_complete_interrupts(struct vcpu_svm *svm);
+
+static int nested_svm_exit_handled(struct vcpu_svm *svm);
+static int nested_svm_intercept(struct vcpu_svm *svm);
+static int nested_svm_vmexit(struct vcpu_svm *svm);
+static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
+ bool has_error_code, u32 error_code);
+
+enum {
+ VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
+ pause filter count */
+ VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */
+ VMCB_ASID, /* ASID */
+ VMCB_INTR, /* int_ctl, int_vector */
+ VMCB_NPT, /* npt_en, nCR3, gPAT */
+ VMCB_CR, /* CR0, CR3, CR4, EFER */
+ VMCB_DR, /* DR6, DR7 */
+ VMCB_DT, /* GDT, IDT */
+ VMCB_SEG, /* CS, DS, SS, ES, CPL */
+ VMCB_CR2, /* CR2 only */
+ VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
+ VMCB_DIRTY_MAX,
+};
+
+/* TPR and CR2 are always written before VMRUN */
+#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
+
+static inline void mark_all_dirty(struct vmcb *vmcb)
+{
+ vmcb->control.clean = 0;
+}
+
+static inline void mark_all_clean(struct vmcb *vmcb)
+{
+ vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
+ & ~VMCB_ALWAYS_DIRTY_MASK;
+}
+
+static inline void mark_dirty(struct vmcb *vmcb, int bit)
+{
+ vmcb->control.clean &= ~(1 << bit);
+}
+
+static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
+{
+ return container_of(vcpu, struct vcpu_svm, vcpu);
+}
+
+static void recalc_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *c, *h;
+ struct nested_state *g;
+
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ if (!is_guest_mode(&svm->vcpu))
+ return;
+
+ c = &svm->vmcb->control;
+ h = &svm->nested.hsave->control;
+ g = &svm->nested;
+
+ c->intercept_cr = h->intercept_cr | g->intercept_cr;
+ c->intercept_dr = h->intercept_dr | g->intercept_dr;
+ c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
+ c->intercept = h->intercept | g->intercept;
+}
+
+static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
+{
+ if (is_guest_mode(&svm->vcpu))
+ return svm->nested.hsave;
+ else
+ return svm->vmcb;
+}
+
+static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_cr |= (1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_cr &= ~(1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ return vmcb->control.intercept_cr & (1U << bit);
+}
+
+static inline void set_dr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_dr |= (1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_dr &= ~(1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_exceptions |= (1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_exceptions &= ~(1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void set_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept |= (1ULL << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept &= ~(1ULL << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void enable_gif(struct vcpu_svm *svm)
+{
+ svm->vcpu.arch.hflags |= HF_GIF_MASK;
+}
+
+static inline void disable_gif(struct vcpu_svm *svm)
+{
+ svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
+}
+
+static inline bool gif_set(struct vcpu_svm *svm)
+{
+ return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
+}
+
+static unsigned long iopm_base;
+
+struct kvm_ldttss_desc {
+ u16 limit0;
+ u16 base0;
+ unsigned base1:8, type:5, dpl:2, p:1;
+ unsigned limit1:4, zero0:3, g:1, base2:8;
+ u32 base3;
+ u32 zero1;
+} __attribute__((packed));
+
+struct svm_cpu_data {
+ int cpu;
+
+ u64 asid_generation;
+ u32 max_asid;
+ u32 next_asid;
+ struct kvm_ldttss_desc *tss_desc;
+
+ struct page *save_area;
+};
+
+static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
+static uint32_t svm_features;
+
+struct svm_init_data {
+ int cpu;
+ int r;
+};
+
+static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
+
+#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
+#define MSRS_RANGE_SIZE 2048
+#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
+
+static u32 svm_msrpm_offset(u32 msr)
+{
+ u32 offset;
+ int i;
+
+ for (i = 0; i < NUM_MSR_MAPS; i++) {
+ if (msr < msrpm_ranges[i] ||
+ msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
+ continue;
+
+ offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
+ offset += (i * MSRS_RANGE_SIZE); /* add range offset */
+
+ /* Now we have the u8 offset - but need the u32 offset */
+ return offset / 4;
+ }
+
+ /* MSR not in any range */
+ return MSR_INVALID;
+}
+
+#define MAX_INST_SIZE 15
+
+static inline void clgi(void)
+{
+ asm volatile (__ex(SVM_CLGI));
+}
+
+static inline void stgi(void)
+{
+ asm volatile (__ex(SVM_STGI));
+}
+
+static inline void invlpga(unsigned long addr, u32 asid)
+{
+ asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
+}
+
+static int get_npt_level(void)
+{
+#ifdef CONFIG_X86_64
+ return PT64_ROOT_LEVEL;
+#else
+ return PT32E_ROOT_LEVEL;
+#endif
+}
+
+static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ vcpu->arch.efer = efer;
+ if (!npt_enabled && !(efer & EFER_LMA))
+ efer &= ~EFER_LME;
+
+ to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
+ mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
+}
+
+static int is_external_interrupt(u32 info)
+{
+ info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
+ return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
+}
+
+static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 ret = 0;
+
+ if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
+ ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
+ return ret & mask;
+}
+
+static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (mask == 0)
+ svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
+ else
+ svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
+
+}
+
+static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (svm->vmcb->control.next_rip != 0)
+ svm->next_rip = svm->vmcb->control.next_rip;
+
+ if (!svm->next_rip) {
+ if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
+ EMULATE_DONE)
+ printk(KERN_DEBUG "%s: NOP\n", __func__);
+ return;
+ }
+ if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
+ printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
+ __func__, kvm_rip_read(vcpu), svm->next_rip);
+
+ kvm_rip_write(vcpu, svm->next_rip);
+ svm_set_interrupt_shadow(vcpu, 0);
+}
+
+static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
+ bool has_error_code, u32 error_code,
+ bool reinject)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * If we are within a nested VM we'd better #VMEXIT and let the guest
+ * handle the exception
+ */
+ if (!reinject &&
+ nested_svm_check_exception(svm, nr, has_error_code, error_code))
+ return;
+
+ if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
+ unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
+
+ /*
+ * For guest debugging where we have to reinject #BP if some
+ * INT3 is guest-owned:
+ * Emulate nRIP by moving RIP forward. Will fail if injection
+ * raises a fault that is not intercepted. Still better than
+ * failing in all cases.
+ */
+ skip_emulated_instruction(&svm->vcpu);
+ rip = kvm_rip_read(&svm->vcpu);
+ svm->int3_rip = rip + svm->vmcb->save.cs.base;
+ svm->int3_injected = rip - old_rip;
+ }
+
+ svm->vmcb->control.event_inj = nr
+ | SVM_EVTINJ_VALID
+ | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
+ | SVM_EVTINJ_TYPE_EXEPT;
+ svm->vmcb->control.event_inj_err = error_code;
+}
+
+static void svm_init_erratum_383(void)
+{
+ u32 low, high;
+ int err;
+ u64 val;
+
+ if (!kvm_cpu_has_amd_erratum(kvm_amd_erratum_383))
+ return;
+
+ /* Use _safe variants to not break nested virtualization */
+ val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
+ if (err)
+ return;
+
+ val |= (1ULL << 47);
+
+ low = lower_32_bits(val);
+ high = upper_32_bits(val);
+
+ kvm_native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
+
+ erratum_383_found = true;
+}
+
+static int has_svm(void)
+{
+ const char *msg;
+
+ if (!cpu_has_svm(&msg)) {
+ printk(KERN_INFO "has_svm: %s\n", msg);
+ return 0;
+ }
+
+ return 1;
+}
+
+static void svm_hardware_disable(void *garbage)
+{
+ cpu_svm_disable();
+}
+
+static int svm_hardware_enable(void *garbage)
+{
+
+ struct svm_cpu_data *sd;
+ uint64_t efer;
+ struct kvm_desc_ptr gdt_descr;
+ struct kvm_desc_struct *gdt;
+ int me = raw_smp_processor_id();
+
+ rdmsrl(MSR_EFER, efer);
+ if (efer & EFER_SVME)
+ return -EBUSY;
+
+ if (!has_svm()) {
+ printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n",
+ me);
+ return -EINVAL;
+ }
+ sd = per_cpu(svm_data, me);
+
+ if (!sd) {
+ printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n",
+ me);
+ return -EINVAL;
+ }
+
+ sd->asid_generation = 1;
+ sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
+ sd->next_asid = sd->max_asid + 1;
+
+ kvm_native_store_gdt(&gdt_descr);
+ gdt = (struct kvm_desc_struct *)gdt_descr.address;
+ sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
+
+ wrmsrl(MSR_EFER, efer | EFER_SVME);
+
+ wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
+
+ svm_init_erratum_383();
+
+ return 0;
+}
+
+static void svm_cpu_uninit(int cpu)
+{
+ struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
+
+ if (!sd)
+ return;
+
+ per_cpu(svm_data, raw_smp_processor_id()) = NULL;
+ __free_page(sd->save_area);
+ kfree(sd);
+}
+
+static int svm_cpu_init(int cpu)
+{
+ struct svm_cpu_data *sd;
+ int r;
+
+ sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
+ if (!sd)
+ return -ENOMEM;
+ sd->cpu = cpu;
+ sd->save_area = alloc_page(GFP_KERNEL);
+ r = -ENOMEM;
+ if (!sd->save_area)
+ goto err_1;
+
+ per_cpu(svm_data, cpu) = sd;
+
+ return 0;
+
+err_1:
+ kfree(sd);
+ return r;
+
+}
+
+static bool valid_msr_intercept(u32 index)
+{
+ int i;
+
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
+ if (direct_access_msrs[i].index == index)
+ return true;
+
+ return false;
+}
+
+static void set_msr_interception(u32 *msrpm, unsigned msr,
+ int read, int write)
+{
+ u8 bit_read, bit_write;
+ unsigned long tmp;
+ u32 offset;
+
+ /*
+ * If this warning triggers extend the direct_access_msrs list at the
+ * beginning of the file
+ */
+ WARN_ON(!valid_msr_intercept(msr));
+
+ offset = svm_msrpm_offset(msr);
+ bit_read = 2 * (msr & 0x0f);
+ bit_write = 2 * (msr & 0x0f) + 1;
+ tmp = msrpm[offset];
+
+ BUG_ON(offset == MSR_INVALID);
+
+ read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
+ write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
+
+ msrpm[offset] = tmp;
+}
+
+static void svm_vcpu_init_msrpm(u32 *msrpm)
+{
+ int i;
+
+ memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+ if (!direct_access_msrs[i].always)
+ continue;
+
+ set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
+ }
+}
+
+static void add_msr_offset(u32 offset)
+{
+ int i;
+
+ for (i = 0; i < MSRPM_OFFSETS; ++i) {
+
+ /* Offset already in list? */
+ if (msrpm_offsets[i] == offset)
+ return;
+
+ /* Slot used by another offset? */
+ if (msrpm_offsets[i] != MSR_INVALID)
+ continue;
+
+ /* Add offset to list */
+ msrpm_offsets[i] = offset;
+
+ return;
+ }
+
+ /*
+ * If this BUG triggers the msrpm_offsets table has an overflow. Just
+ * increase MSRPM_OFFSETS in this case.
+ */
+ BUG();
+}
+
+static void init_msrpm_offsets(void)
+{
+ int i;
+
+ memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
+
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+ u32 offset;
+
+ offset = svm_msrpm_offset(direct_access_msrs[i].index);
+ BUG_ON(offset == MSR_INVALID);
+
+ add_msr_offset(offset);
+ }
+}
+
+static void svm_enable_lbrv(struct vcpu_svm *svm)
+{
+ u32 *msrpm = svm->msrpm;
+
+ svm->vmcb->control.lbr_ctl = 1;
+ set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
+ set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
+ set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
+ set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+}
+
+static void svm_disable_lbrv(struct vcpu_svm *svm)
+{
+ u32 *msrpm = svm->msrpm;
+
+ svm->vmcb->control.lbr_ctl = 0;
+ set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
+ set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
+ set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
+ set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
+}
+
+static __init int svm_hardware_setup(void)
+{
+ int cpu;
+ struct page *iopm_pages;
+ void *iopm_va;
+ int r;
+
+ iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
+
+ if (!iopm_pages)
+ return -ENOMEM;
+
+ iopm_va = page_address(iopm_pages);
+ memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
+ iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
+
+ init_msrpm_offsets();
+
+ if (boot_cpu_has(X86_FEATURE_NX))
+ kvm_enable_efer_bits(EFER_NX);
+
+ if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
+ kvm_enable_efer_bits(EFER_FFXSR);
+
+ if (nested) {
+ printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
+ kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
+ }
+
+ for_each_possible_cpu(cpu) {
+ r = svm_cpu_init(cpu);
+ if (r)
+ goto err;
+ }
+
+ svm_features = cpuid_edx(SVM_CPUID_FUNC);
+
+ if (!boot_cpu_has(X86_FEATURE_NPT))
+ npt_enabled = false;
+
+ if (npt_enabled && !npt) {
+ printk(KERN_INFO "kvm: Nested Paging disabled\n");
+ npt_enabled = false;
+ }
+
+ if (npt_enabled) {
+ printk(KERN_INFO "kvm: Nested Paging enabled\n");
+ kvm_enable_tdp();
+ } else
+ kvm_disable_tdp();
+
+ return 0;
+
+err:
+ __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
+ iopm_base = 0;
+ return r;
+}
+
+static __exit void svm_hardware_unsetup(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ svm_cpu_uninit(cpu);
+
+ __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
+ iopm_base = 0;
+}
+
+static void init_seg(struct vmcb_seg *seg)
+{
+ seg->selector = 0;
+ seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
+ SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
+ seg->limit = 0xffff;
+ seg->base = 0;
+}
+
+static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
+{
+ seg->selector = 0;
+ seg->attrib = SVM_SELECTOR_P_MASK | type;
+ seg->limit = 0xffff;
+ seg->base = 0;
+}
+
+static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 g_tsc_offset = 0;
+
+ if (is_guest_mode(vcpu)) {
+ g_tsc_offset = svm->vmcb->control.tsc_offset -
+ svm->nested.hsave->control.tsc_offset;
+ svm->nested.hsave->control.tsc_offset = offset;
+ }
+
+ svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
+
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+}
+
+static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.tsc_offset += adjustment;
+ if (is_guest_mode(vcpu))
+ svm->nested.hsave->control.tsc_offset += adjustment;
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+}
+
+static void init_vmcb(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct vmcb_save_area *save = &svm->vmcb->save;
+
+ svm->vcpu.fpu_active = 1;
+ svm->vcpu.arch.hflags = 0;
+
+ set_cr_intercept(svm, INTERCEPT_CR0_READ);
+ set_cr_intercept(svm, INTERCEPT_CR3_READ);
+ set_cr_intercept(svm, INTERCEPT_CR4_READ);
+ set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
+ set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
+ set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
+ set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+
+ set_dr_intercept(svm, INTERCEPT_DR0_READ);
+ set_dr_intercept(svm, INTERCEPT_DR1_READ);
+ set_dr_intercept(svm, INTERCEPT_DR2_READ);
+ set_dr_intercept(svm, INTERCEPT_DR3_READ);
+ set_dr_intercept(svm, INTERCEPT_DR4_READ);
+ set_dr_intercept(svm, INTERCEPT_DR5_READ);
+ set_dr_intercept(svm, INTERCEPT_DR6_READ);
+ set_dr_intercept(svm, INTERCEPT_DR7_READ);
+
+ set_dr_intercept(svm, INTERCEPT_DR0_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR1_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR2_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR3_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR4_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR5_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR6_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR7_WRITE);
+
+ set_exception_intercept(svm, PF_VECTOR);
+ set_exception_intercept(svm, UD_VECTOR);
+ set_exception_intercept(svm, MC_VECTOR);
+
+ set_intercept(svm, INTERCEPT_INTR);
+ set_intercept(svm, INTERCEPT_NMI);
+ set_intercept(svm, INTERCEPT_SMI);
+ set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+ set_intercept(svm, INTERCEPT_CPUID);
+ set_intercept(svm, INTERCEPT_INVD);
+ set_intercept(svm, INTERCEPT_HLT);
+ set_intercept(svm, INTERCEPT_INVLPG);
+ set_intercept(svm, INTERCEPT_INVLPGA);
+ set_intercept(svm, INTERCEPT_IOIO_PROT);
+ set_intercept(svm, INTERCEPT_MSR_PROT);
+ set_intercept(svm, INTERCEPT_TASK_SWITCH);
+ set_intercept(svm, INTERCEPT_SHUTDOWN);
+ set_intercept(svm, INTERCEPT_VMRUN);
+ set_intercept(svm, INTERCEPT_VMMCALL);
+ set_intercept(svm, INTERCEPT_VMLOAD);
+ set_intercept(svm, INTERCEPT_VMSAVE);
+ set_intercept(svm, INTERCEPT_STGI);
+ set_intercept(svm, INTERCEPT_CLGI);
+ set_intercept(svm, INTERCEPT_SKINIT);
+ set_intercept(svm, INTERCEPT_WBINVD);
+ set_intercept(svm, INTERCEPT_MONITOR);
+ set_intercept(svm, INTERCEPT_MWAIT);
+ set_intercept(svm, INTERCEPT_XSETBV);
+
+ control->iopm_base_pa = iopm_base;
+ control->msrpm_base_pa = __pa(svm->msrpm);
+ control->int_ctl = V_INTR_MASKING_MASK;
+
+ init_seg(&save->es);
+ init_seg(&save->ss);
+ init_seg(&save->ds);
+ init_seg(&save->fs);
+ init_seg(&save->gs);
+
+ save->cs.selector = 0xf000;
+ /* Executable/Readable Code Segment */
+ save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
+ SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
+ save->cs.limit = 0xffff;
+ /*
+ * cs.base should really be 0xffff0000, but vmx can't handle that, so
+ * be consistent with it.
+ *
+ * Replace when we have real mode working for vmx.
+ */
+ save->cs.base = 0xf0000;
+
+ save->gdtr.limit = 0xffff;
+ save->idtr.limit = 0xffff;
+
+ init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
+ init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
+
+ svm_set_efer(&svm->vcpu, 0);
+ save->dr6 = 0xffff0ff0;
+ save->dr7 = 0x400;
+ save->rflags = 2;
+ save->rip = 0x0000fff0;
+ svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
+
+ /*
+ * This is the guest-visible cr0 value.
+ * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
+ */
+ svm->vcpu.arch.cr0 = 0;
+ (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
+
+ save->cr4 = X86_CR4_PAE;
+ /* rdx = ?? */
+
+ if (npt_enabled) {
+ /* Setup VMCB for Nested Paging */
+ control->nested_ctl = 1;
+ clr_intercept(svm, INTERCEPT_TASK_SWITCH);
+ clr_intercept(svm, INTERCEPT_INVLPG);
+ clr_exception_intercept(svm, PF_VECTOR);
+ clr_cr_intercept(svm, INTERCEPT_CR3_READ);
+ clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
+ save->g_pat = 0x0007040600070406ULL;
+ save->cr3 = 0;
+ save->cr4 = 0;
+ }
+ svm->asid_generation = 0;
+
+ svm->nested.vmcb = 0;
+ svm->vcpu.arch.hflags = 0;
+
+ if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
+ control->pause_filter_count = 3000;
+ set_intercept(svm, INTERCEPT_PAUSE);
+ }
+
+ mark_all_dirty(svm->vmcb);
+
+ enable_gif(svm);
+}
+
+static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ init_vmcb(svm);
+
+ if (!kvm_vcpu_is_bsp(vcpu)) {
+ kvm_rip_write(vcpu, 0);
+ svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
+ svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
+ }
+ vcpu->arch.regs_avail = ~0;
+ vcpu->arch.regs_dirty = ~0;
+
+ return 0;
+}
+
+static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
+{
+ struct vcpu_svm *svm;
+ struct page *page;
+ struct page *msrpm_pages;
+ struct page *hsave_page;
+ struct page *nested_msrpm_pages;
+ int err;
+
+ svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+ if (!svm) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = kvm_vcpu_init(&svm->vcpu, kvm, id);
+ if (err)
+ goto free_svm;
+
+ err = -ENOMEM;
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ goto uninit;
+
+ msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+ if (!msrpm_pages)
+ goto free_page1;
+
+ nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+ if (!nested_msrpm_pages)
+ goto free_page2;
+
+ hsave_page = alloc_page(GFP_KERNEL);
+ if (!hsave_page)
+ goto free_page3;
+
+ svm->nested.hsave = page_address(hsave_page);
+
+ svm->msrpm = page_address(msrpm_pages);
+ svm_vcpu_init_msrpm(svm->msrpm);
+
+ svm->nested.msrpm = page_address(nested_msrpm_pages);
+ svm_vcpu_init_msrpm(svm->nested.msrpm);
+
+ svm->vmcb = page_address(page);
+ clear_page(svm->vmcb);
+ svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
+ svm->asid_generation = 0;
+ init_vmcb(svm);
+ kvm_write_tsc(&svm->vcpu, 0);
+
+ err = fx_init(&svm->vcpu);
+ if (err)
+ goto free_page4;
+
+ svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
+ if (kvm_vcpu_is_bsp(&svm->vcpu))
+ svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+
+ return &svm->vcpu;
+
+free_page4:
+ __free_page(hsave_page);
+free_page3:
+ __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
+free_page2:
+ __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
+free_page1:
+ __free_page(page);
+uninit:
+ kvm_vcpu_uninit(&svm->vcpu);
+free_svm:
+ kmem_cache_free(kvm_vcpu_cache, svm);
+out:
+ return ERR_PTR(err);
+}
+
+static void svm_free_vcpu(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
+ __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
+ __free_page(virt_to_page(svm->nested.hsave));
+ __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
+ kvm_vcpu_uninit(vcpu);
+ kmem_cache_free(kvm_vcpu_cache, svm);
+}
+
+static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int i;
+
+ if (unlikely(cpu != vcpu->cpu)) {
+ svm->asid_generation = 0;
+ mark_all_dirty(svm->vmcb);
+ }
+
+#ifdef CONFIG_X86_64
+ rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
+#endif
+ savesegment(fs, svm->host.fs);
+ savesegment(gs, svm->host.gs);
+ svm->host.ldt = kvm_read_ldt();
+
+ for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+ rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+}
+
+static void svm_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int i;
+
+ ++vcpu->stat.host_state_reload;
+ kvm_load_ldt(svm->host.ldt);
+#ifdef CONFIG_X86_64
+ loadsegment(fs, svm->host.fs);
+ wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
+ load_gs_index(svm->host.gs);
+#else
+ loadsegment(gs, svm->host.gs);
+#endif
+ for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+ wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+}
+
+static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
+{
+ return to_svm(vcpu)->vmcb->save.rflags;
+}
+
+static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ to_svm(vcpu)->vmcb->save.rflags = rflags;
+}
+
+static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+{
+ switch (reg) {
+ case VCPU_EXREG_PDPTR:
+ BUG_ON(!npt_enabled);
+ load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
+ break;
+ default:
+ BUG();
+ }
+}
+
+static void svm_set_vintr(struct vcpu_svm *svm)
+{
+ set_intercept(svm, INTERCEPT_VINTR);
+}
+
+static void svm_clear_vintr(struct vcpu_svm *svm)
+{
+ clr_intercept(svm, INTERCEPT_VINTR);
+}
+
+static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
+{
+ struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+
+ switch (seg) {
+ case VCPU_SREG_CS: return &save->cs;
+ case VCPU_SREG_DS: return &save->ds;
+ case VCPU_SREG_ES: return &save->es;
+ case VCPU_SREG_FS: return &save->fs;
+ case VCPU_SREG_GS: return &save->gs;
+ case VCPU_SREG_SS: return &save->ss;
+ case VCPU_SREG_TR: return &save->tr;
+ case VCPU_SREG_LDTR: return &save->ldtr;
+ }
+ BUG();
+ return NULL;
+}
+
+static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ struct vmcb_seg *s = svm_seg(vcpu, seg);
+
+ return s->base;
+}
+
+static void svm_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ struct vmcb_seg *s = svm_seg(vcpu, seg);
+
+ var->base = s->base;
+ var->limit = s->limit;
+ var->selector = s->selector;
+ var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
+ var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
+ var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
+ var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
+ var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
+ var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
+ var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
+ var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
+
+ /*
+ * AMD's VMCB does not have an explicit unusable field, so emulate it
+ * for cross vendor migration purposes by "not present"
+ */
+ var->unusable = !var->present || (var->type == 0);
+
+ switch (seg) {
+ case VCPU_SREG_CS:
+ /*
+ * SVM always stores 0 for the 'G' bit in the CS selector in
+ * the VMCB on a VMEXIT. This hurts cross-vendor migration:
+ * Intel's VMENTRY has a check on the 'G' bit.
+ */
+ var->g = s->limit > 0xfffff;
+ break;
+ case VCPU_SREG_TR:
+ /*
+ * Work around a bug where the busy flag in the tr selector
+ * isn't exposed
+ */
+ var->type |= 0x2;
+ break;
+ case VCPU_SREG_DS:
+ case VCPU_SREG_ES:
+ case VCPU_SREG_FS:
+ case VCPU_SREG_GS:
+ /*
+ * The accessed bit must always be set in the segment
+ * descriptor cache, although it can be cleared in the
+ * descriptor, the cached bit always remains at 1. Since
+ * Intel has a check on this, set it here to support
+ * cross-vendor migration.
+ */
+ if (!var->unusable)
+ var->type |= 0x1;
+ break;
+ case VCPU_SREG_SS:
+ /*
+ * On AMD CPUs sometimes the DB bit in the segment
+ * descriptor is left as 1, although the whole segment has
+ * been made unusable. Clear it here to pass an Intel VMX
+ * entry check when cross vendor migrating.
+ */
+ if (var->unusable)
+ var->db = 0;
+ break;
+ }
+}
+
+static int svm_get_cpl(struct kvm_vcpu *vcpu)
+{
+ struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+
+ return save->cpl;
+}
+
+static void svm_get_idt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ dt->size = svm->vmcb->save.idtr.limit;
+ dt->address = svm->vmcb->save.idtr.base;
+}
+
+static void svm_set_idt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.idtr.limit = dt->size;
+ svm->vmcb->save.idtr.base = dt->address ;
+ mark_dirty(svm->vmcb, VMCB_DT);
+}
+
+static void svm_get_gdt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ dt->size = svm->vmcb->save.gdtr.limit;
+ dt->address = svm->vmcb->save.gdtr.base;
+}
+
+static void svm_set_gdt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.gdtr.limit = dt->size;
+ svm->vmcb->save.gdtr.base = dt->address ;
+ mark_dirty(svm->vmcb, VMCB_DT);
+}
+
+static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
+{
+}
+
+static void svm_decache_cr3(struct kvm_vcpu *vcpu)
+{
+}
+
+static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
+{
+}
+
+static void update_cr0_intercept(struct vcpu_svm *svm)
+{
+ ulong gcr0 = svm->vcpu.arch.cr0;
+ u64 *hcr0 = &svm->vmcb->save.cr0;
+
+ if (!svm->vcpu.fpu_active)
+ *hcr0 |= SVM_CR0_SELECTIVE_MASK;
+ else
+ *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
+ | (gcr0 & SVM_CR0_SELECTIVE_MASK);
+
+ mark_dirty(svm->vmcb, VMCB_CR);
+
+ if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
+ clr_cr_intercept(svm, INTERCEPT_CR0_READ);
+ clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
+ } else {
+ set_cr_intercept(svm, INTERCEPT_CR0_READ);
+ set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
+ }
+}
+
+static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (is_guest_mode(vcpu)) {
+ /*
+ * We are here because we run in nested mode, the host kvm
+ * intercepts cr0 writes but the l1 hypervisor does not.
+ * But the L1 hypervisor may intercept selective cr0 writes.
+ * This needs to be checked here.
+ */
+ unsigned long old, new;
+
+ /* Remove bits that would trigger a real cr0 write intercept */
+ old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK;
+ new = cr0 & SVM_CR0_SELECTIVE_MASK;
+
+ if (old == new) {
+ /* cr0 write with ts and mp unchanged */
+ svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+ if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) {
+ svm->nested.vmexit_rip = kvm_rip_read(vcpu);
+ svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+ svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ return;
+ }
+ }
+ }
+
+#ifdef CONFIG_X86_64
+ if (vcpu->arch.efer & EFER_LME) {
+ if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
+ vcpu->arch.efer |= EFER_LMA;
+ svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
+ }
+
+ if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
+ vcpu->arch.efer &= ~EFER_LMA;
+ svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
+ }
+ }
+#endif
+ vcpu->arch.cr0 = cr0;
+
+ if (!npt_enabled)
+ cr0 |= X86_CR0_PG | X86_CR0_WP;
+
+ if (!vcpu->fpu_active)
+ cr0 |= X86_CR0_TS;
+ /*
+ * re-enable caching here because the QEMU bios
+ * does not do it - this results in some delay at
+ * reboot
+ */
+ cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
+ svm->vmcb->save.cr0 = cr0;
+ mark_dirty(svm->vmcb, VMCB_CR);
+ update_cr0_intercept(svm);
+}
+
+static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
+ unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
+
+ if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
+ svm_flush_tlb(vcpu);
+
+ vcpu->arch.cr4 = cr4;
+ if (!npt_enabled)
+ cr4 |= X86_CR4_PAE;
+ cr4 |= host_cr4_mce;
+ to_svm(vcpu)->vmcb->save.cr4 = cr4;
+ mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
+}
+
+static void svm_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_seg *s = svm_seg(vcpu, seg);
+
+ s->base = var->base;
+ s->limit = var->limit;
+ s->selector = var->selector;
+ if (var->unusable)
+ s->attrib = 0;
+ else {
+ s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
+ s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
+ s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
+ s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
+ s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
+ s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
+ s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
+ s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
+ }
+ if (seg == VCPU_SREG_CS)
+ svm->vmcb->save.cpl
+ = (svm->vmcb->save.cs.attrib
+ >> SVM_SELECTOR_DPL_SHIFT) & 3;
+
+ mark_dirty(svm->vmcb, VMCB_SEG);
+}
+
+static void update_db_intercept(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ clr_exception_intercept(svm, DB_VECTOR);
+ clr_exception_intercept(svm, BP_VECTOR);
+
+ if (svm->nmi_singlestep)
+ set_exception_intercept(svm, DB_VECTOR);
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
+ if (vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+ set_exception_intercept(svm, DB_VECTOR);
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+ set_exception_intercept(svm, BP_VECTOR);
+ } else
+ vcpu->guest_debug = 0;
+}
+
+static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
+ else
+ svm->vmcb->save.dr7 = vcpu->arch.dr7;
+
+ mark_dirty(svm->vmcb, VMCB_DR);
+
+ update_db_intercept(vcpu);
+}
+
+static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
+{
+ if (sd->next_asid > sd->max_asid) {
+ ++sd->asid_generation;
+ sd->next_asid = 1;
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
+ }
+
+ svm->asid_generation = sd->asid_generation;
+ svm->vmcb->control.asid = sd->next_asid++;
+
+ mark_dirty(svm->vmcb, VMCB_ASID);
+}
+
+static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.dr7 = value;
+ mark_dirty(svm->vmcb, VMCB_DR);
+}
+
+static int pf_interception(struct vcpu_svm *svm)
+{
+ u64 fault_address = svm->vmcb->control.exit_info_2;
+ u32 error_code;
+ int r = 1;
+
+ switch (svm->apf_reason) {
+ default:
+ error_code = svm->vmcb->control.exit_info_1;
+
+ trace_kvm_page_fault(fault_address, error_code);
+ if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
+ kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
+ r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
+ svm->vmcb->control.insn_bytes,
+ svm->vmcb->control.insn_len);
+ break;
+ case KVM_PV_REASON_PAGE_NOT_PRESENT:
+ svm->apf_reason = 0;
+ local_irq_disable();
+ kvm_async_pf_task_wait(fault_address);
+ local_irq_enable();
+ break;
+ case KVM_PV_REASON_PAGE_READY:
+ svm->apf_reason = 0;
+ local_irq_disable();
+ kvm_async_pf_task_wake(fault_address);
+ local_irq_enable();
+ break;
+ }
+ return r;
+}
+
+static int db_interception(struct vcpu_svm *svm)
+{
+ struct kvm_run *kvm_run = svm->vcpu.run;
+
+ if (!(svm->vcpu.guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
+ !svm->nmi_singlestep) {
+ kvm_queue_exception(&svm->vcpu, DB_VECTOR);
+ return 1;
+ }
+
+ if (svm->nmi_singlestep) {
+ svm->nmi_singlestep = false;
+ if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
+ svm->vmcb->save.rflags &=
+ ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
+ update_db_intercept(&svm->vcpu);
+ }
+
+ if (svm->vcpu.guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ kvm_run->debug.arch.pc =
+ svm->vmcb->save.cs.base + svm->vmcb->save.rip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int bp_interception(struct vcpu_svm *svm)
+{
+ struct kvm_run *kvm_run = svm->vcpu.run;
+
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
+ kvm_run->debug.arch.exception = BP_VECTOR;
+ return 0;
+}
+
+static int ud_interception(struct vcpu_svm *svm)
+{
+ int er;
+
+ er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
+ if (er != EMULATE_DONE)
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+}
+
+static void svm_fpu_activate(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ clr_exception_intercept(svm, NM_VECTOR);
+
+ svm->vcpu.fpu_active = 1;
+ update_cr0_intercept(svm);
+}
+
+static int nm_interception(struct vcpu_svm *svm)
+{
+ svm_fpu_activate(&svm->vcpu);
+ return 1;
+}
+
+static bool is_erratum_383(void)
+{
+ int err, i;
+ u64 value;
+
+ if (!erratum_383_found)
+ return false;
+
+ value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
+ if (err)
+ return false;
+
+ /* Bit 62 may or may not be set for this mce */
+ value &= ~(1ULL << 62);
+
+ if (value != 0xb600000000010015ULL)
+ return false;
+
+ /* Clear MCi_STATUS registers */
+ for (i = 0; i < 6; ++i)
+ kvm_native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
+
+ value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
+ if (!err) {
+ u32 low, high;
+
+ value &= ~(1ULL << 2);
+ low = lower_32_bits(value);
+ high = upper_32_bits(value);
+
+ kvm_native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
+ }
+
+ /* Flush tlb to evict multi-match entries */
+ __flush_tlb_all();
+
+ return true;
+}
+
+static void svm_handle_mce(struct vcpu_svm *svm)
+{
+ if (is_erratum_383()) {
+ /*
+ * Erratum 383 triggered. Guest state is corrupt so kill the
+ * guest.
+ */
+ pr_err("KVM: Guest triggered AMD Erratum 383\n");
+
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
+
+ return;
+ }
+
+ /*
+ * On an #MC intercept the MCE handler is not called automatically in
+ * the host. So do it by hand here.
+ */
+ asm volatile (
+ "int $0x12\n");
+ /* not sure if we ever come back to this point */
+
+ return;
+}
+
+static int mc_interception(struct vcpu_svm *svm)
+{
+ return 1;
+}
+
+static int shutdown_interception(struct vcpu_svm *svm)
+{
+ struct kvm_run *kvm_run = svm->vcpu.run;
+
+ /*
+ * VMCB is undefined after a SHUTDOWN intercept
+ * so reinitialize it.
+ */
+ clear_page(svm->vmcb);
+ init_vmcb(svm);
+
+ kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
+ return 0;
+}
+
+static int io_interception(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
+ int size, in, string;
+ unsigned port;
+
+ ++svm->vcpu.stat.io_exits;
+ string = (io_info & SVM_IOIO_STR_MASK) != 0;
+ in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
+ if (string || in)
+ return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+
+ port = io_info >> 16;
+ size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
+ svm->next_rip = svm->vmcb->control.exit_info_2;
+ skip_emulated_instruction(&svm->vcpu);
+
+ return kvm_fast_pio_out(vcpu, size, port);
+}
+
+static int nmi_interception(struct vcpu_svm *svm)
+{
+ return 1;
+}
+
+static int intr_interception(struct vcpu_svm *svm)
+{
+ ++svm->vcpu.stat.irq_exits;
+ return 1;
+}
+
+static int nop_on_interception(struct vcpu_svm *svm)
+{
+ return 1;
+}
+
+static int halt_interception(struct vcpu_svm *svm)
+{
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
+ skip_emulated_instruction(&svm->vcpu);
+ return kvm_emulate_halt(&svm->vcpu);
+}
+
+static int vmmcall_interception(struct vcpu_svm *svm)
+{
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
+ kvm_emulate_hypercall(&svm->vcpu);
+ return 1;
+}
+
+static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return svm->nested.nested_cr3;
+}
+
+static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
+ unsigned long root)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.nested_cr3 = root;
+ mark_dirty(svm->vmcb, VMCB_NPT);
+ svm_flush_tlb(vcpu);
+}
+
+static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.exit_code = SVM_EXIT_NPF;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = fault->error_code;
+ svm->vmcb->control.exit_info_2 = fault->address;
+
+ nested_svm_vmexit(svm);
+}
+
+static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
+
+ vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
+ vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
+ vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
+ vcpu->arch.mmu.shadow_root_level = get_npt_level();
+ vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
+
+ return r;
+}
+
+static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+}
+
+static int nested_svm_check_permissions(struct vcpu_svm *svm)
+{
+ if (!(svm->vcpu.arch.efer & EFER_SVME)
+ || !is_paging(&svm->vcpu)) {
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (svm->vmcb->save.cpl) {
+ kvm_inject_gp(&svm->vcpu, 0);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
+ bool has_error_code, u32 error_code)
+{
+ int vmexit;
+
+ if (!is_guest_mode(&svm->vcpu))
+ return 0;
+
+ svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = error_code;
+ svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
+
+ vmexit = nested_svm_intercept(svm);
+ if (vmexit == NESTED_EXIT_DONE)
+ svm->nested.exit_required = true;
+
+ return vmexit;
+}
+
+/* This function returns true if it is save to enable the irq window */
+static inline bool nested_svm_intr(struct vcpu_svm *svm)
+{
+ if (!is_guest_mode(&svm->vcpu))
+ return true;
+
+ if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
+ return true;
+
+ if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
+ return false;
+
+ /*
+ * if vmexit was already requested (by intercepted exception
+ * for instance) do not overwrite it with "external interrupt"
+ * vmexit.
+ */
+ if (svm->nested.exit_required)
+ return false;
+
+ svm->vmcb->control.exit_code = SVM_EXIT_INTR;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+
+ if (svm->nested.intercept & 1ULL) {
+ /*
+ * The #vmexit can't be emulated here directly because this
+ * code path runs with irqs and preemtion disabled. A
+ * #vmexit emulation might sleep. Only signal request for
+ * the #vmexit here.
+ */
+ svm->nested.exit_required = true;
+ trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
+ return false;
+ }
+
+ return true;
+}
+
+/* This function returns true if it is save to enable the nmi window */
+static inline bool nested_svm_nmi(struct vcpu_svm *svm)
+{
+ if (!is_guest_mode(&svm->vcpu))
+ return true;
+
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
+ return true;
+
+ svm->vmcb->control.exit_code = SVM_EXIT_NMI;
+ svm->nested.exit_required = true;
+
+ return false;
+}
+
+static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
+{
+ struct page *page;
+
+ might_sleep();
+
+ page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
+ if (is_error_page(page))
+ goto error;
+
+ *_page = page;
+
+ return kmap(page);
+
+error:
+ kvm_release_page_clean(page);
+ kvm_inject_gp(&svm->vcpu, 0);
+
+ return NULL;
+}
+
+static void nested_svm_unmap(struct page *page)
+{
+ kunmap(page);
+ kvm_release_page_dirty(page);
+}
+
+static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
+{
+ unsigned port;
+ u8 val, bit;
+ u64 gpa;
+
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
+ return NESTED_EXIT_HOST;
+
+ port = svm->vmcb->control.exit_info_1 >> 16;
+ gpa = svm->nested.vmcb_iopm + (port / 8);
+ bit = port % 8;
+ val = 0;
+
+ if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1))
+ val &= (1 << bit);
+
+ return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+}
+
+static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
+{
+ u32 offset, msr, value;
+ int write, mask;
+
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+ return NESTED_EXIT_HOST;
+
+ msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+ offset = svm_msrpm_offset(msr);
+ write = svm->vmcb->control.exit_info_1 & 1;
+ mask = 1 << ((2 * (msr & 0xf)) + write);
+
+ if (offset == MSR_INVALID)
+ return NESTED_EXIT_DONE;
+
+ /* Offset is in 32 bit units but need in 8 bit units */
+ offset *= 4;
+
+ if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4))
+ return NESTED_EXIT_DONE;
+
+ return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+}
+
+static int nested_svm_exit_special(struct vcpu_svm *svm)
+{
+ u32 exit_code = svm->vmcb->control.exit_code;
+
+ switch (exit_code) {
+ case SVM_EXIT_INTR:
+ case SVM_EXIT_NMI:
+ case SVM_EXIT_EXCP_BASE + MC_VECTOR:
+ return NESTED_EXIT_HOST;
+ case SVM_EXIT_NPF:
+ /* For now we are always handling NPFs when using them */
+ if (npt_enabled)
+ return NESTED_EXIT_HOST;
+ break;
+ case SVM_EXIT_EXCP_BASE + PF_VECTOR:
+ /* When we're shadowing, trap PFs, but not async PF */
+ if (!npt_enabled && svm->apf_reason == 0)
+ return NESTED_EXIT_HOST;
+ break;
+ case SVM_EXIT_EXCP_BASE + NM_VECTOR:
+ nm_interception(svm);
+ break;
+ default:
+ break;
+ }
+
+ return NESTED_EXIT_CONTINUE;
+}
+
+/*
+ * If this function returns true, this #vmexit was already handled
+ */
+static int nested_svm_intercept(struct vcpu_svm *svm)
+{
+ u32 exit_code = svm->vmcb->control.exit_code;
+ int vmexit = NESTED_EXIT_HOST;
+
+ switch (exit_code) {
+ case SVM_EXIT_MSR:
+ vmexit = nested_svm_exit_handled_msr(svm);
+ break;
+ case SVM_EXIT_IOIO:
+ vmexit = nested_svm_intercept_ioio(svm);
+ break;
+ case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
+ u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
+ if (svm->nested.intercept_cr & bit)
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
+ u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
+ if (svm->nested.intercept_dr & bit)
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
+ u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
+ if (svm->nested.intercept_exceptions & excp_bits)
+ vmexit = NESTED_EXIT_DONE;
+ /* async page fault always cause vmexit */
+ else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
+ svm->apf_reason != 0)
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ case SVM_EXIT_ERR: {
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ default: {
+ u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
+ if (svm->nested.intercept & exit_bits)
+ vmexit = NESTED_EXIT_DONE;
+ }
+ }
+
+ return vmexit;
+}
+
+static int nested_svm_exit_handled(struct vcpu_svm *svm)
+{
+ int vmexit;
+
+ vmexit = nested_svm_intercept(svm);
+
+ if (vmexit == NESTED_EXIT_DONE)
+ nested_svm_vmexit(svm);
+
+ return vmexit;
+}
+
+static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
+{
+ struct vmcb_control_area *dst = &dst_vmcb->control;
+ struct vmcb_control_area *from = &from_vmcb->control;
+
+ dst->intercept_cr = from->intercept_cr;
+ dst->intercept_dr = from->intercept_dr;
+ dst->intercept_exceptions = from->intercept_exceptions;
+ dst->intercept = from->intercept;
+ dst->iopm_base_pa = from->iopm_base_pa;
+ dst->msrpm_base_pa = from->msrpm_base_pa;
+ dst->tsc_offset = from->tsc_offset;
+ dst->asid = from->asid;
+ dst->tlb_ctl = from->tlb_ctl;
+ dst->int_ctl = from->int_ctl;
+ dst->int_vector = from->int_vector;
+ dst->int_state = from->int_state;
+ dst->exit_code = from->exit_code;
+ dst->exit_code_hi = from->exit_code_hi;
+ dst->exit_info_1 = from->exit_info_1;
+ dst->exit_info_2 = from->exit_info_2;
+ dst->exit_int_info = from->exit_int_info;
+ dst->exit_int_info_err = from->exit_int_info_err;
+ dst->nested_ctl = from->nested_ctl;
+ dst->event_inj = from->event_inj;
+ dst->event_inj_err = from->event_inj_err;
+ dst->nested_cr3 = from->nested_cr3;
+ dst->lbr_ctl = from->lbr_ctl;
+}
+
+static int nested_svm_vmexit(struct vcpu_svm *svm)
+{
+ struct vmcb *nested_vmcb;
+ struct vmcb *hsave = svm->nested.hsave;
+ struct vmcb *vmcb = svm->vmcb;
+ struct page *page;
+
+ trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
+ vmcb->control.exit_info_1,
+ vmcb->control.exit_info_2,
+ vmcb->control.exit_int_info,
+ vmcb->control.exit_int_info_err);
+
+ nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
+ if (!nested_vmcb)
+ return 1;
+
+ /* Exit Guest-Mode */
+ leave_guest_mode(&svm->vcpu);
+ svm->nested.vmcb = 0;
+
+ /* Give the current vmcb to the guest */
+ disable_gif(svm);
+
+ nested_vmcb->save.es = vmcb->save.es;
+ nested_vmcb->save.cs = vmcb->save.cs;
+ nested_vmcb->save.ss = vmcb->save.ss;
+ nested_vmcb->save.ds = vmcb->save.ds;
+ nested_vmcb->save.gdtr = vmcb->save.gdtr;
+ nested_vmcb->save.idtr = vmcb->save.idtr;
+ nested_vmcb->save.efer = svm->vcpu.arch.efer;
+ nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
+ nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
+ nested_vmcb->save.cr2 = vmcb->save.cr2;
+ nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
+ nested_vmcb->save.rflags = vmcb->save.rflags;
+ nested_vmcb->save.rip = vmcb->save.rip;
+ nested_vmcb->save.rsp = vmcb->save.rsp;
+ nested_vmcb->save.rax = vmcb->save.rax;
+ nested_vmcb->save.dr7 = vmcb->save.dr7;
+ nested_vmcb->save.dr6 = vmcb->save.dr6;
+ nested_vmcb->save.cpl = vmcb->save.cpl;
+
+ nested_vmcb->control.int_ctl = vmcb->control.int_ctl;
+ nested_vmcb->control.int_vector = vmcb->control.int_vector;
+ nested_vmcb->control.int_state = vmcb->control.int_state;
+ nested_vmcb->control.exit_code = vmcb->control.exit_code;
+ nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi;
+ nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1;
+ nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
+ nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
+ nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
+ nested_vmcb->control.next_rip = vmcb->control.next_rip;
+
+ /*
+ * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
+ * to make sure that we do not lose injected events. So check event_inj
+ * here and copy it to exit_int_info if it is valid.
+ * Exit_int_info and event_inj can't be both valid because the case
+ * below only happens on a VMRUN instruction intercept which has
+ * no valid exit_int_info set.
+ */
+ if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
+ struct vmcb_control_area *nc = &nested_vmcb->control;
+
+ nc->exit_int_info = vmcb->control.event_inj;
+ nc->exit_int_info_err = vmcb->control.event_inj_err;
+ }
+
+ nested_vmcb->control.tlb_ctl = 0;
+ nested_vmcb->control.event_inj = 0;
+ nested_vmcb->control.event_inj_err = 0;
+
+ /* We always set V_INTR_MASKING and remember the old value in hflags */
+ if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
+ nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
+
+ /* Restore the original control entries */
+ copy_vmcb_control_area(vmcb, hsave);
+
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
+
+ svm->nested.nested_cr3 = 0;
+
+ /* Restore selected save entries */
+ svm->vmcb->save.es = hsave->save.es;
+ svm->vmcb->save.cs = hsave->save.cs;
+ svm->vmcb->save.ss = hsave->save.ss;
+ svm->vmcb->save.ds = hsave->save.ds;
+ svm->vmcb->save.gdtr = hsave->save.gdtr;
+ svm->vmcb->save.idtr = hsave->save.idtr;
+ svm->vmcb->save.rflags = hsave->save.rflags;
+ svm_set_efer(&svm->vcpu, hsave->save.efer);
+ svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
+ svm_set_cr4(&svm->vcpu, hsave->save.cr4);
+ if (npt_enabled) {
+ svm->vmcb->save.cr3 = hsave->save.cr3;
+ svm->vcpu.arch.cr3 = hsave->save.cr3;
+ } else {
+ (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
+ }
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
+ svm->vmcb->save.dr7 = 0;
+ svm->vmcb->save.cpl = 0;
+ svm->vmcb->control.exit_int_info = 0;
+
+ mark_all_dirty(svm->vmcb);
+
+ nested_svm_unmap(page);
+
+ nested_svm_uninit_mmu_context(&svm->vcpu);
+ kvm_mmu_reset_context(&svm->vcpu);
+ kvm_mmu_load(&svm->vcpu);
+
+ return 0;
+}
+
+static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
+{
+ /*
+ * This function merges the msr permission bitmaps of kvm and the
+ * nested vmcb. It is omptimized in that it only merges the parts where
+ * the kvm msr permission bitmap may contain zero bits
+ */
+ int i;
+
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+ return true;
+
+ for (i = 0; i < MSRPM_OFFSETS; i++) {
+ u32 value, p;
+ u64 offset;
+
+ if (msrpm_offsets[i] == 0xffffffff)
+ break;
+
+ p = msrpm_offsets[i];
+ offset = svm->nested.vmcb_msrpm + (p * 4);
+
+ if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
+ return false;
+
+ svm->nested.msrpm[p] = svm->msrpm[p] | value;
+ }
+
+ svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
+
+ return true;
+}
+
+static bool nested_vmcb_checks(struct vmcb *vmcb)
+{
+ if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
+ return false;
+
+ if (vmcb->control.asid == 0)
+ return false;
+
+ if (vmcb->control.nested_ctl && !npt_enabled)
+ return false;
+
+ return true;
+}
+
+static bool nested_svm_vmrun(struct vcpu_svm *svm)
+{
+ struct vmcb *nested_vmcb;
+ struct vmcb *hsave = svm->nested.hsave;
+ struct vmcb *vmcb = svm->vmcb;
+ struct page *page;
+ u64 vmcb_gpa;
+
+ vmcb_gpa = svm->vmcb->save.rax;
+
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
+ if (!nested_vmcb)
+ return false;
+
+ if (!nested_vmcb_checks(nested_vmcb)) {
+ nested_vmcb->control.exit_code = SVM_EXIT_ERR;
+ nested_vmcb->control.exit_code_hi = 0;
+ nested_vmcb->control.exit_info_1 = 0;
+ nested_vmcb->control.exit_info_2 = 0;
+
+ nested_svm_unmap(page);
+
+ return false;
+ }
+
+ trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
+ nested_vmcb->save.rip,
+ nested_vmcb->control.int_ctl,
+ nested_vmcb->control.event_inj,
+ nested_vmcb->control.nested_ctl);
+
+ trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
+ nested_vmcb->control.intercept_cr >> 16,
+ nested_vmcb->control.intercept_exceptions,
+ nested_vmcb->control.intercept);
+
+ /* Clear internal status */
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
+
+ /*
+ * Save the old vmcb, so we don't need to pick what we save, but can
+ * restore everything when a VMEXIT occurs
+ */
+ hsave->save.es = vmcb->save.es;
+ hsave->save.cs = vmcb->save.cs;
+ hsave->save.ss = vmcb->save.ss;
+ hsave->save.ds = vmcb->save.ds;
+ hsave->save.gdtr = vmcb->save.gdtr;
+ hsave->save.idtr = vmcb->save.idtr;
+ hsave->save.efer = svm->vcpu.arch.efer;
+ hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
+ hsave->save.cr4 = svm->vcpu.arch.cr4;
+ hsave->save.rflags = vmcb->save.rflags;
+ hsave->save.rip = kvm_rip_read(&svm->vcpu);
+ hsave->save.rsp = vmcb->save.rsp;
+ hsave->save.rax = vmcb->save.rax;
+ if (npt_enabled)
+ hsave->save.cr3 = vmcb->save.cr3;
+ else
+ hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
+
+ copy_vmcb_control_area(hsave, vmcb);
+
+ if (svm->vmcb->save.rflags & X86_EFLAGS_IF)
+ svm->vcpu.arch.hflags |= HF_HIF_MASK;
+ else
+ svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
+
+ if (nested_vmcb->control.nested_ctl) {
+ kvm_mmu_unload(&svm->vcpu);
+ svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
+ nested_svm_init_mmu_context(&svm->vcpu);
+ }
+
+ /* Load the nested guest state */
+ svm->vmcb->save.es = nested_vmcb->save.es;
+ svm->vmcb->save.cs = nested_vmcb->save.cs;
+ svm->vmcb->save.ss = nested_vmcb->save.ss;
+ svm->vmcb->save.ds = nested_vmcb->save.ds;
+ svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
+ svm->vmcb->save.idtr = nested_vmcb->save.idtr;
+ svm->vmcb->save.rflags = nested_vmcb->save.rflags;
+ svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
+ svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
+ svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
+ if (npt_enabled) {
+ svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
+ svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
+ } else
+ (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
+
+ /* Guest paging mode is active - reset mmu */
+ kvm_mmu_reset_context(&svm->vcpu);
+
+ svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
+
+ /* In case we don't even reach vcpu_run, the fields are not updated */
+ svm->vmcb->save.rax = nested_vmcb->save.rax;
+ svm->vmcb->save.rsp = nested_vmcb->save.rsp;
+ svm->vmcb->save.rip = nested_vmcb->save.rip;
+ svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
+ svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
+ svm->vmcb->save.cpl = nested_vmcb->save.cpl;
+
+ svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
+ svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
+
+ /* cache intercepts */
+ svm->nested.intercept_cr = nested_vmcb->control.intercept_cr;
+ svm->nested.intercept_dr = nested_vmcb->control.intercept_dr;
+ svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
+ svm->nested.intercept = nested_vmcb->control.intercept;
+
+ svm_flush_tlb(&svm->vcpu);
+ svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
+ if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
+ svm->vcpu.arch.hflags |= HF_VINTR_MASK;
+ else
+ svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
+
+ if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
+ /* We only want the cr8 intercept bits of the guest */
+ clr_cr_intercept(svm, INTERCEPT_CR8_READ);
+ clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+ }
+
+ /* We don't want to see VMMCALLs from a nested guest */
+ clr_intercept(svm, INTERCEPT_VMMCALL);
+
+ svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
+ svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
+ svm->vmcb->control.int_state = nested_vmcb->control.int_state;
+ svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
+ svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
+ svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
+
+ nested_svm_unmap(page);
+
+ /* Enter Guest-Mode */
+ enter_guest_mode(&svm->vcpu);
+
+ /*
+ * Merge guest and host intercepts - must be called with vcpu in
+ * guest-mode to take affect here
+ */
+ recalc_intercepts(svm);
+
+ svm->nested.vmcb = vmcb_gpa;
+
+ enable_gif(svm);
+
+ mark_all_dirty(svm->vmcb);
+
+ return true;
+}
+
+static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+{
+ to_vmcb->save.fs = from_vmcb->save.fs;
+ to_vmcb->save.gs = from_vmcb->save.gs;
+ to_vmcb->save.tr = from_vmcb->save.tr;
+ to_vmcb->save.ldtr = from_vmcb->save.ldtr;
+ to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
+ to_vmcb->save.star = from_vmcb->save.star;
+ to_vmcb->save.lstar = from_vmcb->save.lstar;
+ to_vmcb->save.cstar = from_vmcb->save.cstar;
+ to_vmcb->save.sfmask = from_vmcb->save.sfmask;
+ to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
+ to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
+ to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
+}
+
+static int vmload_interception(struct vcpu_svm *svm)
+{
+ struct vmcb *nested_vmcb;
+ struct page *page;
+
+ if (nested_svm_check_permissions(svm))
+ return 1;
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
+
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
+ if (!nested_vmcb)
+ return 1;
+
+ nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
+ nested_svm_unmap(page);
+
+ return 1;
+}
+
+static int vmsave_interception(struct vcpu_svm *svm)
+{
+ struct vmcb *nested_vmcb;
+ struct page *page;
+
+ if (nested_svm_check_permissions(svm))
+ return 1;
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
+
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
+ if (!nested_vmcb)
+ return 1;
+
+ nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
+ nested_svm_unmap(page);
+
+ return 1;
+}
+
+static int vmrun_interception(struct vcpu_svm *svm)
+{
+ if (nested_svm_check_permissions(svm))
+ return 1;
+
+ /* Save rip after vmrun instruction */
+ kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
+
+ if (!nested_svm_vmrun(svm))
+ return 1;
+
+ if (!nested_svm_vmrun_msrpm(svm))
+ goto failed;
+
+ return 1;
+
+failed:
+
+ svm->vmcb->control.exit_code = SVM_EXIT_ERR;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+
+ nested_svm_vmexit(svm);
+
+ return 1;
+}
+
+static int stgi_interception(struct vcpu_svm *svm)
+{
+ if (nested_svm_check_permissions(svm))
+ return 1;
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+
+ enable_gif(svm);
+
+ return 1;
+}
+
+static int clgi_interception(struct vcpu_svm *svm)
+{
+ if (nested_svm_check_permissions(svm))
+ return 1;
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
+
+ disable_gif(svm);
+
+ /* After a CLGI no interrupts should come */
+ svm_clear_vintr(svm);
+ svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+
+ mark_dirty(svm->vmcb, VMCB_INTR);
+
+ return 1;
+}
+
+static int invlpga_interception(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX],
+ vcpu->arch.regs[VCPU_REGS_RAX]);
+
+ /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
+ kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
+ return 1;
+}
+
+static int skinit_interception(struct vcpu_svm *svm)
+{
+ trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]);
+
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+}
+
+static int xsetbv_interception(struct vcpu_svm *svm)
+{
+ u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
+ u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
+
+ if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
+ }
+
+ return 1;
+}
+
+static int invalid_op_interception(struct vcpu_svm *svm)
+{
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+}
+
+static int task_switch_interception(struct vcpu_svm *svm)
+{
+ u16 tss_selector;
+ int reason;
+ int int_type = svm->vmcb->control.exit_int_info &
+ SVM_EXITINTINFO_TYPE_MASK;
+ int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
+ uint32_t type =
+ svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
+ uint32_t idt_v =
+ svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
+ bool has_error_code = false;
+ u32 error_code = 0;
+
+ tss_selector = (u16)svm->vmcb->control.exit_info_1;
+
+ if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
+ reason = TASK_SWITCH_IRET;
+ else if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
+ reason = TASK_SWITCH_JMP;
+ else if (idt_v)
+ reason = TASK_SWITCH_GATE;
+ else
+ reason = TASK_SWITCH_CALL;
+
+ if (reason == TASK_SWITCH_GATE) {
+ switch (type) {
+ case SVM_EXITINTINFO_TYPE_NMI:
+ svm->vcpu.arch.nmi_injected = false;
+ break;
+ case SVM_EXITINTINFO_TYPE_EXEPT:
+ if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
+ has_error_code = true;
+ error_code =
+ (u32)svm->vmcb->control.exit_info_2;
+ }
+ kvm_clear_exception_queue(&svm->vcpu);
+ break;
+ case SVM_EXITINTINFO_TYPE_INTR:
+ kvm_clear_interrupt_queue(&svm->vcpu);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (reason != TASK_SWITCH_GATE ||
+ int_type == SVM_EXITINTINFO_TYPE_SOFT ||
+ (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
+ (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
+ skip_emulated_instruction(&svm->vcpu);
+
+ if (kvm_task_switch(&svm->vcpu, tss_selector, reason,
+ has_error_code, error_code) == EMULATE_FAIL) {
+ svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ svm->vcpu.run->internal.ndata = 0;
+ return 0;
+ }
+ return 1;
+}
+
+static int cpuid_interception(struct vcpu_svm *svm)
+{
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
+ kvm_emulate_cpuid(&svm->vcpu);
+ return 1;
+}
+
+static int iret_interception(struct vcpu_svm *svm)
+{
+ ++svm->vcpu.stat.nmi_window_exits;
+ clr_intercept(svm, INTERCEPT_IRET);
+ svm->vcpu.arch.hflags |= HF_IRET_MASK;
+ return 1;
+}
+
+static int invlpg_interception(struct vcpu_svm *svm)
+{
+ if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+
+ kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
+ skip_emulated_instruction(&svm->vcpu);
+ return 1;
+}
+
+static int emulate_on_interception(struct vcpu_svm *svm)
+{
+ return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+}
+
+#define CR_VALID (1ULL << 63)
+
+static int cr_interception(struct vcpu_svm *svm)
+{
+ int reg, cr;
+ unsigned long val;
+ int err;
+
+ if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return emulate_on_interception(svm);
+
+ if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
+ return emulate_on_interception(svm);
+
+ reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+ cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
+
+ err = 0;
+ if (cr >= 16) { /* mov to cr */
+ cr -= 16;
+ val = kvm_register_read(&svm->vcpu, reg);
+ switch (cr) {
+ case 0:
+ err = kvm_set_cr0(&svm->vcpu, val);
+ break;
+ case 3:
+ err = kvm_set_cr3(&svm->vcpu, val);
+ break;
+ case 4:
+ err = kvm_set_cr4(&svm->vcpu, val);
+ break;
+ case 8:
+ err = kvm_set_cr8(&svm->vcpu, val);
+ break;
+ default:
+ WARN(1, "unhandled write to CR%d", cr);
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+ }
+ } else { /* mov from cr */
+ switch (cr) {
+ case 0:
+ val = kvm_read_cr0(&svm->vcpu);
+ break;
+ case 2:
+ val = svm->vcpu.arch.cr2;
+ break;
+ case 3:
+ val = kvm_read_cr3(&svm->vcpu);
+ break;
+ case 4:
+ val = kvm_read_cr4(&svm->vcpu);
+ break;
+ case 8:
+ val = kvm_get_cr8(&svm->vcpu);
+ break;
+ default:
+ WARN(1, "unhandled read from CR%d", cr);
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+ }
+ kvm_register_write(&svm->vcpu, reg, val);
+ }
+ kvm_complete_insn_gp(&svm->vcpu, err);
+
+ return 1;
+}
+
+static int cr0_write_interception(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ int r;
+
+ r = cr_interception(svm);
+
+ if (svm->nested.vmexit_rip) {
+ kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
+ kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax);
+ svm->nested.vmexit_rip = 0;
+ }
+
+ return r;
+}
+
+static int dr_interception(struct vcpu_svm *svm)
+{
+ int reg, dr;
+ unsigned long val;
+ int err;
+
+ if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return emulate_on_interception(svm);
+
+ reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+ dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
+
+ if (dr >= 16) { /* mov to DRn */
+ val = kvm_register_read(&svm->vcpu, reg);
+ kvm_set_dr(&svm->vcpu, dr - 16, val);
+ } else {
+ err = kvm_get_dr(&svm->vcpu, dr, &val);
+ if (!err)
+ kvm_register_write(&svm->vcpu, reg, val);
+ }
+
+ skip_emulated_instruction(&svm->vcpu);
+
+ return 1;
+}
+
+static int cr8_write_interception(struct vcpu_svm *svm)
+{
+ struct kvm_run *kvm_run = svm->vcpu.run;
+ int r;
+
+ u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
+ /* instruction emulation calls kvm_set_cr8() */
+ r = cr_interception(svm);
+ if (irqchip_in_kernel(svm->vcpu.kvm)) {
+ clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+ return r;
+ }
+ if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
+ return r;
+ kvm_run->exit_reason = KVM_EXIT_SET_TPR;
+ return 0;
+}
+
+static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ switch (ecx) {
+ case MSR_IA32_TSC: {
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ *data = vmcb->control.tsc_offset + kvm_native_read_tsc();
+ break;
+ }
+ case MSR_STAR:
+ *data = svm->vmcb->save.star;
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_LSTAR:
+ *data = svm->vmcb->save.lstar;
+ break;
+ case MSR_CSTAR:
+ *data = svm->vmcb->save.cstar;
+ break;
+ case MSR_KERNEL_GS_BASE:
+ *data = svm->vmcb->save.kernel_gs_base;
+ break;
+ case MSR_SYSCALL_MASK:
+ *data = svm->vmcb->save.sfmask;
+ break;
+#endif
+ case MSR_IA32_SYSENTER_CS:
+ *data = svm->vmcb->save.sysenter_cs;
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ *data = svm->sysenter_eip;
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ *data = svm->sysenter_esp;
+ break;
+ /*
+ * Nobody will change the following 5 values in the VMCB so we can
+ * safely return them on rdmsr. They will always be 0 until LBRV is
+ * implemented.
+ */
+ case MSR_IA32_DEBUGCTLMSR:
+ *data = svm->vmcb->save.dbgctl;
+ break;
+ case MSR_IA32_LASTBRANCHFROMIP:
+ *data = svm->vmcb->save.br_from;
+ break;
+ case MSR_IA32_LASTBRANCHTOIP:
+ *data = svm->vmcb->save.br_to;
+ break;
+ case MSR_IA32_LASTINTFROMIP:
+ *data = svm->vmcb->save.last_excp_from;
+ break;
+ case MSR_IA32_LASTINTTOIP:
+ *data = svm->vmcb->save.last_excp_to;
+ break;
+ case MSR_VM_HSAVE_PA:
+ *data = svm->nested.hsave_msr;
+ break;
+ case MSR_VM_CR:
+ *data = svm->nested.vm_cr_msr;
+ break;
+ case MSR_IA32_UCODE_REV:
+ *data = 0x01000065;
+ break;
+ default:
+ return kvm_get_msr_common(vcpu, ecx, data);
+ }
+ return 0;
+}
+
+static int rdmsr_interception(struct vcpu_svm *svm)
+{
+ u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+ u64 data;
+
+ if (svm_get_msr(&svm->vcpu, ecx, &data)) {
+ trace_kvm_msr_read_ex(ecx);
+ kvm_inject_gp(&svm->vcpu, 0);
+ } else {
+ trace_kvm_msr_read(ecx, data);
+
+ svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
+ svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
+ skip_emulated_instruction(&svm->vcpu);
+ }
+ return 1;
+}
+
+static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int svm_dis, chg_mask;
+
+ if (data & ~SVM_VM_CR_VALID_MASK)
+ return 1;
+
+ chg_mask = SVM_VM_CR_VALID_MASK;
+
+ if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
+ chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
+
+ svm->nested.vm_cr_msr &= ~chg_mask;
+ svm->nested.vm_cr_msr |= (data & chg_mask);
+
+ svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
+
+ /* check for svm_disable while efer.svme is set */
+ if (svm_dis && (vcpu->arch.efer & EFER_SVME))
+ return 1;
+
+ return 0;
+}
+
+static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ switch (ecx) {
+ case MSR_IA32_TSC:
+ kvm_write_tsc(vcpu, data);
+ break;
+ case MSR_STAR:
+ svm->vmcb->save.star = data;
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_LSTAR:
+ svm->vmcb->save.lstar = data;
+ break;
+ case MSR_CSTAR:
+ svm->vmcb->save.cstar = data;
+ break;
+ case MSR_KERNEL_GS_BASE:
+ svm->vmcb->save.kernel_gs_base = data;
+ break;
+ case MSR_SYSCALL_MASK:
+ svm->vmcb->save.sfmask = data;
+ break;
+#endif
+ case MSR_IA32_SYSENTER_CS:
+ svm->vmcb->save.sysenter_cs = data;
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ svm->sysenter_eip = data;
+ svm->vmcb->save.sysenter_eip = data;
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ svm->sysenter_esp = data;
+ svm->vmcb->save.sysenter_esp = data;
+ break;
+ case MSR_IA32_DEBUGCTLMSR:
+ if (!boot_cpu_has(X86_FEATURE_LBRV)) {
+ pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
+ __func__, data);
+ break;
+ }
+ if (data & DEBUGCTL_RESERVED_BITS)
+ return 1;
+
+ svm->vmcb->save.dbgctl = data;
+ mark_dirty(svm->vmcb, VMCB_LBR);
+ if (data & (1ULL<<0))
+ svm_enable_lbrv(svm);
+ else
+ svm_disable_lbrv(svm);
+ break;
+ case MSR_VM_HSAVE_PA:
+ svm->nested.hsave_msr = data;
+ break;
+ case MSR_VM_CR:
+ return svm_set_vm_cr(vcpu, data);
+ case MSR_VM_IGNNE:
+ pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
+ break;
+ default:
+ return kvm_set_msr_common(vcpu, ecx, data);
+ }
+ return 0;
+}
+
+static int wrmsr_interception(struct vcpu_svm *svm)
+{
+ u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+ u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
+ | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
+
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
+ if (svm_set_msr(&svm->vcpu, ecx, data)) {
+ trace_kvm_msr_write_ex(ecx, data);
+ kvm_inject_gp(&svm->vcpu, 0);
+ } else {
+ trace_kvm_msr_write(ecx, data);
+ skip_emulated_instruction(&svm->vcpu);
+ }
+ return 1;
+}
+
+static int msr_interception(struct vcpu_svm *svm)
+{
+ if (svm->vmcb->control.exit_info_1)
+ return wrmsr_interception(svm);
+ else
+ return rdmsr_interception(svm);
+}
+
+static int interrupt_window_interception(struct vcpu_svm *svm)
+{
+ struct kvm_run *kvm_run = svm->vcpu.run;
+
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ svm_clear_vintr(svm);
+ svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+ mark_dirty(svm->vmcb, VMCB_INTR);
+ /*
+ * If the user space waits to inject interrupts, exit as soon as
+ * possible
+ */
+ if (!irqchip_in_kernel(svm->vcpu.kvm) &&
+ kvm_run->request_interrupt_window &&
+ !kvm_cpu_has_interrupt(&svm->vcpu)) {
+ ++svm->vcpu.stat.irq_window_exits;
+ kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int pause_interception(struct vcpu_svm *svm)
+{
+ kvm_vcpu_on_spin(&(svm->vcpu));
+ return 1;
+}
+
+static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
+ [SVM_EXIT_READ_CR0] = cr_interception,
+ [SVM_EXIT_READ_CR3] = cr_interception,
+ [SVM_EXIT_READ_CR4] = cr_interception,
+ [SVM_EXIT_READ_CR8] = cr_interception,
+ [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
+ [SVM_EXIT_WRITE_CR0] = cr0_write_interception,
+ [SVM_EXIT_WRITE_CR3] = cr_interception,
+ [SVM_EXIT_WRITE_CR4] = cr_interception,
+ [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
+ [SVM_EXIT_READ_DR0] = dr_interception,
+ [SVM_EXIT_READ_DR1] = dr_interception,
+ [SVM_EXIT_READ_DR2] = dr_interception,
+ [SVM_EXIT_READ_DR3] = dr_interception,
+ [SVM_EXIT_READ_DR4] = dr_interception,
+ [SVM_EXIT_READ_DR5] = dr_interception,
+ [SVM_EXIT_READ_DR6] = dr_interception,
+ [SVM_EXIT_READ_DR7] = dr_interception,
+ [SVM_EXIT_WRITE_DR0] = dr_interception,
+ [SVM_EXIT_WRITE_DR1] = dr_interception,
+ [SVM_EXIT_WRITE_DR2] = dr_interception,
+ [SVM_EXIT_WRITE_DR3] = dr_interception,
+ [SVM_EXIT_WRITE_DR4] = dr_interception,
+ [SVM_EXIT_WRITE_DR5] = dr_interception,
+ [SVM_EXIT_WRITE_DR6] = dr_interception,
+ [SVM_EXIT_WRITE_DR7] = dr_interception,
+ [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
+ [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
+ [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
+ [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
+ [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
+ [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
+ [SVM_EXIT_INTR] = intr_interception,
+ [SVM_EXIT_NMI] = nmi_interception,
+ [SVM_EXIT_SMI] = nop_on_interception,
+ [SVM_EXIT_INIT] = nop_on_interception,
+ [SVM_EXIT_VINTR] = interrupt_window_interception,
+ [SVM_EXIT_CPUID] = cpuid_interception,
+ [SVM_EXIT_IRET] = iret_interception,
+ [SVM_EXIT_INVD] = emulate_on_interception,
+ [SVM_EXIT_PAUSE] = pause_interception,
+ [SVM_EXIT_HLT] = halt_interception,
+ [SVM_EXIT_INVLPG] = invlpg_interception,
+ [SVM_EXIT_INVLPGA] = invlpga_interception,
+ [SVM_EXIT_IOIO] = io_interception,
+ [SVM_EXIT_MSR] = msr_interception,
+ [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
+ [SVM_EXIT_SHUTDOWN] = shutdown_interception,
+ [SVM_EXIT_VMRUN] = vmrun_interception,
+ [SVM_EXIT_VMMCALL] = vmmcall_interception,
+ [SVM_EXIT_VMLOAD] = vmload_interception,
+ [SVM_EXIT_VMSAVE] = vmsave_interception,
+ [SVM_EXIT_STGI] = stgi_interception,
+ [SVM_EXIT_CLGI] = clgi_interception,
+ [SVM_EXIT_SKINIT] = skinit_interception,
+ [SVM_EXIT_WBINVD] = emulate_on_interception,
+ [SVM_EXIT_MONITOR] = invalid_op_interception,
+ [SVM_EXIT_MWAIT] = invalid_op_interception,
+ [SVM_EXIT_XSETBV] = xsetbv_interception,
+ [SVM_EXIT_NPF] = pf_interception,
+};
+
+void dump_vmcb(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct vmcb_save_area *save = &svm->vmcb->save;
+
+ pr_err("VMCB Control Area:\n");
+ pr_err("cr_read: %04x\n", control->intercept_cr & 0xffff);
+ pr_err("cr_write: %04x\n", control->intercept_cr >> 16);
+ pr_err("dr_read: %04x\n", control->intercept_dr & 0xffff);
+ pr_err("dr_write: %04x\n", control->intercept_dr >> 16);
+ pr_err("exceptions: %08x\n", control->intercept_exceptions);
+ pr_err("intercepts: %016llx\n", control->intercept);
+ pr_err("pause filter count: %d\n", control->pause_filter_count);
+ pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa);
+ pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa);
+ pr_err("tsc_offset: %016llx\n", control->tsc_offset);
+ pr_err("asid: %d\n", control->asid);
+ pr_err("tlb_ctl: %d\n", control->tlb_ctl);
+ pr_err("int_ctl: %08x\n", control->int_ctl);
+ pr_err("int_vector: %08x\n", control->int_vector);
+ pr_err("int_state: %08x\n", control->int_state);
+ pr_err("exit_code: %08x\n", control->exit_code);
+ pr_err("exit_info1: %016llx\n", control->exit_info_1);
+ pr_err("exit_info2: %016llx\n", control->exit_info_2);
+ pr_err("exit_int_info: %08x\n", control->exit_int_info);
+ pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err);
+ pr_err("nested_ctl: %lld\n", control->nested_ctl);
+ pr_err("nested_cr3: %016llx\n", control->nested_cr3);
+ pr_err("event_inj: %08x\n", control->event_inj);
+ pr_err("event_inj_err: %08x\n", control->event_inj_err);
+ pr_err("lbr_ctl: %lld\n", control->lbr_ctl);
+ pr_err("next_rip: %016llx\n", control->next_rip);
+ pr_err("VMCB State Save Area:\n");
+ pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->es.selector, save->es.attrib,
+ save->es.limit, save->es.base);
+ pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->cs.selector, save->cs.attrib,
+ save->cs.limit, save->cs.base);
+ pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->ss.selector, save->ss.attrib,
+ save->ss.limit, save->ss.base);
+ pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->ds.selector, save->ds.attrib,
+ save->ds.limit, save->ds.base);
+ pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->fs.selector, save->fs.attrib,
+ save->fs.limit, save->fs.base);
+ pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->gs.selector, save->gs.attrib,
+ save->gs.limit, save->gs.base);
+ pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->gdtr.selector, save->gdtr.attrib,
+ save->gdtr.limit, save->gdtr.base);
+ pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->ldtr.selector, save->ldtr.attrib,
+ save->ldtr.limit, save->ldtr.base);
+ pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->idtr.selector, save->idtr.attrib,
+ save->idtr.limit, save->idtr.base);
+ pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->tr.selector, save->tr.attrib,
+ save->tr.limit, save->tr.base);
+ pr_err("cpl: %d efer: %016llx\n",
+ save->cpl, save->efer);
+ pr_err("cr0: %016llx cr2: %016llx\n",
+ save->cr0, save->cr2);
+ pr_err("cr3: %016llx cr4: %016llx\n",
+ save->cr3, save->cr4);
+ pr_err("dr6: %016llx dr7: %016llx\n",
+ save->dr6, save->dr7);
+ pr_err("rip: %016llx rflags: %016llx\n",
+ save->rip, save->rflags);
+ pr_err("rsp: %016llx rax: %016llx\n",
+ save->rsp, save->rax);
+ pr_err("star: %016llx lstar: %016llx\n",
+ save->star, save->lstar);
+ pr_err("cstar: %016llx sfmask: %016llx\n",
+ save->cstar, save->sfmask);
+ pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n",
+ save->kernel_gs_base, save->sysenter_cs);
+ pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n",
+ save->sysenter_esp, save->sysenter_eip);
+ pr_err("gpat: %016llx dbgctl: %016llx\n",
+ save->g_pat, save->dbgctl);
+ pr_err("br_from: %016llx br_to: %016llx\n",
+ save->br_from, save->br_to);
+ pr_err("excp_from: %016llx excp_to: %016llx\n",
+ save->last_excp_from, save->last_excp_to);
+
+}
+
+static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+{
+ struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+
+ *info1 = control->exit_info_1;
+ *info2 = control->exit_info_2;
+}
+
+static int handle_exit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_run *kvm_run = vcpu->run;
+ u32 exit_code = svm->vmcb->control.exit_code;
+
+ trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
+
+ if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
+ vcpu->arch.cr0 = svm->vmcb->save.cr0;
+ if (npt_enabled)
+ vcpu->arch.cr3 = svm->vmcb->save.cr3;
+
+ if (unlikely(svm->nested.exit_required)) {
+ nested_svm_vmexit(svm);
+ svm->nested.exit_required = false;
+
+ return 1;
+ }
+
+ if (is_guest_mode(vcpu)) {
+ int vmexit;
+
+ trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
+ svm->vmcb->control.exit_info_1,
+ svm->vmcb->control.exit_info_2,
+ svm->vmcb->control.exit_int_info,
+ svm->vmcb->control.exit_int_info_err);
+
+ vmexit = nested_svm_exit_special(svm);
+
+ if (vmexit == NESTED_EXIT_CONTINUE)
+ vmexit = nested_svm_exit_handled(svm);
+
+ if (vmexit == NESTED_EXIT_DONE)
+ return 1;
+ }
+
+ svm_complete_interrupts(svm);
+
+ if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
+ kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ kvm_run->fail_entry.hardware_entry_failure_reason
+ = svm->vmcb->control.exit_code;
+ pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
+ dump_vmcb(vcpu);
+ return 0;
+ }
+
+ if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
+ exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
+ exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
+ exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
+ printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
+ "exit_code 0x%x\n",
+ __func__, svm->vmcb->control.exit_int_info,
+ exit_code);
+
+ if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
+ || !svm_exit_handlers[exit_code]) {
+ kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
+ kvm_run->hw.hardware_exit_reason = exit_code;
+ return 0;
+ }
+
+ return svm_exit_handlers[exit_code](svm);
+}
+
+static void reload_tss(struct kvm_vcpu *vcpu)
+{
+ int cpu = raw_smp_processor_id();
+
+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+ sd->tss_desc->type = 9; /* available 32/64-bit TSS */
+ load_TR_desc();
+}
+
+static void pre_svm_run(struct vcpu_svm *svm)
+{
+ int cpu = raw_smp_processor_id();
+
+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+
+ /* FIXME: handle wraparound of asid_generation */
+ if (svm->asid_generation != sd->asid_generation)
+ new_asid(svm, sd);
+}
+
+static void svm_inject_nmi(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
+ vcpu->arch.hflags |= HF_NMI_MASK;
+ set_intercept(svm, INTERCEPT_IRET);
+ ++vcpu->stat.nmi_injections;
+}
+
+static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
+{
+ struct vmcb_control_area *control;
+
+ control = &svm->vmcb->control;
+ control->int_vector = irq;
+ control->int_ctl &= ~V_INTR_PRIO_MASK;
+ control->int_ctl |= V_IRQ_MASK |
+ ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
+ mark_dirty(svm->vmcb, VMCB_INTR);
+}
+
+static void svm_set_irq(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ BUG_ON(!(gif_set(svm)));
+
+ trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
+ ++vcpu->stat.irq_injections;
+
+ svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
+ SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
+}
+
+static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
+ return;
+
+ if (irr == -1)
+ return;
+
+ if (tpr >= irr)
+ set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+}
+
+static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+ int ret;
+ ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
+ !(svm->vcpu.arch.hflags & HF_NMI_MASK);
+ ret = ret && gif_set(svm) && nested_svm_nmi(svm);
+
+ return ret;
+}
+
+static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
+}
+
+static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (masked) {
+ svm->vcpu.arch.hflags |= HF_NMI_MASK;
+ set_intercept(svm, INTERCEPT_IRET);
+ } else {
+ svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
+ clr_intercept(svm, INTERCEPT_IRET);
+ }
+}
+
+static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+ int ret;
+
+ if (!gif_set(svm) ||
+ (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
+ return 0;
+
+ ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
+
+ if (is_guest_mode(vcpu))
+ return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
+
+ return ret;
+}
+
+static void enable_irq_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
+ * 1, because that's a separate STGI/VMRUN intercept. The next time we
+ * get that intercept, this function will be called again though and
+ * we'll get the vintr intercept.
+ */
+ if (gif_set(svm) && nested_svm_intr(svm)) {
+ svm_set_vintr(svm);
+ svm_inject_irq(svm, 0x0);
+ }
+}
+
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
+ == HF_NMI_MASK)
+ return; /* IRET will cause a vm exit */
+
+ /*
+ * Something prevents NMI from been injected. Single step over possible
+ * problem (IRET or exception injection or interrupt shadow)
+ */
+ svm->nmi_singlestep = true;
+ svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+ update_db_intercept(vcpu);
+}
+
+static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
+{
+ return 0;
+}
+
+static void svm_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
+ else
+ svm->asid_generation--;
+}
+
+static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
+ return;
+
+ if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
+ int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
+ kvm_set_cr8(vcpu, cr8);
+ }
+}
+
+static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 cr8;
+
+ if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
+ return;
+
+ cr8 = kvm_get_cr8(vcpu);
+ svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
+ svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
+}
+
+static void svm_complete_interrupts(struct vcpu_svm *svm)
+{
+ u8 vector;
+ int type;
+ u32 exitintinfo = svm->vmcb->control.exit_int_info;
+ unsigned int3_injected = svm->int3_injected;
+
+ svm->int3_injected = 0;
+
+ if (svm->vcpu.arch.hflags & HF_IRET_MASK) {
+ svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ }
+
+ svm->vcpu.arch.nmi_injected = false;
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
+
+ if (!(exitintinfo & SVM_EXITINTINFO_VALID))
+ return;
+
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+
+ vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
+ type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
+
+ switch (type) {
+ case SVM_EXITINTINFO_TYPE_NMI:
+ svm->vcpu.arch.nmi_injected = true;
+ break;
+ case SVM_EXITINTINFO_TYPE_EXEPT:
+ /*
+ * In case of software exceptions, do not reinject the vector,
+ * but re-execute the instruction instead. Rewind RIP first
+ * if we emulated INT3 before.
+ */
+ if (kvm_exception_is_soft(vector)) {
+ if (vector == BP_VECTOR && int3_injected &&
+ kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
+ kvm_rip_write(&svm->vcpu,
+ kvm_rip_read(&svm->vcpu) -
+ int3_injected);
+ break;
+ }
+ if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
+ u32 err = svm->vmcb->control.exit_int_info_err;
+ kvm_requeue_exception_e(&svm->vcpu, vector, err);
+
+ } else
+ kvm_requeue_exception(&svm->vcpu, vector);
+ break;
+ case SVM_EXITINTINFO_TYPE_INTR:
+ kvm_queue_interrupt(&svm->vcpu, vector, false);
+ break;
+ default:
+ break;
+ }
+}
+
+static void svm_cancel_injection(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+
+ control->exit_int_info = control->event_inj;
+ control->exit_int_info_err = control->event_inj_err;
+ control->event_inj = 0;
+ svm_complete_interrupts(svm);
+}
+
+#ifdef CONFIG_X86_64
+#define R "r"
+#else
+#define R "e"
+#endif
+
+static void svm_vcpu_run(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+ svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
+ /*
+ * A vmexit emulation is required before the vcpu can be executed
+ * again.
+ */
+ if (unlikely(svm->nested.exit_required))
+ return;
+
+ pre_svm_run(svm);
+
+ sync_lapic_to_cr8(vcpu);
+
+ svm->vmcb->save.cr2 = vcpu->arch.cr2;
+
+ clgi();
+
+ local_irq_enable();
+
+ asm volatile (
+ "push %%"R"bp; \n\t"
+ "mov %c[rbx](%[svm]), %%"R"bx \n\t"
+ "mov %c[rcx](%[svm]), %%"R"cx \n\t"
+ "mov %c[rdx](%[svm]), %%"R"dx \n\t"
+ "mov %c[rsi](%[svm]), %%"R"si \n\t"
+ "mov %c[rdi](%[svm]), %%"R"di \n\t"
+ "mov %c[rbp](%[svm]), %%"R"bp \n\t"
+#ifdef CONFIG_X86_64
+ "mov %c[r8](%[svm]), %%r8 \n\t"
+ "mov %c[r9](%[svm]), %%r9 \n\t"
+ "mov %c[r10](%[svm]), %%r10 \n\t"
+ "mov %c[r11](%[svm]), %%r11 \n\t"
+ "mov %c[r12](%[svm]), %%r12 \n\t"
+ "mov %c[r13](%[svm]), %%r13 \n\t"
+ "mov %c[r14](%[svm]), %%r14 \n\t"
+ "mov %c[r15](%[svm]), %%r15 \n\t"
+#endif
+
+ /* Enter guest mode */
+ "push %%"R"ax \n\t"
+ "mov %c[vmcb](%[svm]), %%"R"ax \n\t"
+ __ex(SVM_VMLOAD) "\n\t"
+ __ex(SVM_VMRUN) "\n\t"
+ __ex(SVM_VMSAVE) "\n\t"
+ "pop %%"R"ax \n\t"
+
+ /* Save guest registers, load host registers */
+ "mov %%"R"bx, %c[rbx](%[svm]) \n\t"
+ "mov %%"R"cx, %c[rcx](%[svm]) \n\t"
+ "mov %%"R"dx, %c[rdx](%[svm]) \n\t"
+ "mov %%"R"si, %c[rsi](%[svm]) \n\t"
+ "mov %%"R"di, %c[rdi](%[svm]) \n\t"
+ "mov %%"R"bp, %c[rbp](%[svm]) \n\t"
+#ifdef CONFIG_X86_64
+ "mov %%r8, %c[r8](%[svm]) \n\t"
+ "mov %%r9, %c[r9](%[svm]) \n\t"
+ "mov %%r10, %c[r10](%[svm]) \n\t"
+ "mov %%r11, %c[r11](%[svm]) \n\t"
+ "mov %%r12, %c[r12](%[svm]) \n\t"
+ "mov %%r13, %c[r13](%[svm]) \n\t"
+ "mov %%r14, %c[r14](%[svm]) \n\t"
+ "mov %%r15, %c[r15](%[svm]) \n\t"
+#endif
+ "pop %%"R"bp"
+ :
+ : [svm]"a"(svm),
+ [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
+ [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
+ [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
+ [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
+ [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
+ [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
+ [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
+#ifdef CONFIG_X86_64
+ , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
+ [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
+ [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
+ [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
+ [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
+ [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
+ [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
+ [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
+#endif
+ : "cc", "memory"
+ , R"bx", R"cx", R"dx", R"si", R"di"
+#ifdef CONFIG_X86_64
+ , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
+#endif
+ );
+
+#ifdef CONFIG_X86_64
+ wrmsrl(MSR_GS_BASE, svm->host.gs_base);
+#else
+ loadsegment(fs, svm->host.fs);
+#endif
+
+ reload_tss(vcpu);
+
+ local_irq_disable();
+
+ stgi();
+
+ vcpu->arch.cr2 = svm->vmcb->save.cr2;
+ vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+ vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+ vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
+
+ sync_cr8_to_lapic(vcpu);
+
+ svm->next_rip = 0;
+
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+
+ /* if exit due to PF check for async PF */
+ if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
+ svm->apf_reason = kvm_read_and_reset_pf_reason();
+
+ if (npt_enabled) {
+ vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
+ vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
+ }
+
+ /*
+ * We need to handle MC intercepts here before the vcpu has a chance to
+ * change the physical cpu
+ */
+ if (unlikely(svm->vmcb->control.exit_code ==
+ SVM_EXIT_EXCP_BASE + MC_VECTOR))
+ svm_handle_mce(svm);
+
+ mark_all_clean(svm->vmcb);
+}
+
+#undef R
+
+static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.cr3 = root;
+ mark_dirty(svm->vmcb, VMCB_CR);
+ svm_flush_tlb(vcpu);
+}
+
+static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.nested_cr3 = root;
+ mark_dirty(svm->vmcb, VMCB_NPT);
+
+ /* Also sync guest cr3 here in case we live migrate */
+ svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
+ mark_dirty(svm->vmcb, VMCB_CR);
+
+ svm_flush_tlb(vcpu);
+}
+
+static int is_disabled(void)
+{
+ u64 vm_cr;
+
+ rdmsrl(MSR_VM_CR, vm_cr);
+ if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
+ return 1;
+
+ return 0;
+}
+
+static void
+svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+{
+ /*
+ * Patch in the VMMCALL instruction:
+ */
+ hypercall[0] = 0x0f;
+ hypercall[1] = 0x01;
+ hypercall[2] = 0xd9;
+}
+
+static void svm_check_processor_compat(void *rtn)
+{
+ *(int *)rtn = 0;
+}
+
+static bool svm_cpu_has_accelerated_tpr(void)
+{
+ return false;
+}
+
+static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+{
+ return 0;
+}
+
+static void svm_cpuid_update(struct kvm_vcpu *vcpu)
+{
+}
+
+static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
+{
+ switch (func) {
+ case 0x80000001:
+ if (nested)
+ entry->ecx |= (1 << 2); /* Set SVM bit */
+ break;
+ case 0x8000000A:
+ entry->eax = 1; /* SVM revision 1 */
+ entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
+ ASID emulation to nested SVM */
+ entry->ecx = 0; /* Reserved */
+ entry->edx = 0; /* Per default do not support any
+ additional features */
+
+ /* Support next_rip if host supports it */
+ if (boot_cpu_has(X86_FEATURE_NRIPS))
+ entry->edx |= SVM_FEATURE_NRIP;
+
+ /* Support NPT for the guest if enabled */
+ if (npt_enabled)
+ entry->edx |= SVM_FEATURE_NPT;
+
+ break;
+ }
+}
+
+static const struct trace_print_flags svm_exit_reasons_str[] = {
+ { SVM_EXIT_READ_CR0, "read_cr0" },
+ { SVM_EXIT_READ_CR3, "read_cr3" },
+ { SVM_EXIT_READ_CR4, "read_cr4" },
+ { SVM_EXIT_READ_CR8, "read_cr8" },
+ { SVM_EXIT_WRITE_CR0, "write_cr0" },
+ { SVM_EXIT_WRITE_CR3, "write_cr3" },
+ { SVM_EXIT_WRITE_CR4, "write_cr4" },
+ { SVM_EXIT_WRITE_CR8, "write_cr8" },
+ { SVM_EXIT_READ_DR0, "read_dr0" },
+ { SVM_EXIT_READ_DR1, "read_dr1" },
+ { SVM_EXIT_READ_DR2, "read_dr2" },
+ { SVM_EXIT_READ_DR3, "read_dr3" },
+ { SVM_EXIT_WRITE_DR0, "write_dr0" },
+ { SVM_EXIT_WRITE_DR1, "write_dr1" },
+ { SVM_EXIT_WRITE_DR2, "write_dr2" },
+ { SVM_EXIT_WRITE_DR3, "write_dr3" },
+ { SVM_EXIT_WRITE_DR5, "write_dr5" },
+ { SVM_EXIT_WRITE_DR7, "write_dr7" },
+ { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" },
+ { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" },
+ { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" },
+ { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" },
+ { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" },
+ { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" },
+ { SVM_EXIT_INTR, "interrupt" },
+ { SVM_EXIT_NMI, "nmi" },
+ { SVM_EXIT_SMI, "smi" },
+ { SVM_EXIT_INIT, "init" },
+ { SVM_EXIT_VINTR, "vintr" },
+ { SVM_EXIT_CPUID, "cpuid" },
+ { SVM_EXIT_INVD, "invd" },
+ { SVM_EXIT_HLT, "hlt" },
+ { SVM_EXIT_INVLPG, "invlpg" },
+ { SVM_EXIT_INVLPGA, "invlpga" },
+ { SVM_EXIT_IOIO, "io" },
+ { SVM_EXIT_MSR, "msr" },
+ { SVM_EXIT_TASK_SWITCH, "task_switch" },
+ { SVM_EXIT_SHUTDOWN, "shutdown" },
+ { SVM_EXIT_VMRUN, "vmrun" },
+ { SVM_EXIT_VMMCALL, "hypercall" },
+ { SVM_EXIT_VMLOAD, "vmload" },
+ { SVM_EXIT_VMSAVE, "vmsave" },
+ { SVM_EXIT_STGI, "stgi" },
+ { SVM_EXIT_CLGI, "clgi" },
+ { SVM_EXIT_SKINIT, "skinit" },
+ { SVM_EXIT_WBINVD, "wbinvd" },
+ { SVM_EXIT_MONITOR, "monitor" },
+ { SVM_EXIT_MWAIT, "mwait" },
+ { SVM_EXIT_XSETBV, "xsetbv" },
+ { SVM_EXIT_NPF, "npf" },
+ { -1, NULL }
+};
+
+static int svm_get_lpage_level(void)
+{
+ return PT_PDPE_LEVEL;
+}
+
+static bool svm_rdtscp_supported(void)
+{
+ return false;
+}
+
+static bool svm_has_wbinvd_exit(void)
+{
+ return true;
+}
+
+static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ set_exception_intercept(svm, NM_VECTOR);
+ update_cr0_intercept(svm);
+}
+
+static struct kvm_x86_ops svm_x86_ops = {
+ .cpu_has_kvm_support = has_svm,
+ .disabled_by_bios = is_disabled,
+ .hardware_setup = svm_hardware_setup,
+ .hardware_unsetup = svm_hardware_unsetup,
+ .check_processor_compatibility = svm_check_processor_compat,
+ .hardware_enable = svm_hardware_enable,
+ .hardware_disable = svm_hardware_disable,
+ .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
+
+ .vcpu_create = svm_create_vcpu,
+ .vcpu_free = svm_free_vcpu,
+ .vcpu_reset = svm_vcpu_reset,
+
+ .prepare_guest_switch = svm_prepare_guest_switch,
+ .vcpu_load = svm_vcpu_load,
+ .vcpu_put = svm_vcpu_put,
+
+ .set_guest_debug = svm_guest_debug,
+ .get_msr = svm_get_msr,
+ .set_msr = svm_set_msr,
+ .get_segment_base = svm_get_segment_base,
+ .get_segment = svm_get_segment,
+ .set_segment = svm_set_segment,
+ .get_cpl = svm_get_cpl,
+ .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
+ .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
+ .decache_cr3 = svm_decache_cr3,
+ .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
+ .set_cr0 = svm_set_cr0,
+ .set_cr3 = svm_set_cr3,
+ .set_cr4 = svm_set_cr4,
+ .set_efer = svm_set_efer,
+ .get_idt = svm_get_idt,
+ .set_idt = svm_set_idt,
+ .get_gdt = svm_get_gdt,
+ .set_gdt = svm_set_gdt,
+ .set_dr7 = svm_set_dr7,
+ .cache_reg = svm_cache_reg,
+ .get_rflags = svm_get_rflags,
+ .set_rflags = svm_set_rflags,
+ .fpu_activate = svm_fpu_activate,
+ .fpu_deactivate = svm_fpu_deactivate,
+
+ .tlb_flush = svm_flush_tlb,
+
+ .run = svm_vcpu_run,
+ .handle_exit = handle_exit,
+ .skip_emulated_instruction = skip_emulated_instruction,
+ .set_interrupt_shadow = svm_set_interrupt_shadow,
+ .get_interrupt_shadow = svm_get_interrupt_shadow,
+ .patch_hypercall = svm_patch_hypercall,
+ .set_irq = svm_set_irq,
+ .set_nmi = svm_inject_nmi,
+ .queue_exception = svm_queue_exception,
+ .cancel_injection = svm_cancel_injection,
+ .interrupt_allowed = svm_interrupt_allowed,
+ .nmi_allowed = svm_nmi_allowed,
+ .get_nmi_mask = svm_get_nmi_mask,
+ .set_nmi_mask = svm_set_nmi_mask,
+ .enable_nmi_window = enable_nmi_window,
+ .enable_irq_window = enable_irq_window,
+ .update_cr8_intercept = update_cr8_intercept,
+
+ .set_tss_addr = svm_set_tss_addr,
+ .get_tdp_level = get_npt_level,
+ .get_mt_mask = svm_get_mt_mask,
+
+ .get_exit_info = svm_get_exit_info,
+ .exit_reasons_str = svm_exit_reasons_str,
+
+ .get_lpage_level = svm_get_lpage_level,
+
+ .cpuid_update = svm_cpuid_update,
+
+ .rdtscp_supported = svm_rdtscp_supported,
+
+ .set_supported_cpuid = svm_set_supported_cpuid,
+
+ .has_wbinvd_exit = svm_has_wbinvd_exit,
+
+ .write_tsc_offset = svm_write_tsc_offset,
+ .adjust_tsc_offset = svm_adjust_tsc_offset,
+
+ .set_tdp_cr3 = set_tdp_cr3,
+};
+
+static int __init svm_init(void)
+{
+ return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
+ __alignof__(struct vcpu_svm), THIS_MODULE);
+}
+
+static void __exit svm_exit(void)
+{
+ kvm_exit();
+}
+
+module_init(svm_init)
+module_exit(svm_exit)
diff --git a/linux/x86/timer.c b/linux/x86/timer.c
new file mode 100644
index 0000000..1fa2aa0
--- /dev/null
+++ b/linux/x86/timer.c
@@ -0,0 +1,105 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * timer support
+ *
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/hrtimer.h>
+#include <asm/atomic.h>
+#include "kvm_timer.h"
+
+static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
+{
+ int restart_timer = 0;
+ wait_queue_head_t *q = &vcpu->wq;
+
+ /*
+ * There is a race window between reading and incrementing, but we do
+ * not care about potentially loosing timer events in the !reinject
+ * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
+ * in vcpu_enter_guest.
+ */
+ if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
+ atomic_inc(&ktimer->pending);
+ /* FIXME: this code should not know anything about vcpus */
+ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+ }
+
+ if (waitqueue_active(q))
+ wake_up_interruptible(q);
+
+ if (ktimer->t_ops->is_periodic(ktimer)) {
+ kvm_hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+ restart_timer = 1;
+ }
+
+ return restart_timer;
+}
+
+enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
+{
+ int restart_timer;
+ struct kvm_vcpu *vcpu;
+ struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+
+ vcpu = ktimer->vcpu;
+ if (!vcpu)
+ return HRTIMER_NORESTART;
+
+ restart_timer = __kvm_timer_fn(vcpu, ktimer);
+ if (restart_timer)
+ return HRTIMER_RESTART;
+ else
+ return HRTIMER_NORESTART;
+}
+
diff --git a/linux/x86/trace.h b/linux/x86/trace.h
new file mode 100644
index 0000000..1357d7c
--- /dev/null
+++ b/linux/x86/trace.h
@@ -0,0 +1,709 @@
+#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+
+/*
+ * Tracepoint for guest mode entry.
+ */
+TRACE_EVENT(kvm_entry,
+ TP_PROTO(unsigned int vcpu_id),
+ TP_ARGS(vcpu_id),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ ),
+
+ TP_printk("vcpu %u", __entry->vcpu_id)
+);
+
+/*
+ * Tracepoint for hypercall.
+ */
+TRACE_EVENT(kvm_hypercall,
+ TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3),
+ TP_ARGS(nr, a0, a1, a2, a3),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, nr )
+ __field( unsigned long, a0 )
+ __field( unsigned long, a1 )
+ __field( unsigned long, a2 )
+ __field( unsigned long, a3 )
+ ),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ __entry->a0 = a0;
+ __entry->a1 = a1;
+ __entry->a2 = a2;
+ __entry->a3 = a3;
+ ),
+
+ TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx",
+ __entry->nr, __entry->a0, __entry->a1, __entry->a2,
+ __entry->a3)
+);
+
+/*
+ * Tracepoint for hypercall.
+ */
+TRACE_EVENT(kvm_hv_hypercall,
+ TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx,
+ __u64 ingpa, __u64 outgpa),
+ TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa),
+
+ TP_STRUCT__entry(
+ __field( __u16, code )
+ __field( bool, fast )
+ __field( __u16, rep_cnt )
+ __field( __u16, rep_idx )
+ __field( __u64, ingpa )
+ __field( __u64, outgpa )
+ ),
+
+ TP_fast_assign(
+ __entry->code = code;
+ __entry->fast = fast;
+ __entry->rep_cnt = rep_cnt;
+ __entry->rep_idx = rep_idx;
+ __entry->ingpa = ingpa;
+ __entry->outgpa = outgpa;
+ ),
+
+ TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
+ __entry->code, __entry->fast ? "fast" : "slow",
+ __entry->rep_cnt, __entry->rep_idx, __entry->ingpa,
+ __entry->outgpa)
+);
+
+/*
+ * Tracepoint for PIO.
+ */
+TRACE_EVENT(kvm_pio,
+ TP_PROTO(unsigned int rw, unsigned int port, unsigned int size,
+ unsigned int count),
+ TP_ARGS(rw, port, size, count),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, port )
+ __field( unsigned int, size )
+ __field( unsigned int, count )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->port = port;
+ __entry->size = size;
+ __entry->count = count;
+ ),
+
+ TP_printk("pio_%s at 0x%x size %d count %d",
+ __entry->rw ? "write" : "read",
+ __entry->port, __entry->size, __entry->count)
+);
+
+/*
+ * Tracepoint for cpuid.
+ */
+TRACE_EVENT(kvm_cpuid,
+ TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx,
+ unsigned long rcx, unsigned long rdx),
+ TP_ARGS(function, rax, rbx, rcx, rdx),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, function )
+ __field( unsigned long, rax )
+ __field( unsigned long, rbx )
+ __field( unsigned long, rcx )
+ __field( unsigned long, rdx )
+ ),
+
+ TP_fast_assign(
+ __entry->function = function;
+ __entry->rax = rax;
+ __entry->rbx = rbx;
+ __entry->rcx = rcx;
+ __entry->rdx = rdx;
+ ),
+
+ TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx",
+ __entry->function, __entry->rax,
+ __entry->rbx, __entry->rcx, __entry->rdx)
+);
+
+#define AREG(x) { APIC_##x, "APIC_" #x }
+
+#define kvm_trace_symbol_apic \
+ AREG(ID), AREG(LVR), AREG(TASKPRI), AREG(ARBPRI), AREG(PROCPRI), \
+ AREG(EOI), AREG(RRR), AREG(LDR), AREG(DFR), AREG(SPIV), AREG(ISR), \
+ AREG(TMR), AREG(IRR), AREG(ESR), AREG(ICR), AREG(ICR2), AREG(LVTT), \
+ AREG(LVTTHMR), AREG(LVTPC), AREG(LVT0), AREG(LVT1), AREG(LVTERR), \
+ AREG(TMICT), AREG(TMCCT), AREG(TDCR), AREG(SELF_IPI), AREG(EFEAT), \
+ AREG(ECTRL)
+/*
+ * Tracepoint for apic access.
+ */
+TRACE_EVENT(kvm_apic,
+ TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val),
+ TP_ARGS(rw, reg, val),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, reg )
+ __field( unsigned int, val )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->reg = reg;
+ __entry->val = val;
+ ),
+
+ TP_printk("apic_%s %s = 0x%x",
+ __entry->rw ? "write" : "read",
+ __print_symbolic(__entry->reg, kvm_trace_symbol_apic),
+ __entry->val)
+);
+
+#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val)
+#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val)
+
+#define KVM_ISA_VMX 1
+#define KVM_ISA_SVM 2
+
+/*
+ * Tracepoint for kvm guest exit:
+ */
+TRACE_EVENT(kvm_exit,
+ TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa),
+ TP_ARGS(exit_reason, vcpu, isa),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, exit_reason )
+ __field( unsigned long, guest_rip )
+ __field( u32, isa )
+ __field( u64, info1 )
+ __field( u64, info2 )
+ ),
+
+ TP_fast_assign(
+ __entry->exit_reason = exit_reason;
+ __entry->guest_rip = kvm_rip_read(vcpu);
+ __entry->isa = isa;
+ kvm_x86_ops->get_exit_info(vcpu, &__entry->info1,
+ &__entry->info2);
+ ),
+
+ TP_printk("reason %s rip 0x%lx info %llx %llx",
+ ftrace_print_symbols_seq(p, __entry->exit_reason,
+ kvm_x86_ops->exit_reasons_str),
+ __entry->guest_rip, __entry->info1, __entry->info2)
+);
+
+/*
+ * Tracepoint for kvm interrupt injection:
+ */
+TRACE_EVENT(kvm_inj_virq,
+ TP_PROTO(unsigned int irq),
+ TP_ARGS(irq),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irq )
+ ),
+
+ TP_fast_assign(
+ __entry->irq = irq;
+ ),
+
+ TP_printk("irq %u", __entry->irq)
+);
+
+#define EXS(x) { x##_VECTOR, "#" #x }
+
+#define kvm_trace_sym_exc \
+ EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \
+ EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \
+ EXS(MF), EXS(MC)
+
+/*
+ * Tracepoint for kvm interrupt injection:
+ */
+TRACE_EVENT(kvm_inj_exception,
+ TP_PROTO(unsigned exception, bool has_error, unsigned error_code),
+ TP_ARGS(exception, has_error, error_code),
+
+ TP_STRUCT__entry(
+ __field( u8, exception )
+ __field( u8, has_error )
+ __field( u32, error_code )
+ ),
+
+ TP_fast_assign(
+ __entry->exception = exception;
+ __entry->has_error = has_error;
+ __entry->error_code = error_code;
+ ),
+
+ TP_printk("%s (0x%x)",
+ __print_symbolic(__entry->exception, kvm_trace_sym_exc),
+ /* FIXME: don't print error_code if not present */
+ __entry->has_error ? __entry->error_code : 0)
+);
+
+/*
+ * Tracepoint for page fault.
+ */
+TRACE_EVENT(kvm_page_fault,
+ TP_PROTO(unsigned long fault_address, unsigned int error_code),
+ TP_ARGS(fault_address, error_code),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, fault_address )
+ __field( unsigned int, error_code )
+ ),
+
+ TP_fast_assign(
+ __entry->fault_address = fault_address;
+ __entry->error_code = error_code;
+ ),
+
+ TP_printk("address %lx error_code %x",
+ __entry->fault_address, __entry->error_code)
+);
+
+/*
+ * Tracepoint for guest MSR access.
+ */
+TRACE_EVENT(kvm_msr,
+ TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception),
+ TP_ARGS(write, ecx, data, exception),
+
+ TP_STRUCT__entry(
+ __field( unsigned, write )
+ __field( u32, ecx )
+ __field( u64, data )
+ __field( u8, exception )
+ ),
+
+ TP_fast_assign(
+ __entry->write = write;
+ __entry->ecx = ecx;
+ __entry->data = data;
+ __entry->exception = exception;
+ ),
+
+ TP_printk("msr_%s %x = 0x%llx%s",
+ __entry->write ? "write" : "read",
+ __entry->ecx, __entry->data,
+ __entry->exception ? " (#GP)" : "")
+);
+
+#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false)
+#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false)
+#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true)
+#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true)
+
+/*
+ * Tracepoint for guest CR access.
+ */
+TRACE_EVENT(kvm_cr,
+ TP_PROTO(unsigned int rw, unsigned int cr, unsigned long val),
+ TP_ARGS(rw, cr, val),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, cr )
+ __field( unsigned long, val )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->cr = cr;
+ __entry->val = val;
+ ),
+
+ TP_printk("cr_%s %x = 0x%lx",
+ __entry->rw ? "write" : "read",
+ __entry->cr, __entry->val)
+);
+
+#define trace_kvm_cr_read(cr, val) trace_kvm_cr(0, cr, val)
+#define trace_kvm_cr_write(cr, val) trace_kvm_cr(1, cr, val)
+
+TRACE_EVENT(kvm_pic_set_irq,
+ TP_PROTO(__u8 chip, __u8 pin, __u8 elcr, __u8 imr, bool coalesced),
+ TP_ARGS(chip, pin, elcr, imr, coalesced),
+
+ TP_STRUCT__entry(
+ __field( __u8, chip )
+ __field( __u8, pin )
+ __field( __u8, elcr )
+ __field( __u8, imr )
+ __field( bool, coalesced )
+ ),
+
+ TP_fast_assign(
+ __entry->chip = chip;
+ __entry->pin = pin;
+ __entry->elcr = elcr;
+ __entry->imr = imr;
+ __entry->coalesced = coalesced;
+ ),
+
+ TP_printk("chip %u pin %u (%s%s)%s",
+ __entry->chip, __entry->pin,
+ (__entry->elcr & (1 << __entry->pin)) ? "level":"edge",
+ (__entry->imr & (1 << __entry->pin)) ? "|masked":"",
+ __entry->coalesced ? " (coalesced)" : "")
+);
+
+#define kvm_apic_dst_shorthand \
+ {0x0, "dst"}, \
+ {0x1, "self"}, \
+ {0x2, "all"}, \
+ {0x3, "all-but-self"}
+
+TRACE_EVENT(kvm_apic_ipi,
+ TP_PROTO(__u32 icr_low, __u32 dest_id),
+ TP_ARGS(icr_low, dest_id),
+
+ TP_STRUCT__entry(
+ __field( __u32, icr_low )
+ __field( __u32, dest_id )
+ ),
+
+ TP_fast_assign(
+ __entry->icr_low = icr_low;
+ __entry->dest_id = dest_id;
+ ),
+
+ TP_printk("dst %x vec %u (%s|%s|%s|%s|%s)",
+ __entry->dest_id, (u8)__entry->icr_low,
+ __print_symbolic((__entry->icr_low >> 8 & 0x7),
+ kvm_deliver_mode),
+ (__entry->icr_low & (1<<11)) ? "logical" : "physical",
+ (__entry->icr_low & (1<<14)) ? "assert" : "de-assert",
+ (__entry->icr_low & (1<<15)) ? "level" : "edge",
+ __print_symbolic((__entry->icr_low >> 18 & 0x3),
+ kvm_apic_dst_shorthand))
+);
+
+TRACE_EVENT(kvm_apic_accept_irq,
+ TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec, bool coalesced),
+ TP_ARGS(apicid, dm, tm, vec, coalesced),
+
+ TP_STRUCT__entry(
+ __field( __u32, apicid )
+ __field( __u16, dm )
+ __field( __u8, tm )
+ __field( __u8, vec )
+ __field( bool, coalesced )
+ ),
+
+ TP_fast_assign(
+ __entry->apicid = apicid;
+ __entry->dm = dm;
+ __entry->tm = tm;
+ __entry->vec = vec;
+ __entry->coalesced = coalesced;
+ ),
+
+ TP_printk("apicid %x vec %u (%s|%s)%s",
+ __entry->apicid, __entry->vec,
+ __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode),
+ __entry->tm ? "level" : "edge",
+ __entry->coalesced ? " (coalesced)" : "")
+);
+
+/*
+ * Tracepoint for nested VMRUN
+ */
+TRACE_EVENT(kvm_nested_vmrun,
+ TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl,
+ __u32 event_inj, bool npt),
+ TP_ARGS(rip, vmcb, nested_rip, int_ctl, event_inj, npt),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( __u64, vmcb )
+ __field( __u64, nested_rip )
+ __field( __u32, int_ctl )
+ __field( __u32, event_inj )
+ __field( bool, npt )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip;
+ __entry->vmcb = vmcb;
+ __entry->nested_rip = nested_rip;
+ __entry->int_ctl = int_ctl;
+ __entry->event_inj = event_inj;
+ __entry->npt = npt;
+ ),
+
+ TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
+ "event_inj: 0x%08x npt: %s",
+ __entry->rip, __entry->vmcb, __entry->nested_rip,
+ __entry->int_ctl, __entry->event_inj,
+ __entry->npt ? "on" : "off")
+);
+
+TRACE_EVENT(kvm_nested_intercepts,
+ TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept),
+ TP_ARGS(cr_read, cr_write, exceptions, intercept),
+
+ TP_STRUCT__entry(
+ __field( __u16, cr_read )
+ __field( __u16, cr_write )
+ __field( __u32, exceptions )
+ __field( __u64, intercept )
+ ),
+
+ TP_fast_assign(
+ __entry->cr_read = cr_read;
+ __entry->cr_write = cr_write;
+ __entry->exceptions = exceptions;
+ __entry->intercept = intercept;
+ ),
+
+ TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx",
+ __entry->cr_read, __entry->cr_write, __entry->exceptions,
+ __entry->intercept)
+);
+/*
+ * Tracepoint for #VMEXIT while nested
+ */
+TRACE_EVENT(kvm_nested_vmexit,
+ TP_PROTO(__u64 rip, __u32 exit_code,
+ __u64 exit_info1, __u64 exit_info2,
+ __u32 exit_int_info, __u32 exit_int_info_err),
+ TP_ARGS(rip, exit_code, exit_info1, exit_info2,
+ exit_int_info, exit_int_info_err),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( __u32, exit_code )
+ __field( __u64, exit_info1 )
+ __field( __u64, exit_info2 )
+ __field( __u32, exit_int_info )
+ __field( __u32, exit_int_info_err )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip;
+ __entry->exit_code = exit_code;
+ __entry->exit_info1 = exit_info1;
+ __entry->exit_info2 = exit_info2;
+ __entry->exit_int_info = exit_int_info;
+ __entry->exit_int_info_err = exit_int_info_err;
+ ),
+ TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
+ "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
+ __entry->rip,
+ ftrace_print_symbols_seq(p, __entry->exit_code,
+ kvm_x86_ops->exit_reasons_str),
+ __entry->exit_info1, __entry->exit_info2,
+ __entry->exit_int_info, __entry->exit_int_info_err)
+);
+
+/*
+ * Tracepoint for #VMEXIT reinjected to the guest
+ */
+TRACE_EVENT(kvm_nested_vmexit_inject,
+ TP_PROTO(__u32 exit_code,
+ __u64 exit_info1, __u64 exit_info2,
+ __u32 exit_int_info, __u32 exit_int_info_err),
+ TP_ARGS(exit_code, exit_info1, exit_info2,
+ exit_int_info, exit_int_info_err),
+
+ TP_STRUCT__entry(
+ __field( __u32, exit_code )
+ __field( __u64, exit_info1 )
+ __field( __u64, exit_info2 )
+ __field( __u32, exit_int_info )
+ __field( __u32, exit_int_info_err )
+ ),
+
+ TP_fast_assign(
+ __entry->exit_code = exit_code;
+ __entry->exit_info1 = exit_info1;
+ __entry->exit_info2 = exit_info2;
+ __entry->exit_int_info = exit_int_info;
+ __entry->exit_int_info_err = exit_int_info_err;
+ ),
+
+ TP_printk("reason: %s ext_inf1: 0x%016llx "
+ "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
+ ftrace_print_symbols_seq(p, __entry->exit_code,
+ kvm_x86_ops->exit_reasons_str),
+ __entry->exit_info1, __entry->exit_info2,
+ __entry->exit_int_info, __entry->exit_int_info_err)
+);
+
+/*
+ * Tracepoint for nested #vmexit because of interrupt pending
+ */
+TRACE_EVENT(kvm_nested_intr_vmexit,
+ TP_PROTO(__u64 rip),
+ TP_ARGS(rip),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip
+ ),
+
+ TP_printk("rip: 0x%016llx", __entry->rip)
+);
+
+/*
+ * Tracepoint for nested #vmexit because of interrupt pending
+ */
+TRACE_EVENT(kvm_invlpga,
+ TP_PROTO(__u64 rip, int asid, u64 address),
+ TP_ARGS(rip, asid, address),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( int, asid )
+ __field( __u64, address )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip;
+ __entry->asid = asid;
+ __entry->address = address;
+ ),
+
+ TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx",
+ __entry->rip, __entry->asid, __entry->address)
+);
+
+/*
+ * Tracepoint for nested #vmexit because of interrupt pending
+ */
+TRACE_EVENT(kvm_skinit,
+ TP_PROTO(__u64 rip, __u32 slb),
+ TP_ARGS(rip, slb),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( __u32, slb )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip;
+ __entry->slb = slb;
+ ),
+
+ TP_printk("rip: 0x%016llx slb: 0x%08x",
+ __entry->rip, __entry->slb)
+);
+
+#define __print_insn(insn, ilen) ({ \
+ int i; \
+ const char *ret = p->buffer + p->len; \
+ \
+ for (i = 0; i < ilen; ++i) \
+ trace_seq_printf(p, " %02x", insn[i]); \
+ trace_seq_printf(p, "%c", 0); \
+ ret; \
+ })
+
+#define KVM_EMUL_INSN_F_CR0_PE (1 << 0)
+#define KVM_EMUL_INSN_F_EFL_VM (1 << 1)
+#define KVM_EMUL_INSN_F_CS_D (1 << 2)
+#define KVM_EMUL_INSN_F_CS_L (1 << 3)
+
+#define kvm_trace_symbol_emul_flags \
+ { 0, "real" }, \
+ { KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_EFL_VM, "vm16" }, \
+ { KVM_EMUL_INSN_F_CR0_PE, "prot16" }, \
+ { KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_D, "prot32" }, \
+ { KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_L, "prot64" }
+
+#define kei_decode_mode(mode) ({ \
+ u8 flags = 0xff; \
+ switch (mode) { \
+ case X86EMUL_MODE_REAL: \
+ flags = 0; \
+ break; \
+ case X86EMUL_MODE_VM86: \
+ flags = KVM_EMUL_INSN_F_EFL_VM; \
+ break; \
+ case X86EMUL_MODE_PROT16: \
+ flags = KVM_EMUL_INSN_F_CR0_PE; \
+ break; \
+ case X86EMUL_MODE_PROT32: \
+ flags = KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_D; \
+ break; \
+ case X86EMUL_MODE_PROT64: \
+ flags = KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_L; \
+ break; \
+ } \
+ flags; \
+ })
+
+TRACE_EVENT(kvm_emulate_insn,
+ TP_PROTO(struct kvm_vcpu *vcpu, __u8 failed),
+ TP_ARGS(vcpu, failed),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( __u32, csbase )
+ __field( __u8, len )
+ __array( __u8, insn, 15 )
+ __field( __u8, flags )
+ __field( __u8, failed )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start;
+ __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
+ __entry->len = vcpu->arch.emulate_ctxt.decode.eip
+ - vcpu->arch.emulate_ctxt.decode.fetch.start;
+ memcpy(__entry->insn,
+ vcpu->arch.emulate_ctxt.decode.fetch.data,
+ 15);
+ __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode);
+ __entry->failed = failed;
+ ),
+
+ TP_printk("%x:%llx:%s (%s)%s",
+ __entry->csbase, __entry->rip,
+ __print_insn(__entry->insn, __entry->len),
+ __print_symbolic(__entry->flags,
+ kvm_trace_symbol_emul_flags),
+ __entry->failed ? " failed" : ""
+ )
+ );
+
+#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
+#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
+
+#endif /* _TRACE_KVM_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH arch/x86/kvm
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/linux/x86/tss.h b/linux/x86/tss.h
new file mode 100644
index 0000000..622aa10
--- /dev/null
+++ b/linux/x86/tss.h
@@ -0,0 +1,59 @@
+#ifndef __TSS_SEGMENT_H
+#define __TSS_SEGMENT_H
+
+struct tss_segment_32 {
+ u32 prev_task_link;
+ u32 esp0;
+ u32 ss0;
+ u32 esp1;
+ u32 ss1;
+ u32 esp2;
+ u32 ss2;
+ u32 cr3;
+ u32 eip;
+ u32 eflags;
+ u32 eax;
+ u32 ecx;
+ u32 edx;
+ u32 ebx;
+ u32 esp;
+ u32 ebp;
+ u32 esi;
+ u32 edi;
+ u32 es;
+ u32 cs;
+ u32 ss;
+ u32 ds;
+ u32 fs;
+ u32 gs;
+ u32 ldt_selector;
+ u16 t;
+ u16 io_map;
+};
+
+struct tss_segment_16 {
+ u16 prev_task_link;
+ u16 sp0;
+ u16 ss0;
+ u16 sp1;
+ u16 ss1;
+ u16 sp2;
+ u16 ss2;
+ u16 ip;
+ u16 flag;
+ u16 ax;
+ u16 cx;
+ u16 dx;
+ u16 bx;
+ u16 sp;
+ u16 bp;
+ u16 si;
+ u16 di;
+ u16 es;
+ u16 cs;
+ u16 ss;
+ u16 ds;
+ u16 ldt;
+};
+
+#endif
diff --git a/linux/x86/vmx.c b/linux/x86/vmx.c
new file mode 100644
index 0000000..ed4cf97
--- /dev/null
+++ b/linux/x86/vmx.c
@@ -0,0 +1,4549 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "irq.h"
+#include "mmu.h"
+
+#include <linux/kvm_host.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/sched.h>
+#include <linux/moduleparam.h>
+#include <linux/ftrace_event.h>
+#include <linux/slab.h>
+#include <linux/tboot.h>
+#include "kvm_cache_regs.h"
+#include "x86.h"
+
+#include <asm/io.h>
+#include <asm/desc.h>
+#include <asm/vmx.h>
+#include <asm/virtext.h>
+#include <asm/mce.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
+
+#include "trace.h"
+
+#define __ex(x) __kvm_handle_fault_on_reboot(x)
+
+MODULE_INFO(version, "kvm-kmod-2.6.38-rc7");
+MODULE_AUTHOR("Qumranet");
+MODULE_LICENSE("GPL");
+
+static int __read_mostly bypass_guest_pf = 1;
+module_param(bypass_guest_pf, bool, S_IRUGO);
+
+static int __read_mostly enable_vpid = 1;
+module_param_named(vpid, enable_vpid, bool, 0444);
+
+static int __read_mostly flexpriority_enabled = 1;
+module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
+
+static int __read_mostly enable_ept = 1;
+module_param_named(ept, enable_ept, bool, S_IRUGO);
+
+static int __read_mostly enable_unrestricted_guest = 1;
+module_param_named(unrestricted_guest,
+ enable_unrestricted_guest, bool, S_IRUGO);
+
+static int __read_mostly emulate_invalid_guest_state = 0;
+module_param(emulate_invalid_guest_state, bool, S_IRUGO);
+
+static int __read_mostly vmm_exclusive = 1;
+module_param(vmm_exclusive, bool, S_IRUGO);
+
+static int __read_mostly yield_on_hlt = 1;
+module_param(yield_on_hlt, bool, S_IRUGO);
+
+#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
+ (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
+#define KVM_GUEST_CR0_MASK \
+ (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
+#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
+ (X86_CR0_WP | X86_CR0_NE)
+#define KVM_VM_CR0_ALWAYS_ON \
+ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
+#define KVM_CR4_GUEST_OWNED_BITS \
+ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_OSXMMEXCPT)
+
+#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
+#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
+
+#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
+
+/*
+ * These 2 parameters are used to config the controls for Pause-Loop Exiting:
+ * ple_gap: upper bound on the amount of time between two successive
+ * executions of PAUSE in a loop. Also indicate if ple enabled.
+ * According to test, this time is usually small than 41 cycles.
+ * ple_window: upper bound on the amount of time a guest is allowed to execute
+ * in a PAUSE loop. Tests indicate that most spinlocks are held for
+ * less than 2^12 cycles
+ * Time is measured based on a counter that runs at the same rate as the TSC,
+ * refer SDM volume 3b section 21.6.13 & 22.1.3.
+ */
+#define KVM_VMX_DEFAULT_PLE_GAP 41
+#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
+static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
+module_param(ple_gap, int, S_IRUGO);
+
+static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
+module_param(ple_window, int, S_IRUGO);
+
+#define NR_AUTOLOAD_MSRS 1
+
+struct vmcs {
+ u32 revision_id;
+ u32 abort;
+ char data[0];
+};
+
+struct shared_msr_entry {
+ unsigned index;
+ u64 data;
+ u64 mask;
+};
+
+struct vcpu_vmx {
+ struct kvm_vcpu vcpu;
+ struct list_head local_vcpus_link;
+ unsigned long host_rsp;
+ int launched;
+ u8 fail;
+ u32 exit_intr_info;
+ u32 idt_vectoring_info;
+ struct shared_msr_entry *guest_msrs;
+ int nmsrs;
+ int save_nmsrs;
+#ifdef CONFIG_X86_64
+ u64 msr_host_kernel_gs_base;
+ u64 msr_guest_kernel_gs_base;
+#endif
+ struct vmcs *vmcs;
+ struct msr_autoload {
+ unsigned nr;
+ struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
+ struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
+ } msr_autoload;
+ struct {
+ int loaded;
+ u16 fs_sel, gs_sel, ldt_sel;
+ int gs_ldt_reload_needed;
+ int fs_reload_needed;
+ } host_state;
+ struct {
+ int vm86_active;
+ ulong save_rflags;
+ struct kvm_save_segment {
+ u16 selector;
+ unsigned long base;
+ u32 limit;
+ u32 ar;
+ } tr, es, ds, fs, gs;
+ } rmode;
+ int vpid;
+ bool emulation_required;
+
+ /* Support for vnmi-less CPUs */
+ int soft_vnmi_blocked;
+ ktime_t entry_time;
+ s64 vnmi_blocked_time;
+ u32 exit_reason;
+
+ bool rdtscp_enabled;
+};
+
+static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
+{
+ return container_of(vcpu, struct vcpu_vmx, vcpu);
+}
+
+static int init_rmode(struct kvm *kvm);
+static u64 construct_eptp(unsigned long root_hpa);
+static void kvm_cpu_vmxon(u64 addr);
+static void kvm_cpu_vmxoff(void);
+static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
+
+static DEFINE_PER_CPU(struct vmcs *, vmxarea);
+static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
+static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
+static DEFINE_PER_CPU(struct kvm_desc_ptr, host_gdt);
+
+static unsigned long *vmx_io_bitmap_a;
+static unsigned long *vmx_io_bitmap_b;
+static unsigned long *vmx_msr_bitmap_legacy;
+static unsigned long *vmx_msr_bitmap_longmode;
+
+static bool cpu_has_load_ia32_efer;
+
+static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
+static DEFINE_SPINLOCK(vmx_vpid_lock);
+
+static struct vmcs_config {
+ int size;
+ int order;
+ u32 revision_id;
+ u32 pin_based_exec_ctrl;
+ u32 cpu_based_exec_ctrl;
+ u32 cpu_based_2nd_exec_ctrl;
+ u32 vmexit_ctrl;
+ u32 vmentry_ctrl;
+} vmcs_config;
+
+static struct vmx_capability {
+ u32 ept;
+ u32 vpid;
+} vmx_capability;
+
+#define VMX_SEGMENT_FIELD(seg) \
+ [VCPU_SREG_##seg] = { \
+ .selector = GUEST_##seg##_SELECTOR, \
+ .base = GUEST_##seg##_BASE, \
+ .limit = GUEST_##seg##_LIMIT, \
+ .ar_bytes = GUEST_##seg##_AR_BYTES, \
+ }
+
+static struct kvm_vmx_segment_field {
+ unsigned selector;
+ unsigned base;
+ unsigned limit;
+ unsigned ar_bytes;
+} kvm_vmx_segment_fields[] = {
+ VMX_SEGMENT_FIELD(CS),
+ VMX_SEGMENT_FIELD(DS),
+ VMX_SEGMENT_FIELD(ES),
+ VMX_SEGMENT_FIELD(FS),
+ VMX_SEGMENT_FIELD(GS),
+ VMX_SEGMENT_FIELD(SS),
+ VMX_SEGMENT_FIELD(TR),
+ VMX_SEGMENT_FIELD(LDTR),
+};
+
+static u64 host_efer;
+
+static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
+
+/*
+ * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
+ * away by decrementing the array size.
+ */
+static const u32 vmx_msr_index[] = {
+#ifdef CONFIG_X86_64
+ MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
+#endif
+ MSR_EFER, MSR_TSC_AUX, MSR_STAR,
+};
+#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
+
+static inline bool is_page_fault(u32 intr_info)
+{
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+ INTR_INFO_VALID_MASK)) ==
+ (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
+}
+
+static inline bool is_no_device(u32 intr_info)
+{
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+ INTR_INFO_VALID_MASK)) ==
+ (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
+}
+
+static inline bool is_invalid_opcode(u32 intr_info)
+{
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+ INTR_INFO_VALID_MASK)) ==
+ (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
+}
+
+static inline bool is_external_interrupt(u32 intr_info)
+{
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+ == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
+}
+
+static inline bool is_machine_check(u32 intr_info)
+{
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+ INTR_INFO_VALID_MASK)) ==
+ (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
+}
+
+static inline bool cpu_has_vmx_msr_bitmap(void)
+{
+ return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
+}
+
+static inline bool cpu_has_vmx_tpr_shadow(void)
+{
+ return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
+}
+
+static inline bool vm_need_tpr_shadow(struct kvm *kvm)
+{
+ return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
+}
+
+static inline bool cpu_has_secondary_exec_ctrls(void)
+{
+ return vmcs_config.cpu_based_exec_ctrl &
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+}
+
+static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+}
+
+static inline bool cpu_has_vmx_flexpriority(void)
+{
+ return cpu_has_vmx_tpr_shadow() &&
+ cpu_has_vmx_virtualize_apic_accesses();
+}
+
+static inline bool cpu_has_vmx_ept_execute_only(void)
+{
+ return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
+}
+
+static inline bool cpu_has_vmx_eptp_uncacheable(void)
+{
+ return vmx_capability.ept & VMX_EPTP_UC_BIT;
+}
+
+static inline bool cpu_has_vmx_eptp_writeback(void)
+{
+ return vmx_capability.ept & VMX_EPTP_WB_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_2m_page(void)
+{
+ return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_1g_page(void)
+{
+ return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_4levels(void)
+{
+ return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
+}
+
+static inline bool cpu_has_vmx_invept_individual_addr(void)
+{
+ return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
+}
+
+static inline bool cpu_has_vmx_invept_context(void)
+{
+ return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
+}
+
+static inline bool cpu_has_vmx_invept_global(void)
+{
+ return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
+}
+
+static inline bool cpu_has_vmx_invvpid_single(void)
+{
+ return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
+}
+
+static inline bool cpu_has_vmx_invvpid_global(void)
+{
+ return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
+}
+
+static inline bool cpu_has_vmx_ept(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_EPT;
+}
+
+static inline bool cpu_has_vmx_unrestricted_guest(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_UNRESTRICTED_GUEST;
+}
+
+static inline bool cpu_has_vmx_ple(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+}
+
+static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
+{
+ return flexpriority_enabled && irqchip_in_kernel(kvm);
+}
+
+static inline bool cpu_has_vmx_vpid(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_VPID;
+}
+
+static inline bool cpu_has_vmx_rdtscp(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_RDTSCP;
+}
+
+static inline bool cpu_has_virtual_nmis(void)
+{
+ return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
+}
+
+static inline bool cpu_has_vmx_wbinvd_exit(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_WBINVD_EXITING;
+}
+
+static inline bool report_flexpriority(void)
+{
+ return flexpriority_enabled;
+}
+
+static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
+{
+ int i;
+
+ for (i = 0; i < vmx->nmsrs; ++i)
+ if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
+ return i;
+ return -1;
+}
+
+static inline void __invvpid(int ext, u16 vpid, gva_t gva)
+{
+ struct {
+ u64 vpid : 16;
+ u64 rsvd : 48;
+ u64 gva;
+ } operand = { vpid, 0, gva };
+
+ asm volatile (__ex(ASM_VMX_INVVPID)
+ /* CF==1 or ZF==1 --> rc = -1 */
+ "; ja 1f ; ud2 ; 1:"
+ : : "a"(&operand), "c"(ext) : "cc", "memory");
+}
+
+static inline void __invept(int ext, u64 eptp, gpa_t gpa)
+{
+ struct {
+ u64 eptp, gpa;
+ } operand = {eptp, gpa};
+
+ asm volatile (__ex(ASM_VMX_INVEPT)
+ /* CF==1 or ZF==1 --> rc = -1 */
+ "; ja 1f ; ud2 ; 1:\n"
+ : : "a" (&operand), "c" (ext) : "cc", "memory");
+}
+
+static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
+{
+ int i;
+
+ i = __find_msr_index(vmx, msr);
+ if (i >= 0)
+ return &vmx->guest_msrs[i];
+ return NULL;
+}
+
+static void vmcs_clear(struct vmcs *vmcs)
+{
+ u64 phys_addr = __pa(vmcs);
+ u8 error;
+
+ asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
+ : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
+ : "cc", "memory");
+ if (error)
+ printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
+ vmcs, phys_addr);
+}
+
+static void vmcs_load(struct vmcs *vmcs)
+{
+ u64 phys_addr = __pa(vmcs);
+ u8 error;
+
+ asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
+ : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
+ : "cc", "memory");
+ if (error)
+ printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
+ vmcs, phys_addr);
+}
+
+static void __vcpu_clear(void *arg)
+{
+ struct vcpu_vmx *vmx = arg;
+ int cpu = raw_smp_processor_id();
+
+ if (vmx->vcpu.cpu == cpu)
+ vmcs_clear(vmx->vmcs);
+ if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
+ per_cpu(current_vmcs, cpu) = NULL;
+ list_del(&vmx->local_vcpus_link);
+ vmx->vcpu.cpu = -1;
+ vmx->launched = 0;
+}
+
+static void vcpu_clear(struct vcpu_vmx *vmx)
+{
+ if (vmx->vcpu.cpu == -1)
+ return;
+ smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
+}
+
+static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
+{
+ if (vmx->vpid == 0)
+ return;
+
+ if (cpu_has_vmx_invvpid_single())
+ __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
+}
+
+static inline void vpid_sync_vcpu_global(void)
+{
+ if (cpu_has_vmx_invvpid_global())
+ __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
+}
+
+static inline void vpid_sync_context(struct vcpu_vmx *vmx)
+{
+ if (cpu_has_vmx_invvpid_single())
+ vpid_sync_vcpu_single(vmx);
+ else
+ vpid_sync_vcpu_global();
+}
+
+static inline void ept_sync_global(void)
+{
+ if (cpu_has_vmx_invept_global())
+ __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
+}
+
+static inline void ept_sync_context(u64 eptp)
+{
+ if (enable_ept) {
+ if (cpu_has_vmx_invept_context())
+ __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
+ else
+ ept_sync_global();
+ }
+}
+
+static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
+{
+ if (enable_ept) {
+ if (cpu_has_vmx_invept_individual_addr())
+ __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
+ eptp, gpa);
+ else
+ ept_sync_context(eptp);
+ }
+}
+
+static unsigned long vmcs_readl(unsigned long field)
+{
+ unsigned long value = 0;
+
+ asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
+ : "+a"(value) : "d"(field) : "cc");
+ return value;
+}
+
+static u16 vmcs_read16(unsigned long field)
+{
+ return vmcs_readl(field);
+}
+
+static u32 vmcs_read32(unsigned long field)
+{
+ return vmcs_readl(field);
+}
+
+static u64 vmcs_read64(unsigned long field)
+{
+#ifdef CONFIG_X86_64
+ return vmcs_readl(field);
+#else
+ return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
+#endif
+}
+
+static noinline void vmwrite_error(unsigned long field, unsigned long value)
+{
+ printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
+ field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
+ dump_stack();
+}
+
+static void vmcs_writel(unsigned long field, unsigned long value)
+{
+ u8 error;
+
+ asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
+ : "=q"(error) : "a"(value), "d"(field) : "cc");
+ if (unlikely(error))
+ vmwrite_error(field, value);
+}
+
+static void vmcs_write16(unsigned long field, u16 value)
+{
+ vmcs_writel(field, value);
+}
+
+static void vmcs_write32(unsigned long field, u32 value)
+{
+ vmcs_writel(field, value);
+}
+
+static void vmcs_write64(unsigned long field, u64 value)
+{
+ vmcs_writel(field, value);
+#ifndef CONFIG_X86_64
+ asm volatile ("");
+ vmcs_writel(field+1, value >> 32);
+#endif
+}
+
+static void vmcs_clear_bits(unsigned long field, u32 mask)
+{
+ vmcs_writel(field, vmcs_readl(field) & ~mask);
+}
+
+static void vmcs_set_bits(unsigned long field, u32 mask)
+{
+ vmcs_writel(field, vmcs_readl(field) | mask);
+}
+
+static void update_exception_bitmap(struct kvm_vcpu *vcpu)
+{
+ u32 eb;
+
+ eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
+ (1u << NM_VECTOR) | (1u << DB_VECTOR);
+ if ((vcpu->guest_debug &
+ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
+ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
+ eb |= 1u << BP_VECTOR;
+ if (to_vmx(vcpu)->rmode.vm86_active)
+ eb = ~0;
+ if (enable_ept)
+ eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
+ if (vcpu->fpu_active)
+ eb &= ~(1u << NM_VECTOR);
+ vmcs_write32(EXCEPTION_BITMAP, eb);
+}
+
+static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
+{
+ unsigned i;
+ struct msr_autoload *m = &vmx->msr_autoload;
+
+ if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
+ vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
+ vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
+ return;
+ }
+
+ for (i = 0; i < m->nr; ++i)
+ if (m->guest[i].index == msr)
+ break;
+
+ if (i == m->nr)
+ return;
+ --m->nr;
+ m->guest[i] = m->guest[m->nr];
+ m->host[i] = m->host[m->nr];
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
+}
+
+static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
+ u64 guest_val, u64 host_val)
+{
+ unsigned i;
+ struct msr_autoload *m = &vmx->msr_autoload;
+
+ if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
+ vmcs_write64(GUEST_IA32_EFER, guest_val);
+ vmcs_write64(HOST_IA32_EFER, host_val);
+ vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
+ vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
+ return;
+ }
+
+ for (i = 0; i < m->nr; ++i)
+ if (m->guest[i].index == msr)
+ break;
+
+ if (i == m->nr) {
+ ++m->nr;
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
+ }
+
+ m->guest[i].index = msr;
+ m->guest[i].value = guest_val;
+ m->host[i].index = msr;
+ m->host[i].value = host_val;
+}
+
+static void reload_tss(void)
+{
+ /*
+ * VT restores TR but not its size. Useless.
+ */
+ struct kvm_desc_ptr *gdt = &__get_cpu_var(host_gdt);
+ struct kvm_desc_struct *descs;
+
+ descs = (void *)gdt->address;
+ descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
+ load_TR_desc();
+}
+
+static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
+{
+ u64 guest_efer;
+ u64 ignore_bits;
+
+ guest_efer = vmx->vcpu.arch.efer;
+
+ /*
+ * NX is emulated; LMA and LME handled by hardware; SCE meaninless
+ * outside long mode
+ */
+ ignore_bits = EFER_NX | EFER_SCE;
+#ifdef CONFIG_X86_64
+ ignore_bits |= EFER_LMA | EFER_LME;
+ /* SCE is meaningful only in long mode on Intel */
+ if (guest_efer & EFER_LMA)
+ ignore_bits &= ~(u64)EFER_SCE;
+#endif
+ guest_efer &= ~ignore_bits;
+ guest_efer |= host_efer & ignore_bits;
+ vmx->guest_msrs[efer_offset].data = guest_efer;
+ vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
+
+ clear_atomic_switch_msr(vmx, MSR_EFER);
+ /* On ept, can't emulate nx, and must switch nx atomically */
+ if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) {
+ guest_efer = vmx->vcpu.arch.efer;
+ if (!(guest_efer & EFER_LMA))
+ guest_efer &= ~EFER_LME;
+ add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
+ return false;
+ }
+
+ return true;
+}
+
+static unsigned long segment_base(u16 selector)
+{
+ struct kvm_desc_ptr *gdt = &__get_cpu_var(host_gdt);
+ struct kvm_desc_struct *d;
+ unsigned long table_base;
+ unsigned long v;
+
+ if (!(selector & ~3))
+ return 0;
+
+ table_base = gdt->address;
+
+ if (selector & 4) { /* from ldt */
+ u16 ldt_selector = kvm_read_ldt();
+
+ if (!(ldt_selector & ~3))
+ return 0;
+
+ table_base = segment_base(ldt_selector);
+ }
+ d = (struct kvm_desc_struct *)(table_base + (selector & ~7));
+ v = kvm_get_desc_base(d);
+#ifdef CONFIG_X86_64
+ if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
+ v |= ((unsigned long)((struct kvm_ldttss_desc64 *)d)->base3) << 32;
+#endif
+ return v;
+}
+
+static inline unsigned long kvm_read_tr_base(void)
+{
+ u16 tr;
+ asm("str %0" : "=g"(tr));
+ return segment_base(tr);
+}
+
+static void vmx_save_host_state(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int i;
+
+ if (vmx->host_state.loaded)
+ return;
+
+ vmx->host_state.loaded = 1;
+ /*
+ * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
+ * allow segment selectors with cpl > 0 or ti == 1.
+ */
+ vmx->host_state.ldt_sel = kvm_read_ldt();
+ vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
+ savesegment(fs, vmx->host_state.fs_sel);
+ if (!(vmx->host_state.fs_sel & 7)) {
+ vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
+ vmx->host_state.fs_reload_needed = 0;
+ } else {
+ vmcs_write16(HOST_FS_SELECTOR, 0);
+ vmx->host_state.fs_reload_needed = 1;
+ }
+ savesegment(gs, vmx->host_state.gs_sel);
+ if (!(vmx->host_state.gs_sel & 7))
+ vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
+ else {
+ vmcs_write16(HOST_GS_SELECTOR, 0);
+ vmx->host_state.gs_ldt_reload_needed = 1;
+ }
+
+#ifdef CONFIG_X86_64
+ vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
+ vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
+#else
+ vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
+ vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
+#endif
+
+#ifdef CONFIG_X86_64
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+ if (is_long_mode(&vmx->vcpu))
+ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+#endif
+ for (i = 0; i < vmx->save_nmsrs; ++i)
+ kvm_set_shared_msr(vmx->guest_msrs[i].index,
+ vmx->guest_msrs[i].data,
+ vmx->guest_msrs[i].mask);
+}
+
+static void __vmx_load_host_state(struct vcpu_vmx *vmx)
+{
+ if (!vmx->host_state.loaded)
+ return;
+
+ ++vmx->vcpu.stat.host_state_reload;
+ vmx->host_state.loaded = 0;
+#ifdef CONFIG_X86_64
+ if (is_long_mode(&vmx->vcpu))
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+#endif
+ if (vmx->host_state.gs_ldt_reload_needed) {
+ kvm_load_ldt(vmx->host_state.ldt_sel);
+#ifdef CONFIG_X86_64
+ load_gs_index(vmx->host_state.gs_sel);
+#else
+ loadsegment(gs, vmx->host_state.gs_sel);
+#endif
+ }
+ if (vmx->host_state.fs_reload_needed)
+ loadsegment(fs, vmx->host_state.fs_sel);
+ reload_tss();
+#ifdef CONFIG_X86_64
+ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+#endif
+ if (current_thread_info()->status & TS_USEDFPU)
+ clts();
+ kvm_load_gdt(&__get_cpu_var(host_gdt));
+}
+
+static void vmx_load_host_state(struct vcpu_vmx *vmx)
+{
+ preempt_disable();
+ __vmx_load_host_state(vmx);
+ preempt_enable();
+}
+
+/*
+ * Switches to specified vcpu, until a matching vcpu_put(), but assumes
+ * vcpu mutex is already taken.
+ */
+static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+
+ if (!vmm_exclusive)
+ kvm_cpu_vmxon(phys_addr);
+ else if (vcpu->cpu != cpu)
+ vcpu_clear(vmx);
+
+ if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
+ per_cpu(current_vmcs, cpu) = vmx->vmcs;
+ vmcs_load(vmx->vmcs);
+ }
+
+ if (vcpu->cpu != cpu) {
+ struct kvm_desc_ptr *gdt = &__get_cpu_var(host_gdt);
+ unsigned long sysenter_esp;
+
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ local_irq_disable();
+ list_add(&vmx->local_vcpus_link,
+ &per_cpu(vcpus_on_cpu, cpu));
+ local_irq_enable();
+
+ /*
+ * Linux uses per-cpu TSS and GDT, so set these when switching
+ * processors.
+ */
+ vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
+ vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */
+
+ rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
+ vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+ }
+}
+
+static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ __vmx_load_host_state(to_vmx(vcpu));
+ if (!vmm_exclusive) {
+ __vcpu_clear(to_vmx(vcpu));
+ kvm_cpu_vmxoff();
+ }
+}
+
+static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
+{
+ ulong cr0;
+
+ if (vcpu->fpu_active)
+ return;
+ vcpu->fpu_active = 1;
+ cr0 = vmcs_readl(GUEST_CR0);
+ cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
+ cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
+ vmcs_writel(GUEST_CR0, cr0);
+ update_exception_bitmap(vcpu);
+ vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
+}
+
+static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
+
+static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
+{
+ vmx_decache_cr0_guest_bits(vcpu);
+ vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
+ update_exception_bitmap(vcpu);
+ vcpu->arch.cr0_guest_owned_bits = 0;
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
+ vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
+}
+
+static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
+{
+ unsigned long rflags, save_rflags;
+
+ rflags = vmcs_readl(GUEST_RFLAGS);
+ if (to_vmx(vcpu)->rmode.vm86_active) {
+ rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
+ save_rflags = to_vmx(vcpu)->rmode.save_rflags;
+ rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
+ }
+ return rflags;
+}
+
+static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ if (to_vmx(vcpu)->rmode.vm86_active) {
+ to_vmx(vcpu)->rmode.save_rflags = rflags;
+ rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
+ }
+ vmcs_writel(GUEST_RFLAGS, rflags);
+}
+
+static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+ u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+ int ret = 0;
+
+ if (interruptibility & GUEST_INTR_STATE_STI)
+ ret |= KVM_X86_SHADOW_INT_STI;
+ if (interruptibility & GUEST_INTR_STATE_MOV_SS)
+ ret |= KVM_X86_SHADOW_INT_MOV_SS;
+
+ return ret & mask;
+}
+
+static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+ u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+ u32 interruptibility = interruptibility_old;
+
+ interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
+
+ if (mask & KVM_X86_SHADOW_INT_MOV_SS)
+ interruptibility |= GUEST_INTR_STATE_MOV_SS;
+ else if (mask & KVM_X86_SHADOW_INT_STI)
+ interruptibility |= GUEST_INTR_STATE_STI;
+
+ if ((interruptibility != interruptibility_old))
+ vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
+}
+
+static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ unsigned long rip;
+
+ rip = kvm_rip_read(vcpu);
+ rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ kvm_rip_write(vcpu, rip);
+
+ /* skipping an emulated instruction also counts */
+ vmx_set_interrupt_shadow(vcpu, 0);
+}
+
+static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
+{
+ /* Ensure that we clear the HLT state in the VMCS. We don't need to
+ * explicitly skip the instruction because if the HLT state is set, then
+ * the instruction is already executing and RIP has already been
+ * advanced. */
+ if (!yield_on_hlt &&
+ vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
+ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+}
+
+static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
+ bool has_error_code, u32 error_code,
+ bool reinject)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 intr_info = nr | INTR_INFO_VALID_MASK;
+
+ if (has_error_code) {
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
+ intr_info |= INTR_INFO_DELIVER_CODE_MASK;
+ }
+
+ if (vmx->rmode.vm86_active) {
+ if (kvm_inject_realmode_interrupt(vcpu, nr) != EMULATE_DONE)
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return;
+ }
+
+ if (kvm_exception_is_soft(nr)) {
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+ vmx->vcpu.arch.event_exit_inst_len);
+ intr_info |= INTR_TYPE_SOFT_EXCEPTION;
+ } else
+ intr_info |= INTR_TYPE_HARD_EXCEPTION;
+
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
+ vmx_clear_hlt(vcpu);
+}
+
+static bool vmx_rdtscp_supported(void)
+{
+ return cpu_has_vmx_rdtscp();
+}
+
+/*
+ * Swap MSR entry in host/guest MSR entry array.
+ */
+static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
+{
+ struct shared_msr_entry tmp;
+
+ tmp = vmx->guest_msrs[to];
+ vmx->guest_msrs[to] = vmx->guest_msrs[from];
+ vmx->guest_msrs[from] = tmp;
+}
+
+/*
+ * Set up the vmcs to automatically save and restore system
+ * msrs. Don't touch the 64-bit msrs if the guest is in legacy
+ * mode, as fiddling with msrs is very expensive.
+ */
+static void setup_msrs(struct vcpu_vmx *vmx)
+{
+ int save_nmsrs, index;
+ unsigned long *msr_bitmap;
+
+ vmx_load_host_state(vmx);
+ save_nmsrs = 0;
+#ifdef CONFIG_X86_64
+ if (is_long_mode(&vmx->vcpu)) {
+ index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
+ if (index >= 0)
+ move_msr_up(vmx, index, save_nmsrs++);
+ index = __find_msr_index(vmx, MSR_LSTAR);
+ if (index >= 0)
+ move_msr_up(vmx, index, save_nmsrs++);
+ index = __find_msr_index(vmx, MSR_CSTAR);
+ if (index >= 0)
+ move_msr_up(vmx, index, save_nmsrs++);
+ index = __find_msr_index(vmx, MSR_TSC_AUX);
+ if (index >= 0 && vmx->rdtscp_enabled)
+ move_msr_up(vmx, index, save_nmsrs++);
+ /*
+ * MSR_STAR is only needed on long mode guests, and only
+ * if efer.sce is enabled.
+ */
+ index = __find_msr_index(vmx, MSR_STAR);
+ if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
+ move_msr_up(vmx, index, save_nmsrs++);
+ }
+#endif
+ index = __find_msr_index(vmx, MSR_EFER);
+ if (index >= 0 && update_transition_efer(vmx, index))
+ move_msr_up(vmx, index, save_nmsrs++);
+
+ vmx->save_nmsrs = save_nmsrs;
+
+ if (cpu_has_vmx_msr_bitmap()) {
+ if (is_long_mode(&vmx->vcpu))
+ msr_bitmap = vmx_msr_bitmap_longmode;
+ else
+ msr_bitmap = vmx_msr_bitmap_legacy;
+
+ vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
+ }
+}
+
+/*
+ * reads and returns guest's timestamp counter "register"
+ * guest_tsc = host_tsc + tsc_offset -- 21.3
+ */
+static u64 guest_read_tsc(void)
+{
+ u64 host_tsc, tsc_offset;
+
+ rdtscll(host_tsc);
+ tsc_offset = vmcs_read64(TSC_OFFSET);
+ return host_tsc + tsc_offset;
+}
+
+/*
+ * writes 'offset' into guest's timestamp counter offset register
+ */
+static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+ vmcs_write64(TSC_OFFSET, offset);
+}
+
+static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+{
+ u64 offset = vmcs_read64(TSC_OFFSET);
+ vmcs_write64(TSC_OFFSET, offset + adjustment);
+}
+
+/*
+ * Reads an msr value (of 'msr_index') into 'pdata'.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+{
+ u64 data;
+ struct shared_msr_entry *msr;
+
+ if (!pdata) {
+ printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
+ return -EINVAL;
+ }
+
+ switch (msr_index) {
+#ifdef CONFIG_X86_64
+ case MSR_FS_BASE:
+ data = vmcs_readl(GUEST_FS_BASE);
+ break;
+ case MSR_GS_BASE:
+ data = vmcs_readl(GUEST_GS_BASE);
+ break;
+ case MSR_KERNEL_GS_BASE:
+ vmx_load_host_state(to_vmx(vcpu));
+ data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
+ break;
+#endif
+ case MSR_EFER:
+ return kvm_get_msr_common(vcpu, msr_index, pdata);
+ case MSR_IA32_TSC:
+ data = guest_read_tsc();
+ break;
+ case MSR_IA32_SYSENTER_CS:
+ data = vmcs_read32(GUEST_SYSENTER_CS);
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ data = vmcs_readl(GUEST_SYSENTER_EIP);
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ data = vmcs_readl(GUEST_SYSENTER_ESP);
+ break;
+ case MSR_TSC_AUX:
+ if (!to_vmx(vcpu)->rdtscp_enabled)
+ return 1;
+ /* Otherwise falls through */
+ default:
+ vmx_load_host_state(to_vmx(vcpu));
+ msr = find_msr_entry(to_vmx(vcpu), msr_index);
+ if (msr) {
+ vmx_load_host_state(to_vmx(vcpu));
+ data = msr->data;
+ break;
+ }
+ return kvm_get_msr_common(vcpu, msr_index, pdata);
+ }
+
+ *pdata = data;
+ return 0;
+}
+
+/*
+ * Writes msr value into into the appropriate "register".
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct shared_msr_entry *msr;
+ int ret = 0;
+
+ switch (msr_index) {
+ case MSR_EFER:
+ vmx_load_host_state(vmx);
+ ret = kvm_set_msr_common(vcpu, msr_index, data);
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_FS_BASE:
+ vmcs_writel(GUEST_FS_BASE, data);
+ break;
+ case MSR_GS_BASE:
+ vmcs_writel(GUEST_GS_BASE, data);
+ break;
+ case MSR_KERNEL_GS_BASE:
+ vmx_load_host_state(vmx);
+ vmx->msr_guest_kernel_gs_base = data;
+ break;
+#endif
+ case MSR_IA32_SYSENTER_CS:
+ vmcs_write32(GUEST_SYSENTER_CS, data);
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ vmcs_writel(GUEST_SYSENTER_EIP, data);
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ vmcs_writel(GUEST_SYSENTER_ESP, data);
+ break;
+ case MSR_IA32_TSC:
+ kvm_write_tsc(vcpu, data);
+ break;
+ case MSR_IA32_CR_PAT:
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+ vmcs_write64(GUEST_IA32_PAT, data);
+ vcpu->arch.pat = data;
+ break;
+ }
+ ret = kvm_set_msr_common(vcpu, msr_index, data);
+ break;
+ case MSR_TSC_AUX:
+ if (!vmx->rdtscp_enabled)
+ return 1;
+ /* Check reserved bit, higher 32 bits should be zero */
+ if ((data >> 32) != 0)
+ return 1;
+ /* Otherwise falls through */
+ default:
+ msr = find_msr_entry(vmx, msr_index);
+ if (msr) {
+ vmx_load_host_state(vmx);
+ msr->data = data;
+ break;
+ }
+ ret = kvm_set_msr_common(vcpu, msr_index, data);
+ }
+
+ return ret;
+}
+
+static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+{
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+ switch (reg) {
+ case VCPU_REGS_RSP:
+ vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
+ break;
+ case VCPU_REGS_RIP:
+ vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
+ break;
+ case VCPU_EXREG_PDPTR:
+ if (enable_ept)
+ ept_save_pdptrs(vcpu);
+ break;
+ default:
+ break;
+ }
+}
+
+static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
+{
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
+ else
+ vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
+
+ update_exception_bitmap(vcpu);
+}
+
+static __init int cpu_has_kvm_support(void)
+{
+ return cpu_has_vmx();
+}
+
+static __init int vmx_disabled_by_bios(void)
+{
+ u64 msr;
+
+ rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
+ if (msr & FEATURE_CONTROL_LOCKED) {
+ if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
+ && kvm_tboot_enabled())
+ return 1;
+ if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
+ && !kvm_tboot_enabled()) {
+ printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
+ " activate TXT before enabling KVM\n");
+ return 1;
+ }
+ }
+
+ return 0;
+ /* locked but not enabled */
+}
+
+static void kvm_cpu_vmxon(u64 addr)
+{
+ asm volatile (ASM_VMX_VMXON_RAX
+ : : "a"(&addr), "m"(addr)
+ : "memory", "cc");
+}
+
+static int hardware_enable(void *garbage)
+{
+ int cpu = raw_smp_processor_id();
+ u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+ u64 old, test_bits;
+
+ if (read_cr4() & X86_CR4_VMXE)
+ return -EBUSY;
+
+ INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
+ rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
+
+ test_bits = FEATURE_CONTROL_LOCKED;
+ test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+ if (kvm_tboot_enabled())
+ test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
+
+ if ((old & test_bits) != test_bits) {
+ /* enable and lock */
+ wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
+ }
+ write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
+
+ if (vmm_exclusive) {
+ kvm_cpu_vmxon(phys_addr);
+ ept_sync_global();
+ }
+
+ kvm_store_gdt(&__get_cpu_var(host_gdt));
+
+ return 0;
+}
+
+static void vmclear_local_vcpus(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct vcpu_vmx *vmx, *n;
+
+ list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
+ local_vcpus_link)
+ __vcpu_clear(vmx);
+}
+
+
+/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
+ * tricks.
+ */
+static void kvm_cpu_vmxoff(void)
+{
+ asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
+}
+
+static void hardware_disable(void *garbage)
+{
+ if (vmm_exclusive) {
+ vmclear_local_vcpus();
+ kvm_cpu_vmxoff();
+ }
+ write_cr4(read_cr4() & ~X86_CR4_VMXE);
+}
+
+static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
+ u32 msr, u32 *result)
+{
+ u32 vmx_msr_low, vmx_msr_high;
+ u32 ctl = ctl_min | ctl_opt;
+
+ rdmsr(msr, vmx_msr_low, vmx_msr_high);
+
+ ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
+ ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
+
+ /* Ensure minimum (required) set of control bits are supported. */
+ if (ctl_min & ~ctl)
+ return -EIO;
+
+ *result = ctl;
+ return 0;
+}
+
+static __init bool allow_1_setting(u32 msr, u32 ctl)
+{
+ u32 vmx_msr_low, vmx_msr_high;
+
+ rdmsr(msr, vmx_msr_low, vmx_msr_high);
+ return vmx_msr_high & ctl;
+}
+
+static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
+{
+ u32 vmx_msr_low, vmx_msr_high;
+ u32 min, opt, min2, opt2;
+ u32 _pin_based_exec_control = 0;
+ u32 _cpu_based_exec_control = 0;
+ u32 _cpu_based_2nd_exec_control = 0;
+ u32 _vmexit_control = 0;
+ u32 _vmentry_control = 0;
+
+ min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
+ opt = PIN_BASED_VIRTUAL_NMIS;
+ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
+ &_pin_based_exec_control) < 0)
+ return -EIO;
+
+ min =
+#ifdef CONFIG_X86_64
+ CPU_BASED_CR8_LOAD_EXITING |
+ CPU_BASED_CR8_STORE_EXITING |
+#endif
+ CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING |
+ CPU_BASED_USE_IO_BITMAPS |
+ CPU_BASED_MOV_DR_EXITING |
+ CPU_BASED_USE_TSC_OFFSETING |
+ CPU_BASED_MWAIT_EXITING |
+ CPU_BASED_MONITOR_EXITING |
+ CPU_BASED_INVLPG_EXITING;
+
+ if (yield_on_hlt)
+ min |= CPU_BASED_HLT_EXITING;
+
+ opt = CPU_BASED_TPR_SHADOW |
+ CPU_BASED_USE_MSR_BITMAPS |
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
+ &_cpu_based_exec_control) < 0)
+ return -EIO;
+#ifdef CONFIG_X86_64
+ if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
+ _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
+ ~CPU_BASED_CR8_STORE_EXITING;
+#endif
+ if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
+ min2 = 0;
+ opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_WBINVD_EXITING |
+ SECONDARY_EXEC_ENABLE_VPID |
+ SECONDARY_EXEC_ENABLE_EPT |
+ SECONDARY_EXEC_UNRESTRICTED_GUEST |
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING |
+ SECONDARY_EXEC_RDTSCP;
+ if (adjust_vmx_controls(min2, opt2,
+ MSR_IA32_VMX_PROCBASED_CTLS2,
+ &_cpu_based_2nd_exec_control) < 0)
+ return -EIO;
+ }
+#ifndef CONFIG_X86_64
+ if (!(_cpu_based_2nd_exec_control &
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+ _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
+#endif
+ if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
+ /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
+ enabled */
+ _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING |
+ CPU_BASED_INVLPG_EXITING);
+ rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
+ vmx_capability.ept, vmx_capability.vpid);
+ }
+
+ min = 0;
+#ifdef CONFIG_X86_64
+ min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
+#endif
+ opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
+ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
+ &_vmexit_control) < 0)
+ return -EIO;
+
+ min = 0;
+ opt = VM_ENTRY_LOAD_IA32_PAT;
+ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
+ &_vmentry_control) < 0)
+ return -EIO;
+
+ rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
+
+ /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
+ if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
+ return -EIO;
+
+#ifdef CONFIG_X86_64
+ /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
+ if (vmx_msr_high & (1u<<16))
+ return -EIO;
+#endif
+
+ /* Require Write-Back (WB) memory type for VMCS accesses. */
+ if (((vmx_msr_high >> 18) & 15) != 6)
+ return -EIO;
+
+ vmcs_conf->size = vmx_msr_high & 0x1fff;
+ vmcs_conf->order = get_order(vmcs_config.size);
+ vmcs_conf->revision_id = vmx_msr_low;
+
+ vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
+ vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
+ vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
+ vmcs_conf->vmexit_ctrl = _vmexit_control;
+ vmcs_conf->vmentry_ctrl = _vmentry_control;
+
+ cpu_has_load_ia32_efer =
+ allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
+ VM_ENTRY_LOAD_IA32_EFER)
+ && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
+ VM_EXIT_LOAD_IA32_EFER);
+
+ return 0;
+}
+
+static struct vmcs *alloc_vmcs_cpu(int cpu)
+{
+ int node = cpu_to_node(cpu);
+ struct page *pages;
+ struct vmcs *vmcs;
+
+ pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
+ if (!pages)
+ return NULL;
+ vmcs = page_address(pages);
+ memset(vmcs, 0, vmcs_config.size);
+ vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
+ return vmcs;
+}
+
+static struct vmcs *alloc_vmcs(void)
+{
+ return alloc_vmcs_cpu(raw_smp_processor_id());
+}
+
+static void free_vmcs(struct vmcs *vmcs)
+{
+ free_pages((unsigned long)vmcs, vmcs_config.order);
+}
+
+static void free_kvm_area(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ free_vmcs(per_cpu(vmxarea, cpu));
+ per_cpu(vmxarea, cpu) = NULL;
+ }
+}
+
+static __init int alloc_kvm_area(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct vmcs *vmcs;
+
+ vmcs = alloc_vmcs_cpu(cpu);
+ if (!vmcs) {
+ free_kvm_area();
+ return -ENOMEM;
+ }
+
+ per_cpu(vmxarea, cpu) = vmcs;
+ }
+ return 0;
+}
+
+static __init int hardware_setup(void)
+{
+ if (setup_vmcs_config(&vmcs_config) < 0)
+ return -EIO;
+
+ if (boot_cpu_has(X86_FEATURE_NX))
+ kvm_enable_efer_bits(EFER_NX);
+
+ if (!cpu_has_vmx_vpid())
+ enable_vpid = 0;
+
+ if (!cpu_has_vmx_ept() ||
+ !cpu_has_vmx_ept_4levels()) {
+ enable_ept = 0;
+ enable_unrestricted_guest = 0;
+ }
+
+ if (!cpu_has_vmx_unrestricted_guest())
+ enable_unrestricted_guest = 0;
+
+ if (!cpu_has_vmx_flexpriority())
+ flexpriority_enabled = 0;
+
+ if (!cpu_has_vmx_tpr_shadow())
+ kvm_x86_ops->update_cr8_intercept = NULL;
+
+ if (enable_ept && !cpu_has_vmx_ept_2m_page())
+ kvm_disable_largepages();
+
+ if (!cpu_has_vmx_ple())
+ ple_gap = 0;
+
+ return alloc_kvm_area();
+}
+
+static __exit void hardware_unsetup(void)
+{
+ free_kvm_area();
+}
+
+static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
+{
+ struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+
+ if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
+ vmcs_write16(sf->selector, save->selector);
+ vmcs_writel(sf->base, save->base);
+ vmcs_write32(sf->limit, save->limit);
+ vmcs_write32(sf->ar_bytes, save->ar);
+ } else {
+ u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
+ << AR_DPL_SHIFT;
+ vmcs_write32(sf->ar_bytes, 0x93 | dpl);
+ }
+}
+
+static void enter_pmode(struct kvm_vcpu *vcpu)
+{
+ unsigned long flags;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ vmx->emulation_required = 1;
+ vmx->rmode.vm86_active = 0;
+
+ vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
+ vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
+ vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
+
+ flags = vmcs_readl(GUEST_RFLAGS);
+ flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
+ flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
+ vmcs_writel(GUEST_RFLAGS, flags);
+
+ vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
+ (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
+
+ update_exception_bitmap(vcpu);
+
+ if (emulate_invalid_guest_state)
+ return;
+
+ fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es);
+ fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds);
+ fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs);
+ fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs);
+
+ vmcs_write16(GUEST_SS_SELECTOR, 0);
+ vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
+
+ vmcs_write16(GUEST_CS_SELECTOR,
+ vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
+ vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
+}
+
+static gva_t rmode_tss_base(struct kvm *kvm)
+{
+ if (!kvm->arch.tss_addr) {
+ struct kvm_memslots *slots;
+ gfn_t base_gfn;
+
+ slots = kvm_memslots(kvm);
+ base_gfn = slots->memslots[0].base_gfn +
+ kvm->memslots->memslots[0].npages - 3;
+ return base_gfn << PAGE_SHIFT;
+ }
+ return kvm->arch.tss_addr;
+}
+
+static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
+{
+ struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+
+ save->selector = vmcs_read16(sf->selector);
+ save->base = vmcs_readl(sf->base);
+ save->limit = vmcs_read32(sf->limit);
+ save->ar = vmcs_read32(sf->ar_bytes);
+ vmcs_write16(sf->selector, save->base >> 4);
+ vmcs_write32(sf->base, save->base & 0xffff0);
+ vmcs_write32(sf->limit, 0xffff);
+ vmcs_write32(sf->ar_bytes, 0xf3);
+ if (save->base & 0xf)
+ printk_once(KERN_WARNING "kvm: segment base is not paragraph"
+ " aligned when entering protected mode (seg=%d)",
+ seg);
+}
+
+static void enter_rmode(struct kvm_vcpu *vcpu)
+{
+ unsigned long flags;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (enable_unrestricted_guest)
+ return;
+
+ vmx->emulation_required = 1;
+ vmx->rmode.vm86_active = 1;
+
+ vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
+ vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
+
+ vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
+ vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
+
+ vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
+ vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
+
+ flags = vmcs_readl(GUEST_RFLAGS);
+ vmx->rmode.save_rflags = flags;
+
+ flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
+
+ vmcs_writel(GUEST_RFLAGS, flags);
+ vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
+ update_exception_bitmap(vcpu);
+
+ if (emulate_invalid_guest_state)
+ goto continue_rmode;
+
+ vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
+ vmcs_write32(GUEST_SS_LIMIT, 0xffff);
+ vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
+
+ vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
+ vmcs_write32(GUEST_CS_LIMIT, 0xffff);
+ if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
+ vmcs_writel(GUEST_CS_BASE, 0xf0000);
+ vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
+
+ fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
+ fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
+ fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
+ fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
+
+continue_rmode:
+ kvm_mmu_reset_context(vcpu);
+ init_rmode(vcpu->kvm);
+}
+
+static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
+
+ if (!msr)
+ return;
+
+ /*
+ * Force kernel_gs_base reloading before EFER changes, as control
+ * of this msr depends on is_long_mode().
+ */
+ vmx_load_host_state(to_vmx(vcpu));
+ vcpu->arch.efer = efer;
+ if (efer & EFER_LMA) {
+ vmcs_write32(VM_ENTRY_CONTROLS,
+ vmcs_read32(VM_ENTRY_CONTROLS) |
+ VM_ENTRY_IA32E_MODE);
+ msr->data = efer;
+ } else {
+ vmcs_write32(VM_ENTRY_CONTROLS,
+ vmcs_read32(VM_ENTRY_CONTROLS) &
+ ~VM_ENTRY_IA32E_MODE);
+
+ msr->data = efer & ~EFER_LME;
+ }
+ setup_msrs(vmx);
+}
+
+#ifdef CONFIG_X86_64
+
+static void enter_lmode(struct kvm_vcpu *vcpu)
+{
+ u32 guest_tr_ar;
+
+ guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
+ if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
+ printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
+ __func__);
+ vmcs_write32(GUEST_TR_AR_BYTES,
+ (guest_tr_ar & ~AR_TYPE_MASK)
+ | AR_TYPE_BUSY_64_TSS);
+ }
+ vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
+}
+
+static void exit_lmode(struct kvm_vcpu *vcpu)
+{
+ vmcs_write32(VM_ENTRY_CONTROLS,
+ vmcs_read32(VM_ENTRY_CONTROLS)
+ & ~VM_ENTRY_IA32E_MODE);
+ vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
+}
+
+#endif
+
+static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ vpid_sync_context(to_vmx(vcpu));
+ if (enable_ept) {
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+ ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
+ }
+}
+
+static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
+{
+ ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
+
+ vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
+ vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
+}
+
+static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
+{
+ if (enable_ept && is_paging(vcpu))
+ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+}
+
+static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
+{
+ ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
+
+ vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
+ vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
+}
+
+static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
+{
+ if (!test_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_dirty))
+ return;
+
+ if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
+ vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]);
+ vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]);
+ vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]);
+ vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]);
+ }
+}
+
+static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
+{
+ if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
+ vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
+ vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
+ vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
+ vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
+ }
+
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail);
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_dirty);
+}
+
+static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+
+static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
+ unsigned long cr0,
+ struct kvm_vcpu *vcpu)
+{
+ vmx_decache_cr3(vcpu);
+ if (!(cr0 & X86_CR0_PG)) {
+ /* From paging/starting to nonpaging */
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
+ vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
+ (CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING));
+ vcpu->arch.cr0 = cr0;
+ vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
+ } else if (!is_paging(vcpu)) {
+ /* From nonpaging to paging */
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
+ vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
+ ~(CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING));
+ vcpu->arch.cr0 = cr0;
+ vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
+ }
+
+ if (!(cr0 & X86_CR0_WP))
+ *hw_cr0 &= ~X86_CR0_WP;
+}
+
+static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long hw_cr0;
+
+ if (enable_unrestricted_guest)
+ hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST)
+ | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
+ else
+ hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
+
+ if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
+ enter_pmode(vcpu);
+
+ if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
+ enter_rmode(vcpu);
+
+#ifdef CONFIG_X86_64
+ if (vcpu->arch.efer & EFER_LME) {
+ if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
+ enter_lmode(vcpu);
+ if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
+ exit_lmode(vcpu);
+ }
+#endif
+
+ if (enable_ept)
+ ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
+
+ if (!vcpu->fpu_active)
+ hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
+
+ vmcs_writel(CR0_READ_SHADOW, cr0);
+ vmcs_writel(GUEST_CR0, hw_cr0);
+ vcpu->arch.cr0 = cr0;
+}
+
+static u64 construct_eptp(unsigned long root_hpa)
+{
+ u64 eptp;
+
+ /* TODO write the value reading from MSR */
+ eptp = VMX_EPT_DEFAULT_MT |
+ VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
+ eptp |= (root_hpa & PAGE_MASK);
+
+ return eptp;
+}
+
+static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ unsigned long guest_cr3;
+ u64 eptp;
+
+ guest_cr3 = cr3;
+ if (enable_ept) {
+ eptp = construct_eptp(cr3);
+ vmcs_write64(EPT_POINTER, eptp);
+ guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) :
+ vcpu->kvm->arch.ept_identity_map_addr;
+ ept_load_pdptrs(vcpu);
+ }
+
+ vmx_flush_tlb(vcpu);
+ vmcs_writel(GUEST_CR3, guest_cr3);
+}
+
+static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
+ KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
+
+ vcpu->arch.cr4 = cr4;
+ if (enable_ept) {
+ if (!is_paging(vcpu)) {
+ hw_cr4 &= ~X86_CR4_PAE;
+ hw_cr4 |= X86_CR4_PSE;
+ } else if (!(cr4 & X86_CR4_PAE)) {
+ hw_cr4 &= ~X86_CR4_PAE;
+ }
+ }
+
+ vmcs_writel(CR4_READ_SHADOW, cr4);
+ vmcs_writel(GUEST_CR4, hw_cr4);
+}
+
+static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+
+ return vmcs_readl(sf->base);
+}
+
+static void vmx_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ u32 ar;
+
+ var->base = vmcs_readl(sf->base);
+ var->limit = vmcs_read32(sf->limit);
+ var->selector = vmcs_read16(sf->selector);
+ ar = vmcs_read32(sf->ar_bytes);
+ if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
+ ar = 0;
+ var->type = ar & 15;
+ var->s = (ar >> 4) & 1;
+ var->dpl = (ar >> 5) & 3;
+ var->present = (ar >> 7) & 1;
+ var->avl = (ar >> 12) & 1;
+ var->l = (ar >> 13) & 1;
+ var->db = (ar >> 14) & 1;
+ var->g = (ar >> 15) & 1;
+ var->unusable = (ar >> 16) & 1;
+}
+
+static int vmx_get_cpl(struct kvm_vcpu *vcpu)
+{
+ if (!is_protmode(vcpu))
+ return 0;
+
+ if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
+ return 3;
+
+ return vmcs_read16(GUEST_CS_SELECTOR) & 3;
+}
+
+static u32 vmx_segment_access_rights(struct kvm_segment *var)
+{
+ u32 ar;
+
+ if (var->unusable)
+ ar = 1 << 16;
+ else {
+ ar = var->type & 15;
+ ar |= (var->s & 1) << 4;
+ ar |= (var->dpl & 3) << 5;
+ ar |= (var->present & 1) << 7;
+ ar |= (var->avl & 1) << 12;
+ ar |= (var->l & 1) << 13;
+ ar |= (var->db & 1) << 14;
+ ar |= (var->g & 1) << 15;
+ }
+ if (ar == 0) /* a 0 value means unusable */
+ ar = AR_UNUSABLE_MASK;
+
+ return ar;
+}
+
+static void vmx_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ u32 ar;
+
+ if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
+ vmx->rmode.tr.selector = var->selector;
+ vmx->rmode.tr.base = var->base;
+ vmx->rmode.tr.limit = var->limit;
+ vmx->rmode.tr.ar = vmx_segment_access_rights(var);
+ return;
+ }
+ vmcs_writel(sf->base, var->base);
+ vmcs_write32(sf->limit, var->limit);
+ vmcs_write16(sf->selector, var->selector);
+ if (vmx->rmode.vm86_active && var->s) {
+ /*
+ * Hack real-mode segments into vm86 compatibility.
+ */
+ if (var->base == 0xffff0000 && var->selector == 0xf000)
+ vmcs_writel(sf->base, 0xf0000);
+ ar = 0xf3;
+ } else
+ ar = vmx_segment_access_rights(var);
+
+ /*
+ * Fix the "Accessed" bit in AR field of segment registers for older
+ * qemu binaries.
+ * IA32 arch specifies that at the time of processor reset the
+ * "Accessed" bit in the AR field of segment registers is 1. And qemu
+ * is setting it to 0 in the usedland code. This causes invalid guest
+ * state vmexit when "unrestricted guest" mode is turned on.
+ * Fix for this setup issue in cpu_reset is being pushed in the qemu
+ * tree. Newer qemu binaries with that qemu fix would not need this
+ * kvm hack.
+ */
+ if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
+ ar |= 0x1; /* Accessed */
+
+ vmcs_write32(sf->ar_bytes, ar);
+}
+
+static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+ u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);
+
+ *db = (ar >> 14) & 1;
+ *l = (ar >> 13) & 1;
+}
+
+static void vmx_get_idt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt)
+{
+ dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
+ dt->address = vmcs_readl(GUEST_IDTR_BASE);
+}
+
+static void vmx_set_idt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt)
+{
+ vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
+ vmcs_writel(GUEST_IDTR_BASE, dt->address);
+}
+
+static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt)
+{
+ dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
+ dt->address = vmcs_readl(GUEST_GDTR_BASE);
+}
+
+static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt)
+{
+ vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
+ vmcs_writel(GUEST_GDTR_BASE, dt->address);
+}
+
+static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_segment var;
+ u32 ar;
+
+ vmx_get_segment(vcpu, &var, seg);
+ ar = vmx_segment_access_rights(&var);
+
+ if (var.base != (var.selector << 4))
+ return false;
+ if (var.limit != 0xffff)
+ return false;
+ if (ar != 0xf3)
+ return false;
+
+ return true;
+}
+
+static bool code_segment_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs;
+ unsigned int cs_rpl;
+
+ vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ cs_rpl = cs.selector & SELECTOR_RPL_MASK;
+
+ if (cs.unusable)
+ return false;
+ if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
+ return false;
+ if (!cs.s)
+ return false;
+ if (cs.type & AR_TYPE_WRITEABLE_MASK) {
+ if (cs.dpl > cs_rpl)
+ return false;
+ } else {
+ if (cs.dpl != cs_rpl)
+ return false;
+ }
+ if (!cs.present)
+ return false;
+
+ /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
+ return true;
+}
+
+static bool stack_segment_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment ss;
+ unsigned int ss_rpl;
+
+ vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
+ ss_rpl = ss.selector & SELECTOR_RPL_MASK;
+
+ if (ss.unusable)
+ return true;
+ if (ss.type != 3 && ss.type != 7)
+ return false;
+ if (!ss.s)
+ return false;
+ if (ss.dpl != ss_rpl) /* DPL != RPL */
+ return false;
+ if (!ss.present)
+ return false;
+
+ return true;
+}
+
+static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_segment var;
+ unsigned int rpl;
+
+ vmx_get_segment(vcpu, &var, seg);
+ rpl = var.selector & SELECTOR_RPL_MASK;
+
+ if (var.unusable)
+ return true;
+ if (!var.s)
+ return false;
+ if (!var.present)
+ return false;
+ if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
+ if (var.dpl < rpl) /* DPL < RPL */
+ return false;
+ }
+
+ /* TODO: Add other members to kvm_segment_field to allow checking for other access
+ * rights flags
+ */
+ return true;
+}
+
+static bool tr_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment tr;
+
+ vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
+
+ if (tr.unusable)
+ return false;
+ if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */
+ return false;
+ if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
+ return false;
+ if (!tr.present)
+ return false;
+
+ return true;
+}
+
+static bool ldtr_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment ldtr;
+
+ vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
+
+ if (ldtr.unusable)
+ return true;
+ if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */
+ return false;
+ if (ldtr.type != 2)
+ return false;
+ if (!ldtr.present)
+ return false;
+
+ return true;
+}
+
+static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs, ss;
+
+ vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
+
+ return ((cs.selector & SELECTOR_RPL_MASK) ==
+ (ss.selector & SELECTOR_RPL_MASK));
+}
+
+/*
+ * Check if guest state is valid. Returns true if valid, false if
+ * not.
+ * We assume that registers are always usable
+ */
+static bool guest_state_valid(struct kvm_vcpu *vcpu)
+{
+ /* real mode guest state checks */
+ if (!is_protmode(vcpu)) {
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
+ return false;
+ } else {
+ /* protected mode guest state checks */
+ if (!cs_ss_rpl_check(vcpu))
+ return false;
+ if (!code_segment_valid(vcpu))
+ return false;
+ if (!stack_segment_valid(vcpu))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_DS))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_ES))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_FS))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_GS))
+ return false;
+ if (!tr_valid(vcpu))
+ return false;
+ if (!ldtr_valid(vcpu))
+ return false;
+ }
+ /* TODO:
+ * - Add checks on RIP
+ * - Add checks on RFLAGS
+ */
+
+ return true;
+}
+
+static int init_rmode_tss(struct kvm *kvm)
+{
+ gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
+ u16 data = 0;
+ int ret = 0;
+ int r;
+
+ r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
+ if (r < 0)
+ goto out;
+ data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
+ r = kvm_write_guest_page(kvm, fn++, &data,
+ TSS_IOPB_BASE_OFFSET, sizeof(u16));
+ if (r < 0)
+ goto out;
+ r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
+ if (r < 0)
+ goto out;
+ r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
+ if (r < 0)
+ goto out;
+ data = ~0;
+ r = kvm_write_guest_page(kvm, fn, &data,
+ RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
+ sizeof(u8));
+ if (r < 0)
+ goto out;
+
+ ret = 1;
+out:
+ return ret;
+}
+
+static int init_rmode_identity_map(struct kvm *kvm)
+{
+ int i, r, ret;
+ pfn_t identity_map_pfn;
+ u32 tmp;
+
+ if (!enable_ept)
+ return 1;
+ if (unlikely(!kvm->arch.ept_identity_pagetable)) {
+ printk(KERN_ERR "EPT: identity-mapping pagetable "
+ "haven't been allocated!\n");
+ return 0;
+ }
+ if (likely(kvm->arch.ept_identity_pagetable_done))
+ return 1;
+ ret = 0;
+ identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
+ r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
+ if (r < 0)
+ goto out;
+ /* Set up identity-mapping pagetable for EPT in real mode */
+ for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
+ tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
+ _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
+ r = kvm_write_guest_page(kvm, identity_map_pfn,
+ &tmp, i * sizeof(tmp), sizeof(tmp));
+ if (r < 0)
+ goto out;
+ }
+ kvm->arch.ept_identity_pagetable_done = true;
+ ret = 1;
+out:
+ return ret;
+}
+
+static void seg_setup(int seg)
+{
+ struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ unsigned int ar;
+
+ vmcs_write16(sf->selector, 0);
+ vmcs_writel(sf->base, 0);
+ vmcs_write32(sf->limit, 0xffff);
+ if (enable_unrestricted_guest) {
+ ar = 0x93;
+ if (seg == VCPU_SREG_CS)
+ ar |= 0x08; /* code segment */
+ } else
+ ar = 0xf3;
+
+ vmcs_write32(sf->ar_bytes, ar);
+}
+
+static int alloc_apic_access_page(struct kvm *kvm)
+{
+ struct kvm_userspace_memory_region kvm_userspace_mem;
+ int r = 0;
+
+ mutex_lock(&kvm->slots_lock);
+ if (kvm->arch.apic_access_page)
+ goto out;
+ kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
+ kvm_userspace_mem.flags = 0;
+ kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
+ kvm_userspace_mem.memory_size = PAGE_SIZE;
+ r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
+ if (r)
+ goto out;
+
+ kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
+out:
+ mutex_unlock(&kvm->slots_lock);
+ return r;
+}
+
+static int alloc_identity_pagetable(struct kvm *kvm)
+{
+ struct kvm_userspace_memory_region kvm_userspace_mem;
+ int r = 0;
+
+ mutex_lock(&kvm->slots_lock);
+ if (kvm->arch.ept_identity_pagetable)
+ goto out;
+ kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
+ kvm_userspace_mem.flags = 0;
+ kvm_userspace_mem.guest_phys_addr =
+ kvm->arch.ept_identity_map_addr;
+ kvm_userspace_mem.memory_size = PAGE_SIZE;
+ r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
+ if (r)
+ goto out;
+
+ kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
+ kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
+out:
+ mutex_unlock(&kvm->slots_lock);
+ return r;
+}
+
+static void allocate_vpid(struct vcpu_vmx *vmx)
+{
+ int vpid;
+
+ vmx->vpid = 0;
+ if (!enable_vpid)
+ return;
+ spin_lock(&vmx_vpid_lock);
+ vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
+ if (vpid < VMX_NR_VPIDS) {
+ vmx->vpid = vpid;
+ __set_bit(vpid, vmx_vpid_bitmap);
+ }
+ spin_unlock(&vmx_vpid_lock);
+}
+
+static void free_vpid(struct vcpu_vmx *vmx)
+{
+ if (!enable_vpid)
+ return;
+ spin_lock(&vmx_vpid_lock);
+ if (vmx->vpid != 0)
+ __clear_bit(vmx->vpid, vmx_vpid_bitmap);
+ spin_unlock(&vmx_vpid_lock);
+}
+
+static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
+{
+ int f = sizeof(unsigned long);
+
+ if (!cpu_has_vmx_msr_bitmap())
+ return;
+
+ /*
+ * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+ * have the write-low and read-high bitmap offsets the wrong way round.
+ * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+ */
+ if (msr <= 0x1fff) {
+ __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
+ __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
+ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+ msr &= 0x1fff;
+ __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
+ __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
+ }
+}
+
+static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
+{
+ if (!longmode_only)
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
+}
+
+/*
+ * Sets up the vmcs for emulated real mode.
+ */
+static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
+{
+ u32 host_sysenter_cs, msr_low, msr_high;
+ u32 junk;
+ u64 host_pat;
+ unsigned long a;
+ struct kvm_desc_ptr dt;
+ int i;
+ unsigned long kvm_vmx_return;
+ u32 exec_control;
+
+ /* I/O */
+ vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
+ vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
+
+ vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
+
+ /* Control */
+ vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
+ vmcs_config.pin_based_exec_ctrl);
+
+ exec_control = vmcs_config.cpu_based_exec_ctrl;
+ if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
+ exec_control &= ~CPU_BASED_TPR_SHADOW;
+#ifdef CONFIG_X86_64
+ exec_control |= CPU_BASED_CR8_STORE_EXITING |
+ CPU_BASED_CR8_LOAD_EXITING;
+#endif
+ }
+ if (!enable_ept)
+ exec_control |= CPU_BASED_CR3_STORE_EXITING |
+ CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_INVLPG_EXITING;
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+
+ if (cpu_has_secondary_exec_ctrls()) {
+ exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
+ if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+ exec_control &=
+ ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ if (vmx->vpid == 0)
+ exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
+ if (!enable_ept) {
+ exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+ enable_unrestricted_guest = 0;
+ }
+ if (!enable_unrestricted_guest)
+ exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+ if (!ple_gap)
+ exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ }
+
+ if (ple_gap) {
+ vmcs_write32(PLE_GAP, ple_gap);
+ vmcs_write32(PLE_WINDOW, ple_window);
+ }
+
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
+ vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
+
+ vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */
+ vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
+ vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
+
+ vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
+ vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
+ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
+ vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
+ vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
+ vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
+#ifdef CONFIG_X86_64
+ rdmsrl(MSR_FS_BASE, a);
+ vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
+ rdmsrl(MSR_GS_BASE, a);
+ vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
+#else
+ vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
+ vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
+#endif
+
+ vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
+
+ kvm_native_store_idt(&dt);
+ vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
+
+ asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
+ vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
+ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
+ vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
+ vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
+
+ rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
+ vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
+ rdmsrl(MSR_IA32_SYSENTER_ESP, a);
+ vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
+ rdmsrl(MSR_IA32_SYSENTER_EIP, a);
+ vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
+
+ if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
+ rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
+ host_pat = msr_low | ((u64) msr_high << 32);
+ vmcs_write64(HOST_IA32_PAT, host_pat);
+ }
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+ rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
+ host_pat = msr_low | ((u64) msr_high << 32);
+ /* Write the default value follow host pat */
+ vmcs_write64(GUEST_IA32_PAT, host_pat);
+ /* Keep arch.pat sync with GUEST_IA32_PAT */
+ vmx->vcpu.arch.pat = host_pat;
+ }
+
+ for (i = 0; i < NR_VMX_MSR; ++i) {
+ u32 index = vmx_msr_index[i];
+ u32 data_low, data_high;
+ int j = vmx->nmsrs;
+
+ if (rdmsr_safe(index, &data_low, &data_high) < 0)
+ continue;
+ if (wrmsr_safe(index, data_low, data_high) < 0)
+ continue;
+ vmx->guest_msrs[j].index = i;
+ vmx->guest_msrs[j].data = 0;
+ vmx->guest_msrs[j].mask = -1ull;
+ ++vmx->nmsrs;
+ }
+
+ vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+
+ /* 22.2.1, 20.8.1 */
+ vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
+
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
+ vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
+ if (enable_ept)
+ vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
+ vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
+
+ kvm_write_tsc(&vmx->vcpu, 0);
+
+ return 0;
+}
+
+static int init_rmode(struct kvm *kvm)
+{
+ int idx, ret = 0;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ if (!init_rmode_tss(kvm))
+ goto exit;
+ if (!init_rmode_identity_map(kvm))
+ goto exit;
+
+ ret = 1;
+exit:
+ srcu_read_unlock(&kvm->srcu, idx);
+ return ret;
+}
+
+static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 msr;
+ int ret;
+
+ vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
+ if (!init_rmode(vmx->vcpu.kvm)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ vmx->rmode.vm86_active = 0;
+
+ vmx->soft_vnmi_blocked = 0;
+
+ vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
+ kvm_set_cr8(&vmx->vcpu, 0);
+ msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
+ if (kvm_vcpu_is_bsp(&vmx->vcpu))
+ msr |= MSR_IA32_APICBASE_BSP;
+ kvm_set_apic_base(&vmx->vcpu, msr);
+
+ ret = fx_init(&vmx->vcpu);
+ if (ret != 0)
+ goto out;
+
+ seg_setup(VCPU_SREG_CS);
+ /*
+ * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
+ * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
+ */
+ if (kvm_vcpu_is_bsp(&vmx->vcpu)) {
+ vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
+ vmcs_writel(GUEST_CS_BASE, 0x000f0000);
+ } else {
+ vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
+ vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
+ }
+
+ seg_setup(VCPU_SREG_DS);
+ seg_setup(VCPU_SREG_ES);
+ seg_setup(VCPU_SREG_FS);
+ seg_setup(VCPU_SREG_GS);
+ seg_setup(VCPU_SREG_SS);
+
+ vmcs_write16(GUEST_TR_SELECTOR, 0);
+ vmcs_writel(GUEST_TR_BASE, 0);
+ vmcs_write32(GUEST_TR_LIMIT, 0xffff);
+ vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
+
+ vmcs_write16(GUEST_LDTR_SELECTOR, 0);
+ vmcs_writel(GUEST_LDTR_BASE, 0);
+ vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
+ vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
+
+ vmcs_write32(GUEST_SYSENTER_CS, 0);
+ vmcs_writel(GUEST_SYSENTER_ESP, 0);
+ vmcs_writel(GUEST_SYSENTER_EIP, 0);
+
+ vmcs_writel(GUEST_RFLAGS, 0x02);
+ if (kvm_vcpu_is_bsp(&vmx->vcpu))
+ kvm_rip_write(vcpu, 0xfff0);
+ else
+ kvm_rip_write(vcpu, 0);
+ kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
+
+ vmcs_writel(GUEST_DR7, 0x400);
+
+ vmcs_writel(GUEST_GDTR_BASE, 0);
+ vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
+
+ vmcs_writel(GUEST_IDTR_BASE, 0);
+ vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
+
+ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+ vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
+ vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
+
+ /* Special registers */
+ vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+ setup_msrs(vmx);
+
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
+
+ if (cpu_has_vmx_tpr_shadow()) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
+ if (vm_need_tpr_shadow(vmx->vcpu.kvm))
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
+ page_to_phys(vmx->vcpu.arch.apic->regs_page));
+ vmcs_write32(TPR_THRESHOLD, 0);
+ }
+
+ if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+ vmcs_write64(APIC_ACCESS_ADDR,
+ page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
+
+ if (vmx->vpid != 0)
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+
+ vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
+ vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
+ vmx_set_cr4(&vmx->vcpu, 0);
+ vmx_set_efer(&vmx->vcpu, 0);
+ vmx_fpu_activate(&vmx->vcpu);
+ update_exception_bitmap(&vmx->vcpu);
+
+ vpid_sync_context(vmx);
+
+ ret = 0;
+
+ /* HACK: Don't enable emulation on guest boot/reset */
+ vmx->emulation_required = 0;
+
+out:
+ return ret;
+}
+
+static void enable_irq_window(struct kvm_vcpu *vcpu)
+{
+ u32 cpu_based_vm_exec_control;
+
+ cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+ u32 cpu_based_vm_exec_control;
+
+ if (!cpu_has_virtual_nmis()) {
+ enable_irq_window(vcpu);
+ return;
+ }
+
+ if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
+ enable_irq_window(vcpu);
+ return;
+ }
+ cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
+static void vmx_inject_irq(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ uint32_t intr;
+ int irq = vcpu->arch.interrupt.nr;
+
+ trace_kvm_inj_virq(irq);
+
+ ++vcpu->stat.irq_injections;
+ if (vmx->rmode.vm86_active) {
+ if (kvm_inject_realmode_interrupt(vcpu, irq) != EMULATE_DONE)
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return;
+ }
+ intr = irq | INTR_INFO_VALID_MASK;
+ if (vcpu->arch.interrupt.soft) {
+ intr |= INTR_TYPE_SOFT_INTR;
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+ vmx->vcpu.arch.event_exit_inst_len);
+ } else
+ intr |= INTR_TYPE_EXT_INTR;
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
+ vmx_clear_hlt(vcpu);
+}
+
+static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!cpu_has_virtual_nmis()) {
+ /*
+ * Tracking the NMI-blocked state in software is built upon
+ * finding the next open IRQ window. This, in turn, depends on
+ * well-behaving guests: They have to keep IRQs disabled at
+ * least as long as the NMI handler runs. Otherwise we may
+ * cause NMI nesting, maybe breaking the guest. But as this is
+ * highly unlikely, we can live with the residual risk.
+ */
+ vmx->soft_vnmi_blocked = 1;
+ vmx->vnmi_blocked_time = 0;
+ }
+
+ ++vcpu->stat.nmi_injections;
+ if (vmx->rmode.vm86_active) {
+ if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE)
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return;
+ }
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
+ vmx_clear_hlt(vcpu);
+}
+
+static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
+{
+ if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
+ return 0;
+
+ return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
+ | GUEST_INTR_STATE_NMI));
+}
+
+static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+ if (!cpu_has_virtual_nmis())
+ return to_vmx(vcpu)->soft_vnmi_blocked;
+ return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
+}
+
+static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!cpu_has_virtual_nmis()) {
+ if (vmx->soft_vnmi_blocked != masked) {
+ vmx->soft_vnmi_blocked = masked;
+ vmx->vnmi_blocked_time = 0;
+ }
+ } else {
+ if (masked)
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ else
+ vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ }
+}
+
+static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+ return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+ !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+}
+
+static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
+{
+ int ret;
+ struct kvm_userspace_memory_region tss_mem = {
+ .slot = TSS_PRIVATE_MEMSLOT,
+ .guest_phys_addr = addr,
+ .memory_size = PAGE_SIZE * 3,
+ .flags = 0,
+ };
+
+ ret = kvm_set_memory_region(kvm, &tss_mem, 0);
+ if (ret)
+ return ret;
+ kvm->arch.tss_addr = addr;
+ return 0;
+}
+
+static int handle_rmode_exception(struct kvm_vcpu *vcpu,
+ int vec, u32 err_code)
+{
+ /*
+ * Instruction with address size override prefix opcode 0x67
+ * Cause the #SS fault with 0 error code in VM86 mode.
+ */
+ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
+ if (emulate_instruction(vcpu, 0) == EMULATE_DONE)
+ return 1;
+ /*
+ * Forward all other exceptions that are valid in real mode.
+ * FIXME: Breaks guest debugging in real mode, needs to be fixed with
+ * the required debugging infrastructure rework.
+ */
+ switch (vec) {
+ case DB_VECTOR:
+ if (vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+ return 0;
+ kvm_queue_exception(vcpu, vec);
+ return 1;
+ case BP_VECTOR:
+ /*
+ * Update instruction length as we may reinject the exception
+ * from user space while in guest debugging mode.
+ */
+ to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+ return 0;
+ /* fall through */
+ case DE_VECTOR:
+ case OF_VECTOR:
+ case BR_VECTOR:
+ case UD_VECTOR:
+ case DF_VECTOR:
+ case SS_VECTOR:
+ case GP_VECTOR:
+ case MF_VECTOR:
+ kvm_queue_exception(vcpu, vec);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Trigger machine check on the host. We assume all the MSRs are already set up
+ * by the CPU and that we still run on the same CPU as the MCE occurred on.
+ * We pass a fake environment to the machine check handler because we want
+ * the guest to be always treated like user space, no matter what context
+ * it used internally.
+ */
+static void kvm_machine_check(void)
+{
+#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
+ struct pt_regs regs = {
+ .kvm_pt_regs_cs = 3, /* Fake ring 3 no matter what the guest ran on */
+ .kvm_pt_regs_flags = X86_EFLAGS_IF,
+ };
+
+ kvm_do_machine_check(&regs, 0);
+#endif
+}
+
+static int handle_machine_check(struct kvm_vcpu *vcpu)
+{
+ /* already handled by vcpu_run */
+ return 1;
+}
+
+static int handle_exception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_run *kvm_run = vcpu->run;
+ u32 intr_info, ex_no, error_code;
+ unsigned long cr2, rip, dr6;
+ u32 vect_info;
+ enum emulation_result er;
+
+ vect_info = vmx->idt_vectoring_info;
+ intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ if (is_machine_check(intr_info))
+ return handle_machine_check(vcpu);
+
+ if ((vect_info & VECTORING_INFO_VALID_MASK) &&
+ !is_page_fault(intr_info)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
+ vcpu->run->internal.ndata = 2;
+ vcpu->run->internal.data[0] = vect_info;
+ vcpu->run->internal.data[1] = intr_info;
+ return 0;
+ }
+
+ if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
+ return 1; /* already handled by vmx_vcpu_run() */
+
+ if (is_no_device(intr_info)) {
+ vmx_fpu_activate(vcpu);
+ return 1;
+ }
+
+ if (is_invalid_opcode(intr_info)) {
+ er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
+ if (er != EMULATE_DONE)
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ error_code = 0;
+ rip = kvm_rip_read(vcpu);
+ if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
+ error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ if (is_page_fault(intr_info)) {
+ /* EPT won't cause page fault directly */
+ if (enable_ept)
+ BUG();
+ cr2 = vmcs_readl(EXIT_QUALIFICATION);
+ trace_kvm_page_fault(cr2, error_code);
+
+ if (kvm_event_needs_reinjection(vcpu))
+ kvm_mmu_unprotect_page_virt(vcpu, cr2);
+ return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
+ }
+
+ if (vmx->rmode.vm86_active &&
+ handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
+ error_code)) {
+ if (vcpu->arch.halt_request) {
+ vcpu->arch.halt_request = 0;
+ return kvm_emulate_halt(vcpu);
+ }
+ return 1;
+ }
+
+ ex_no = intr_info & INTR_INFO_VECTOR_MASK;
+ switch (ex_no) {
+ case DB_VECTOR:
+ dr6 = vmcs_readl(EXIT_QUALIFICATION);
+ if (!(vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
+ vcpu->arch.dr6 = dr6 | DR6_FIXED_1;
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ return 1;
+ }
+ kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
+ kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
+ /* fall through */
+ case BP_VECTOR:
+ /*
+ * Update instruction length as we may reinject #BP from
+ * user space while in guest debugging mode. Reading it for
+ * #DB as well causes no harm, it is not used in that case.
+ */
+ vmx->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
+ kvm_run->debug.arch.exception = ex_no;
+ break;
+ default:
+ kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
+ kvm_run->ex.exception = ex_no;
+ kvm_run->ex.error_code = error_code;
+ break;
+ }
+ return 0;
+}
+
+static int handle_external_interrupt(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.irq_exits;
+ return 1;
+}
+
+static int handle_triple_fault(struct kvm_vcpu *vcpu)
+{
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ return 0;
+}
+
+static int handle_io(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+ int size, in, string;
+ unsigned port;
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ string = (exit_qualification & 16) != 0;
+ in = (exit_qualification & 8) != 0;
+
+ ++vcpu->stat.io_exits;
+
+ if (string || in)
+ return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+
+ port = exit_qualification >> 16;
+ size = (exit_qualification & 7) + 1;
+ skip_emulated_instruction(vcpu);
+
+ return kvm_fast_pio_out(vcpu, size, port);
+}
+
+static void
+vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+{
+ /*
+ * Patch in the VMCALL instruction:
+ */
+ hypercall[0] = 0x0f;
+ hypercall[1] = 0x01;
+ hypercall[2] = 0xc1;
+}
+
+static int handle_cr(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification, val;
+ int cr;
+ int reg;
+ int err;
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ cr = exit_qualification & 15;
+ reg = (exit_qualification >> 8) & 15;
+ switch ((exit_qualification >> 4) & 3) {
+ case 0: /* mov to cr */
+ val = kvm_register_read(vcpu, reg);
+ trace_kvm_cr_write(cr, val);
+ switch (cr) {
+ case 0:
+ err = kvm_set_cr0(vcpu, val);
+ kvm_complete_insn_gp(vcpu, err);
+ return 1;
+ case 3:
+ err = kvm_set_cr3(vcpu, val);
+ kvm_complete_insn_gp(vcpu, err);
+ return 1;
+ case 4:
+ err = kvm_set_cr4(vcpu, val);
+ kvm_complete_insn_gp(vcpu, err);
+ return 1;
+ case 8: {
+ u8 cr8_prev = kvm_get_cr8(vcpu);
+ u8 cr8 = kvm_register_read(vcpu, reg);
+ err = kvm_set_cr8(vcpu, cr8);
+ kvm_complete_insn_gp(vcpu, err);
+ if (irqchip_in_kernel(vcpu->kvm))
+ return 1;
+ if (cr8_prev <= cr8)
+ return 1;
+ vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
+ return 0;
+ }
+ };
+ break;
+ case 2: /* clts */
+ vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+ trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
+ skip_emulated_instruction(vcpu);
+ vmx_fpu_activate(vcpu);
+ return 1;
+ case 1: /*mov from cr*/
+ switch (cr) {
+ case 3:
+ val = kvm_read_cr3(vcpu);
+ kvm_register_write(vcpu, reg, val);
+ trace_kvm_cr_read(cr, val);
+ skip_emulated_instruction(vcpu);
+ return 1;
+ case 8:
+ val = kvm_get_cr8(vcpu);
+ kvm_register_write(vcpu, reg, val);
+ trace_kvm_cr_read(cr, val);
+ skip_emulated_instruction(vcpu);
+ return 1;
+ }
+ break;
+ case 3: /* lmsw */
+ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
+ trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
+ kvm_lmsw(vcpu, val);
+
+ skip_emulated_instruction(vcpu);
+ return 1;
+ default:
+ break;
+ }
+ vcpu->run->exit_reason = 0;
+ pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
+ (int)(exit_qualification >> 4) & 3, cr);
+ return 0;
+}
+
+static int handle_dr(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+ int dr, reg;
+
+ /* Do not handle if the CPL > 0, will trigger GP on re-entry */
+ if (!kvm_require_cpl(vcpu, 0))
+ return 1;
+ dr = vmcs_readl(GUEST_DR7);
+ if (dr & DR7_GD) {
+ /*
+ * As the vm-exit takes precedence over the debug trap, we
+ * need to emulate the latter, either for the host or the
+ * guest debugging itself.
+ */
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
+ vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
+ vcpu->run->debug.arch.dr7 = dr;
+ vcpu->run->debug.arch.pc =
+ vmcs_readl(GUEST_CS_BASE) +
+ vmcs_readl(GUEST_RIP);
+ vcpu->run->debug.arch.exception = DB_VECTOR;
+ vcpu->run->exit_reason = KVM_EXIT_DEBUG;
+ return 0;
+ } else {
+ vcpu->arch.dr7 &= ~DR7_GD;
+ vcpu->arch.dr6 |= DR6_BD;
+ vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ return 1;
+ }
+ }
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
+ reg = DEBUG_REG_ACCESS_REG(exit_qualification);
+ if (exit_qualification & TYPE_MOV_FROM_DR) {
+ unsigned long val;
+ if (!kvm_get_dr(vcpu, dr, &val))
+ kvm_register_write(vcpu, reg, val);
+ } else
+ kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]);
+ skip_emulated_instruction(vcpu);
+ return 1;
+}
+
+static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ vmcs_writel(GUEST_DR7, val);
+}
+
+static int handle_cpuid(struct kvm_vcpu *vcpu)
+{
+ kvm_emulate_cpuid(vcpu);
+ return 1;
+}
+
+static int handle_rdmsr(struct kvm_vcpu *vcpu)
+{
+ u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
+ u64 data;
+
+ if (vmx_get_msr(vcpu, ecx, &data)) {
+ trace_kvm_msr_read_ex(ecx);
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ trace_kvm_msr_read(ecx, data);
+
+ /* FIXME: handling of bits 32:63 of rax, rdx */
+ vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
+ vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
+ skip_emulated_instruction(vcpu);
+ return 1;
+}
+
+static int handle_wrmsr(struct kvm_vcpu *vcpu)
+{
+ u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
+ u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
+ | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
+
+ if (vmx_set_msr(vcpu, ecx, data) != 0) {
+ trace_kvm_msr_write_ex(ecx, data);
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ trace_kvm_msr_write(ecx, data);
+ skip_emulated_instruction(vcpu);
+ return 1;
+}
+
+static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
+{
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return 1;
+}
+
+static int handle_interrupt_window(struct kvm_vcpu *vcpu)
+{
+ u32 cpu_based_vm_exec_control;
+
+ /* clear pending irq */
+ cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ ++vcpu->stat.irq_window_exits;
+
+ /*
+ * If the user space waits to inject interrupts, exit as soon as
+ * possible
+ */
+ if (!irqchip_in_kernel(vcpu->kvm) &&
+ vcpu->run->request_interrupt_window &&
+ !kvm_cpu_has_interrupt(vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
+ return 0;
+ }
+ return 1;
+}
+
+static int handle_halt(struct kvm_vcpu *vcpu)
+{
+ skip_emulated_instruction(vcpu);
+ return kvm_emulate_halt(vcpu);
+}
+
+static int handle_vmcall(struct kvm_vcpu *vcpu)
+{
+ skip_emulated_instruction(vcpu);
+ kvm_emulate_hypercall(vcpu);
+ return 1;
+}
+
+static int handle_vmx_insn(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
+static int handle_invd(struct kvm_vcpu *vcpu)
+{
+ return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+}
+
+static int handle_invlpg(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ kvm_mmu_invlpg(vcpu, exit_qualification);
+ skip_emulated_instruction(vcpu);
+ return 1;
+}
+
+static int handle_wbinvd(struct kvm_vcpu *vcpu)
+{
+ skip_emulated_instruction(vcpu);
+ kvm_emulate_wbinvd(vcpu);
+ return 1;
+}
+
+static int handle_xsetbv(struct kvm_vcpu *vcpu)
+{
+ u64 new_bv = kvm_read_edx_eax(vcpu);
+ u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
+
+ if (kvm_set_xcr(vcpu, index, new_bv) == 0)
+ skip_emulated_instruction(vcpu);
+ return 1;
+}
+
+static int handle_apic_access(struct kvm_vcpu *vcpu)
+{
+ return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+}
+
+static int handle_task_switch(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long exit_qualification;
+ bool has_error_code = false;
+ u32 error_code = 0;
+ u16 tss_selector;
+ int reason, type, idt_v;
+
+ idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+ type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ reason = (u32)exit_qualification >> 30;
+ if (reason == TASK_SWITCH_GATE && idt_v) {
+ switch (type) {
+ case INTR_TYPE_NMI_INTR:
+ vcpu->arch.nmi_injected = false;
+ if (cpu_has_virtual_nmis())
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ break;
+ case INTR_TYPE_EXT_INTR:
+ case INTR_TYPE_SOFT_INTR:
+ kvm_clear_interrupt_queue(vcpu);
+ break;
+ case INTR_TYPE_HARD_EXCEPTION:
+ if (vmx->idt_vectoring_info &
+ VECTORING_INFO_DELIVER_CODE_MASK) {
+ has_error_code = true;
+ error_code =
+ vmcs_read32(IDT_VECTORING_ERROR_CODE);
+ }
+ /* fall through */
+ case INTR_TYPE_SOFT_EXCEPTION:
+ kvm_clear_exception_queue(vcpu);
+ break;
+ default:
+ break;
+ }
+ }
+ tss_selector = exit_qualification;
+
+ if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
+ type != INTR_TYPE_EXT_INTR &&
+ type != INTR_TYPE_NMI_INTR))
+ skip_emulated_instruction(vcpu);
+
+ if (kvm_task_switch(vcpu, tss_selector, reason,
+ has_error_code, error_code) == EMULATE_FAIL) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return 0;
+ }
+
+ /* clear all local breakpoint enable flags */
+ vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
+
+ /*
+ * TODO: What about debug traps on tss switch?
+ * Are we supposed to inject them and update dr6?
+ */
+
+ return 1;
+}
+
+static int handle_ept_violation(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+ gpa_t gpa;
+ int gla_validity;
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ if (exit_qualification & (1 << 6)) {
+ printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
+ return -EINVAL;
+ }
+
+ gla_validity = (exit_qualification >> 7) & 0x3;
+ if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
+ printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
+ printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
+ (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
+ vmcs_readl(GUEST_LINEAR_ADDRESS));
+ printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
+ (long unsigned int)exit_qualification);
+ vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+ vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
+ return 0;
+ }
+
+ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+ trace_kvm_page_fault(gpa, exit_qualification);
+ return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
+}
+
+static u64 ept_rsvd_mask(u64 spte, int level)
+{
+ int i;
+ u64 mask = 0;
+
+ for (i = 51; i > kvm_x86_phys_bits; i--)
+ mask |= (1ULL << i);
+
+ if (level > 2)
+ /* bits 7:3 reserved */
+ mask |= 0xf8;
+ else if (level == 2) {
+ if (spte & (1ULL << 7))
+ /* 2MB ref, bits 20:12 reserved */
+ mask |= 0x1ff000;
+ else
+ /* bits 6:3 reserved */
+ mask |= 0x78;
+ }
+
+ return mask;
+}
+
+static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
+ int level)
+{
+ printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level);
+
+ /* 010b (write-only) */
+ WARN_ON((spte & 0x7) == 0x2);
+
+ /* 110b (write/execute) */
+ WARN_ON((spte & 0x7) == 0x6);
+
+ /* 100b (execute-only) and value not supported by logical processor */
+ if (!cpu_has_vmx_ept_execute_only())
+ WARN_ON((spte & 0x7) == 0x4);
+
+ /* not 000b */
+ if ((spte & 0x7)) {
+ u64 rsvd_bits = spte & ept_rsvd_mask(spte, level);
+
+ if (rsvd_bits != 0) {
+ printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n",
+ __func__, rsvd_bits);
+ WARN_ON(1);
+ }
+
+ if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) {
+ u64 ept_mem_type = (spte & 0x38) >> 3;
+
+ if (ept_mem_type == 2 || ept_mem_type == 3 ||
+ ept_mem_type == 7) {
+ printk(KERN_ERR "%s: ept_mem_type=0x%llx\n",
+ __func__, ept_mem_type);
+ WARN_ON(1);
+ }
+ }
+ }
+}
+
+static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
+{
+ u64 sptes[4];
+ int nr_sptes, i;
+ gpa_t gpa;
+
+ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+
+ printk(KERN_ERR "EPT: Misconfiguration.\n");
+ printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
+
+ nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes);
+
+ for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
+ ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
+
+ vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+ vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
+
+ return 0;
+}
+
+static int handle_nmi_window(struct kvm_vcpu *vcpu)
+{
+ u32 cpu_based_vm_exec_control;
+
+ /* clear pending NMI */
+ cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+ ++vcpu->stat.nmi_window_exits;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ return 1;
+}
+
+static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ enum emulation_result err = EMULATE_DONE;
+ int ret = 1;
+ u32 cpu_exec_ctrl;
+ bool intr_window_requested;
+
+ cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
+
+ while (!guest_state_valid(vcpu)) {
+ if (intr_window_requested
+ && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
+ return handle_interrupt_window(&vmx->vcpu);
+
+ err = emulate_instruction(vcpu, 0);
+
+ if (err == EMULATE_DO_MMIO) {
+ ret = 0;
+ goto out;
+ }
+
+ if (err != EMULATE_DONE)
+ return 0;
+
+ if (signal_pending(current))
+ goto out;
+ if (need_resched())
+ schedule();
+ }
+
+ vmx->emulation_required = 0;
+out:
+ return ret;
+}
+
+/*
+ * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
+ * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
+ */
+static int handle_pause(struct kvm_vcpu *vcpu)
+{
+ skip_emulated_instruction(vcpu);
+ kvm_vcpu_on_spin(vcpu);
+
+ return 1;
+}
+
+static int handle_invalid_op(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
+/*
+ * The exit handlers return 1 if the exit was handled fully and guest execution
+ * may resume. Otherwise they set the kvm_run parameter to indicate what needs
+ * to be done to userspace and return 0.
+ */
+static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
+ [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
+ [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
+ [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
+ [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
+ [EXIT_REASON_IO_INSTRUCTION] = handle_io,
+ [EXIT_REASON_CR_ACCESS] = handle_cr,
+ [EXIT_REASON_DR_ACCESS] = handle_dr,
+ [EXIT_REASON_CPUID] = handle_cpuid,
+ [EXIT_REASON_MSR_READ] = handle_rdmsr,
+ [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
+ [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
+ [EXIT_REASON_HLT] = handle_halt,
+ [EXIT_REASON_INVD] = handle_invd,
+ [EXIT_REASON_INVLPG] = handle_invlpg,
+ [EXIT_REASON_VMCALL] = handle_vmcall,
+ [EXIT_REASON_VMCLEAR] = handle_vmx_insn,
+ [EXIT_REASON_VMLAUNCH] = handle_vmx_insn,
+ [EXIT_REASON_VMPTRLD] = handle_vmx_insn,
+ [EXIT_REASON_VMPTRST] = handle_vmx_insn,
+ [EXIT_REASON_VMREAD] = handle_vmx_insn,
+ [EXIT_REASON_VMRESUME] = handle_vmx_insn,
+ [EXIT_REASON_VMWRITE] = handle_vmx_insn,
+ [EXIT_REASON_VMOFF] = handle_vmx_insn,
+ [EXIT_REASON_VMON] = handle_vmx_insn,
+ [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
+ [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
+ [EXIT_REASON_WBINVD] = handle_wbinvd,
+ [EXIT_REASON_XSETBV] = handle_xsetbv,
+ [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
+ [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
+ [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
+ [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
+ [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
+ [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op,
+ [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
+};
+
+static const int kvm_vmx_max_exit_handlers =
+ ARRAY_SIZE(kvm_vmx_exit_handlers);
+
+static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+{
+ *info1 = vmcs_readl(EXIT_QUALIFICATION);
+ *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
+}
+
+/*
+ * The guest has exited. See if we can fix it or if we need userspace
+ * assistance.
+ */
+static int vmx_handle_exit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 exit_reason = vmx->exit_reason;
+ u32 vectoring_info = vmx->idt_vectoring_info;
+
+ trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
+
+ /* If guest state is invalid, start emulating */
+ if (vmx->emulation_required && emulate_invalid_guest_state)
+ return handle_invalid_guest_state(vcpu);
+
+ if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
+ vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ vcpu->run->fail_entry.hardware_entry_failure_reason
+ = exit_reason;
+ return 0;
+ }
+
+ if (unlikely(vmx->fail)) {
+ vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ vcpu->run->fail_entry.hardware_entry_failure_reason
+ = vmcs_read32(VM_INSTRUCTION_ERROR);
+ return 0;
+ }
+
+ if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
+ exit_reason != EXIT_REASON_EPT_VIOLATION &&
+ exit_reason != EXIT_REASON_TASK_SWITCH))
+ printk(KERN_WARNING "%s: unexpected, valid vectoring info "
+ "(0x%x) and exit reason is 0x%x\n",
+ __func__, vectoring_info, exit_reason);
+
+ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
+ if (vmx_interrupt_allowed(vcpu)) {
+ vmx->soft_vnmi_blocked = 0;
+ } else if (vmx->vnmi_blocked_time > 1000000000LL &&
+ vcpu->arch.nmi_pending) {
+ /*
+ * This CPU don't support us in finding the end of an
+ * NMI-blocked window if the guest runs with IRQs
+ * disabled. So we pull the trigger after 1 s of
+ * futile waiting, but inform the user about this.
+ */
+ printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
+ "state on VCPU %d after 1 s timeout\n",
+ __func__, vcpu->vcpu_id);
+ vmx->soft_vnmi_blocked = 0;
+ }
+ }
+
+ if (exit_reason < kvm_vmx_max_exit_handlers
+ && kvm_vmx_exit_handlers[exit_reason])
+ return kvm_vmx_exit_handlers[exit_reason](vcpu);
+ else {
+ vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+ vcpu->run->hw.hardware_exit_reason = exit_reason;
+ }
+ return 0;
+}
+
+static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+{
+ if (irr == -1 || tpr < irr) {
+ vmcs_write32(TPR_THRESHOLD, 0);
+ return;
+ }
+
+ vmcs_write32(TPR_THRESHOLD, irr);
+}
+
+static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
+{
+ u32 exit_intr_info = vmx->exit_intr_info;
+
+ /* Handle machine checks before interrupts are enabled */
+ if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
+ || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
+ && is_machine_check(exit_intr_info)))
+ kvm_machine_check();
+
+ /* We need to handle NMIs before interrupts are enabled */
+ if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
+ (exit_intr_info & INTR_INFO_VALID_MASK)) {
+ kvm_before_handle_nmi(&vmx->vcpu);
+ asm("int $2");
+ kvm_after_handle_nmi(&vmx->vcpu);
+ }
+}
+
+static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
+{
+ u32 exit_intr_info = vmx->exit_intr_info;
+ bool unblock_nmi;
+ u8 vector;
+ bool idtv_info_valid;
+
+ idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+
+ if (cpu_has_virtual_nmis()) {
+ unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
+ vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
+ /*
+ * SDM 3: 27.7.1.2 (September 2008)
+ * Re-set bit "block by NMI" before VM entry if vmexit caused by
+ * a guest IRET fault.
+ * SDM 3: 23.2.2 (September 2008)
+ * Bit 12 is undefined in any of the following cases:
+ * If the VM exit sets the valid bit in the IDT-vectoring
+ * information field.
+ * If the VM exit is due to a double fault.
+ */
+ if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
+ vector != DF_VECTOR && !idtv_info_valid)
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ } else if (unlikely(vmx->soft_vnmi_blocked))
+ vmx->vnmi_blocked_time +=
+ ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
+}
+
+static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
+ u32 idt_vectoring_info,
+ int instr_len_field,
+ int error_code_field)
+{
+ u8 vector;
+ int type;
+ bool idtv_info_valid;
+
+ idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+
+ vmx->vcpu.arch.nmi_injected = false;
+ kvm_clear_exception_queue(&vmx->vcpu);
+ kvm_clear_interrupt_queue(&vmx->vcpu);
+
+ if (!idtv_info_valid)
+ return;
+
+ kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
+
+ vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
+ type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
+
+ switch (type) {
+ case INTR_TYPE_NMI_INTR:
+ vmx->vcpu.arch.nmi_injected = true;
+ /*
+ * SDM 3: 27.7.1.2 (September 2008)
+ * Clear bit "block by NMI" before VM entry if a NMI
+ * delivery faulted.
+ */
+ vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ break;
+ case INTR_TYPE_SOFT_EXCEPTION:
+ vmx->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(instr_len_field);
+ /* fall through */
+ case INTR_TYPE_HARD_EXCEPTION:
+ if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
+ u32 err = vmcs_read32(error_code_field);
+ kvm_queue_exception_e(&vmx->vcpu, vector, err);
+ } else
+ kvm_queue_exception(&vmx->vcpu, vector);
+ break;
+ case INTR_TYPE_SOFT_INTR:
+ vmx->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(instr_len_field);
+ /* fall through */
+ case INTR_TYPE_EXT_INTR:
+ kvm_queue_interrupt(&vmx->vcpu, vector,
+ type == INTR_TYPE_SOFT_INTR);
+ break;
+ default:
+ break;
+ }
+}
+
+static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
+{
+ __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
+ VM_EXIT_INSTRUCTION_LEN,
+ IDT_VECTORING_ERROR_CODE);
+}
+
+static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
+{
+ __vmx_complete_interrupts(to_vmx(vcpu),
+ vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
+ VM_ENTRY_INSTRUCTION_LEN,
+ VM_ENTRY_EXCEPTION_ERROR_CODE);
+
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
+}
+
+#ifdef CONFIG_X86_64
+#define R "r"
+#define Q "q"
+#else
+#define R "e"
+#define Q "l"
+#endif
+
+static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /* Record the guest's net vcpu time for enforced NMI injections. */
+ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
+ vmx->entry_time = ktime_get();
+
+ /* Don't enter VMX if guest state is invalid, let the exit handler
+ start emulation until we arrive back to a valid state */
+ if (vmx->emulation_required && emulate_invalid_guest_state)
+ return;
+
+ if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
+ vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+ if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+ vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+
+ /* When single-stepping over STI and MOV SS, we must clear the
+ * corresponding interruptibility bits in the guest state. Otherwise
+ * vmentry fails as it then expects bit 14 (BS) in pending debug
+ * exceptions being set, but that's not correct for the guest debugging
+ * case. */
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ vmx_set_interrupt_shadow(vcpu, 0);
+
+ asm(
+ /* Store host registers */
+ "push %%"R"dx; push %%"R"bp;"
+ "push %%"R"cx \n\t"
+ "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
+ "je 1f \n\t"
+ "mov %%"R"sp, %c[host_rsp](%0) \n\t"
+ __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
+ "1: \n\t"
+ /* Reload cr2 if changed */
+ "mov %c[cr2](%0), %%"R"ax \n\t"
+ "mov %%cr2, %%"R"dx \n\t"
+ "cmp %%"R"ax, %%"R"dx \n\t"
+ "je 2f \n\t"
+ "mov %%"R"ax, %%cr2 \n\t"
+ "2: \n\t"
+ /* Check if vmlaunch of vmresume is needed */
+ "cmpl $0, %c[launched](%0) \n\t"
+ /* Load guest registers. Don't clobber flags. */
+ "mov %c[rax](%0), %%"R"ax \n\t"
+ "mov %c[rbx](%0), %%"R"bx \n\t"
+ "mov %c[rdx](%0), %%"R"dx \n\t"
+ "mov %c[rsi](%0), %%"R"si \n\t"
+ "mov %c[rdi](%0), %%"R"di \n\t"
+ "mov %c[rbp](%0), %%"R"bp \n\t"
+#ifdef CONFIG_X86_64
+ "mov %c[r8](%0), %%r8 \n\t"
+ "mov %c[r9](%0), %%r9 \n\t"
+ "mov %c[r10](%0), %%r10 \n\t"
+ "mov %c[r11](%0), %%r11 \n\t"
+ "mov %c[r12](%0), %%r12 \n\t"
+ "mov %c[r13](%0), %%r13 \n\t"
+ "mov %c[r14](%0), %%r14 \n\t"
+ "mov %c[r15](%0), %%r15 \n\t"
+#endif
+ "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */
+
+ /* Enter guest mode */
+ "jne .Llaunched \n\t"
+ __ex(ASM_VMX_VMLAUNCH) "\n\t"
+ "jmp .Lkvm_vmx_return \n\t"
+ ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
+ ".Lkvm_vmx_return: "
+ /* Save guest registers, load host registers, keep flags */
+ "xchg %0, (%%"R"sp) \n\t"
+ "mov %%"R"ax, %c[rax](%0) \n\t"
+ "mov %%"R"bx, %c[rbx](%0) \n\t"
+ "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t"
+ "mov %%"R"dx, %c[rdx](%0) \n\t"
+ "mov %%"R"si, %c[rsi](%0) \n\t"
+ "mov %%"R"di, %c[rdi](%0) \n\t"
+ "mov %%"R"bp, %c[rbp](%0) \n\t"
+#ifdef CONFIG_X86_64
+ "mov %%r8, %c[r8](%0) \n\t"
+ "mov %%r9, %c[r9](%0) \n\t"
+ "mov %%r10, %c[r10](%0) \n\t"
+ "mov %%r11, %c[r11](%0) \n\t"
+ "mov %%r12, %c[r12](%0) \n\t"
+ "mov %%r13, %c[r13](%0) \n\t"
+ "mov %%r14, %c[r14](%0) \n\t"
+ "mov %%r15, %c[r15](%0) \n\t"
+#endif
+ "mov %%cr2, %%"R"ax \n\t"
+ "mov %%"R"ax, %c[cr2](%0) \n\t"
+
+ "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t"
+ "setbe %c[fail](%0) \n\t"
+ : : "c"(vmx), "d"((unsigned long)HOST_RSP),
+ [launched]"i"(offsetof(struct vcpu_vmx, launched)),
+ [fail]"i"(offsetof(struct vcpu_vmx, fail)),
+ [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
+ [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
+ [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
+ [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
+ [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
+ [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
+ [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
+ [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
+#ifdef CONFIG_X86_64
+ [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
+ [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
+ [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
+ [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
+ [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
+ [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
+ [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
+ [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
+#endif
+ [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
+ : "cc", "memory"
+ , R"ax", R"bx", R"di", R"si"
+#ifdef CONFIG_X86_64
+ , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
+#endif
+ );
+
+ vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
+ | (1 << VCPU_EXREG_PDPTR)
+ | (1 << VCPU_EXREG_CR3));
+ vcpu->arch.regs_dirty = 0;
+
+ vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+
+ asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
+ vmx->launched = 1;
+
+ vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+ vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ vmx_complete_atomic_exit(vmx);
+ vmx_recover_nmi_blocking(vmx);
+ vmx_complete_interrupts(vmx);
+}
+
+#undef R
+#undef Q
+
+static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (vmx->vmcs) {
+ vcpu_clear(vmx);
+ free_vmcs(vmx->vmcs);
+ vmx->vmcs = NULL;
+ }
+}
+
+static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ free_vpid(vmx);
+ vmx_free_vmcs(vcpu);
+ kfree(vmx->guest_msrs);
+ kvm_vcpu_uninit(vcpu);
+ kmem_cache_free(kvm_vcpu_cache, vmx);
+}
+
+static inline void vmcs_init(struct vmcs *vmcs)
+{
+ u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
+
+ if (!vmm_exclusive)
+ kvm_cpu_vmxon(phys_addr);
+
+ vmcs_clear(vmcs);
+
+ if (!vmm_exclusive)
+ kvm_cpu_vmxoff();
+}
+
+static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
+{
+ int err;
+ struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+ int cpu;
+
+ if (!vmx)
+ return ERR_PTR(-ENOMEM);
+
+ allocate_vpid(vmx);
+
+ err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
+ if (err)
+ goto free_vcpu;
+
+ vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!vmx->guest_msrs) {
+ err = -ENOMEM;
+ goto uninit_vcpu;
+ }
+
+ vmx->vmcs = alloc_vmcs();
+ if (!vmx->vmcs)
+ goto free_msrs;
+
+ vmcs_init(vmx->vmcs);
+
+ cpu = get_cpu();
+ vmx_vcpu_load(&vmx->vcpu, cpu);
+ vmx->vcpu.cpu = cpu;
+ err = vmx_vcpu_setup(vmx);
+ vmx_vcpu_put(&vmx->vcpu);
+ put_cpu();
+ if (err)
+ goto free_vmcs;
+ if (vm_need_virtualize_apic_accesses(kvm))
+ if (alloc_apic_access_page(kvm) != 0)
+ goto free_vmcs;
+
+ if (enable_ept) {
+ if (!kvm->arch.ept_identity_map_addr)
+ kvm->arch.ept_identity_map_addr =
+ VMX_EPT_IDENTITY_PAGETABLE_ADDR;
+ if (alloc_identity_pagetable(kvm) != 0)
+ goto free_vmcs;
+ }
+
+ return &vmx->vcpu;
+
+free_vmcs:
+ free_vmcs(vmx->vmcs);
+free_msrs:
+ kfree(vmx->guest_msrs);
+uninit_vcpu:
+ kvm_vcpu_uninit(&vmx->vcpu);
+free_vcpu:
+ free_vpid(vmx);
+ kmem_cache_free(kvm_vcpu_cache, vmx);
+ return ERR_PTR(err);
+}
+
+static void __init vmx_check_processor_compat(void *rtn)
+{
+ struct vmcs_config vmcs_conf;
+
+ *(int *)rtn = 0;
+ if (setup_vmcs_config(&vmcs_conf) < 0)
+ *(int *)rtn = -EIO;
+ if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
+ printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
+ smp_processor_id());
+ *(int *)rtn = -EIO;
+ }
+}
+
+static int get_ept_level(void)
+{
+ return VMX_EPT_DEFAULT_GAW + 1;
+}
+
+static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+{
+ u64 ret;
+
+ /* For VT-d and EPT combination
+ * 1. MMIO: always map as UC
+ * 2. EPT with VT-d:
+ * a. VT-d without snooping control feature: can't guarantee the
+ * result, try to trust guest.
+ * b. VT-d with snooping control feature: snooping control feature of
+ * VT-d engine can guarantee the cache correctness. Just set it
+ * to WB to keep consistent with host. So the same as item 3.
+ * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
+ * consistent with host MTRR
+ */
+ if (is_mmio)
+ ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
+ else if (vcpu->kvm->arch.iommu_domain &&
+ !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY))
+ ret = kvm_get_guest_memory_type(vcpu, gfn) <<
+ VMX_EPT_MT_EPTE_SHIFT;
+ else
+ ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
+ | VMX_EPT_IPAT_BIT;
+
+ return ret;
+}
+
+#define _ER(x) { EXIT_REASON_##x, #x }
+
+static const struct trace_print_flags vmx_exit_reasons_str[] = {
+ _ER(EXCEPTION_NMI),
+ _ER(EXTERNAL_INTERRUPT),
+ _ER(TRIPLE_FAULT),
+ _ER(PENDING_INTERRUPT),
+ _ER(NMI_WINDOW),
+ _ER(TASK_SWITCH),
+ _ER(CPUID),
+ _ER(HLT),
+ _ER(INVLPG),
+ _ER(RDPMC),
+ _ER(RDTSC),
+ _ER(VMCALL),
+ _ER(VMCLEAR),
+ _ER(VMLAUNCH),
+ _ER(VMPTRLD),
+ _ER(VMPTRST),
+ _ER(VMREAD),
+ _ER(VMRESUME),
+ _ER(VMWRITE),
+ _ER(VMOFF),
+ _ER(VMON),
+ _ER(CR_ACCESS),
+ _ER(DR_ACCESS),
+ _ER(IO_INSTRUCTION),
+ _ER(MSR_READ),
+ _ER(MSR_WRITE),
+ _ER(MWAIT_INSTRUCTION),
+ _ER(MONITOR_INSTRUCTION),
+ _ER(PAUSE_INSTRUCTION),
+ _ER(MCE_DURING_VMENTRY),
+ _ER(TPR_BELOW_THRESHOLD),
+ _ER(APIC_ACCESS),
+ _ER(EPT_VIOLATION),
+ _ER(EPT_MISCONFIG),
+ _ER(WBINVD),
+ { -1, NULL }
+};
+
+#undef _ER
+
+static int vmx_get_lpage_level(void)
+{
+ if (enable_ept && !cpu_has_vmx_ept_1g_page())
+ return PT_DIRECTORY_LEVEL;
+ else
+ /* For shadow and EPT supported 1GB page */
+ return PT_PDPE_LEVEL;
+}
+
+static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 exec_control;
+
+ vmx->rdtscp_enabled = false;
+ if (vmx_rdtscp_supported()) {
+ exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ if (exec_control & SECONDARY_EXEC_RDTSCP) {
+ best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+ if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
+ vmx->rdtscp_enabled = true;
+ else {
+ exec_control &= ~SECONDARY_EXEC_RDTSCP;
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+ exec_control);
+ }
+ }
+ }
+}
+
+static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
+{
+}
+
+static struct kvm_x86_ops vmx_x86_ops = {
+ .cpu_has_kvm_support = cpu_has_kvm_support,
+ .disabled_by_bios = vmx_disabled_by_bios,
+ .hardware_setup = hardware_setup,
+ .hardware_unsetup = hardware_unsetup,
+ .check_processor_compatibility = vmx_check_processor_compat,
+ .hardware_enable = hardware_enable,
+ .hardware_disable = hardware_disable,
+ .cpu_has_accelerated_tpr = report_flexpriority,
+
+ .vcpu_create = vmx_create_vcpu,
+ .vcpu_free = vmx_free_vcpu,
+ .vcpu_reset = vmx_vcpu_reset,
+
+ .prepare_guest_switch = vmx_save_host_state,
+ .vcpu_load = vmx_vcpu_load,
+ .vcpu_put = vmx_vcpu_put,
+
+ .set_guest_debug = set_guest_debug,
+ .get_msr = vmx_get_msr,
+ .set_msr = vmx_set_msr,
+ .get_segment_base = vmx_get_segment_base,
+ .get_segment = vmx_get_segment,
+ .set_segment = vmx_set_segment,
+ .get_cpl = vmx_get_cpl,
+ .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
+ .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
+ .decache_cr3 = vmx_decache_cr3,
+ .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
+ .set_cr0 = vmx_set_cr0,
+ .set_cr3 = vmx_set_cr3,
+ .set_cr4 = vmx_set_cr4,
+ .set_efer = vmx_set_efer,
+ .get_idt = vmx_get_idt,
+ .set_idt = vmx_set_idt,
+ .get_gdt = vmx_get_gdt,
+ .set_gdt = vmx_set_gdt,
+ .set_dr7 = vmx_set_dr7,
+ .cache_reg = vmx_cache_reg,
+ .get_rflags = vmx_get_rflags,
+ .set_rflags = vmx_set_rflags,
+ .fpu_activate = vmx_fpu_activate,
+ .fpu_deactivate = vmx_fpu_deactivate,
+
+ .tlb_flush = vmx_flush_tlb,
+
+ .run = vmx_vcpu_run,
+ .handle_exit = vmx_handle_exit,
+ .skip_emulated_instruction = skip_emulated_instruction,
+ .set_interrupt_shadow = vmx_set_interrupt_shadow,
+ .get_interrupt_shadow = vmx_get_interrupt_shadow,
+ .patch_hypercall = vmx_patch_hypercall,
+ .set_irq = vmx_inject_irq,
+ .set_nmi = vmx_inject_nmi,
+ .queue_exception = vmx_queue_exception,
+ .cancel_injection = vmx_cancel_injection,
+ .interrupt_allowed = vmx_interrupt_allowed,
+ .nmi_allowed = vmx_nmi_allowed,
+ .get_nmi_mask = vmx_get_nmi_mask,
+ .set_nmi_mask = vmx_set_nmi_mask,
+ .enable_nmi_window = enable_nmi_window,
+ .enable_irq_window = enable_irq_window,
+ .update_cr8_intercept = update_cr8_intercept,
+
+ .set_tss_addr = vmx_set_tss_addr,
+ .get_tdp_level = get_ept_level,
+ .get_mt_mask = vmx_get_mt_mask,
+
+ .get_exit_info = vmx_get_exit_info,
+ .exit_reasons_str = vmx_exit_reasons_str,
+
+ .get_lpage_level = vmx_get_lpage_level,
+
+ .cpuid_update = vmx_cpuid_update,
+
+ .rdtscp_supported = vmx_rdtscp_supported,
+
+ .set_supported_cpuid = vmx_set_supported_cpuid,
+
+ .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
+
+ .write_tsc_offset = vmx_write_tsc_offset,
+ .adjust_tsc_offset = vmx_adjust_tsc_offset,
+
+ .set_tdp_cr3 = vmx_set_cr3,
+};
+
+static int __init vmx_init(void)
+{
+ int r, i;
+
+ rdmsrl_safe(MSR_EFER, &host_efer);
+
+ for (i = 0; i < NR_VMX_MSR; ++i)
+ kvm_define_shared_msr(i, vmx_msr_index[i]);
+
+ vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_io_bitmap_a)
+ return -ENOMEM;
+
+ vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_io_bitmap_b) {
+ r = -ENOMEM;
+ goto out;
+ }
+
+ vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_legacy) {
+ r = -ENOMEM;
+ goto out1;
+ }
+
+ vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_longmode) {
+ r = -ENOMEM;
+ goto out2;
+ }
+
+ /*
+ * Allow direct access to the PC debug port (it is often used for I/O
+ * delays, but the vmexits simply slow things down).
+ */
+ memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
+ clear_bit(0x80, vmx_io_bitmap_a);
+
+ memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
+
+ memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
+ memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
+
+ set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
+
+ r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+ __alignof__(struct vcpu_vmx), THIS_MODULE);
+ if (r)
+ goto out3;
+
+ vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
+ vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
+ vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
+ vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
+ vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
+ vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
+
+ if (enable_ept) {
+ bypass_guest_pf = 0;
+ kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
+ VMX_EPT_EXECUTABLE_MASK);
+ kvm_enable_tdp();
+ } else
+ kvm_disable_tdp();
+
+ if (bypass_guest_pf)
+ kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
+
+ return 0;
+
+out3:
+ free_page((unsigned long)vmx_msr_bitmap_longmode);
+out2:
+ free_page((unsigned long)vmx_msr_bitmap_legacy);
+out1:
+ free_page((unsigned long)vmx_io_bitmap_b);
+out:
+ free_page((unsigned long)vmx_io_bitmap_a);
+ return r;
+}
+
+static void __exit vmx_exit(void)
+{
+ free_page((unsigned long)vmx_msr_bitmap_legacy);
+ free_page((unsigned long)vmx_msr_bitmap_longmode);
+ free_page((unsigned long)vmx_io_bitmap_b);
+ free_page((unsigned long)vmx_io_bitmap_a);
+
+ kvm_exit();
+}
+
+module_init(vmx_init)
+module_exit(vmx_exit)
diff --git a/linux/x86/x86.c b/linux/x86/x86.c
new file mode 100644
index 0000000..a6233d2
--- /dev/null
+++ b/linux/x86/x86.c
@@ -0,0 +1,6404 @@
+#ifndef KVM_UNIFDEF_H
+#define KVM_UNIFDEF_H
+
+#ifdef __i386__
+#ifndef CONFIG_X86_32
+#define CONFIG_X86_32 1
+#endif
+#endif
+
+#ifdef __x86_64__
+#ifndef CONFIG_X86_64
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+#if defined(__i386__) || defined (__x86_64__)
+#ifndef CONFIG_X86
+#define CONFIG_X86 1
+#endif
+#endif
+
+#ifdef __ia64__
+#ifndef CONFIG_IA64
+#define CONFIG_IA64 1
+#endif
+#endif
+
+#ifdef __PPC__
+#ifndef CONFIG_PPC
+#define CONFIG_PPC 1
+#endif
+#endif
+
+#ifdef __s390__
+#ifndef CONFIG_S390
+#define CONFIG_S390 1
+#endif
+#endif
+
+#endif
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * derived from drivers/kvm/kvm_main.c
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Amit Shah <amit.shah@qumranet.com>
+ * Ben-Ami Yassour <benami@il.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/kvm_host.h>
+#include "irq.h"
+#include "mmu.h"
+#include "i8254.h"
+#include "tss.h"
+#include "kvm_cache_regs.h"
+#include "x86.h"
+
+
+#include <linux/interrupt.h>
+#include <linux/kvm.h>
+#include <linux/fs.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/mman.h>
+#include <linux/highmem.h>
+#include <linux/iommu.h>
+#include <linux/intel-iommu.h>
+#include <linux/cpufreq.h>
+#include <linux/user-return-notifier.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <linux/uaccess.h>
+#include <linux/hash.h>
+#include <trace/events/kvm.h>
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+#include <asm/debugreg.h>
+#include <asm/msr.h>
+#include <asm/desc.h>
+#include <asm/mtrr.h>
+#include <asm/mce.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
+#include <asm/pvclock.h>
+#include <asm/div64.h>
+
+#define MAX_IO_MSRS 256
+#define CR0_RESERVED_BITS \
+ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
+ | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
+ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
+#define CR4_RESERVED_BITS \
+ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
+ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
+ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_OSXSAVE \
+ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
+
+#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
+
+#define KVM_MAX_MCE_BANKS 32
+#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
+
+/* EFER defaults:
+ * - enable syscall per default because its emulated by KVM
+ * - enable LME and LMA per default on 64 bit KVM
+ */
+#ifdef CONFIG_X86_64
+static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
+#else
+static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
+#endif
+
+#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
+#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+
+static void update_cr8_intercept(struct kvm_vcpu *vcpu);
+static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 *entries);
+
+struct kvm_x86_ops *kvm_x86_ops;
+EXPORT_SYMBOL_GPL(kvm_x86_ops);
+
+int ignore_msrs = 0;
+module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
+
+#define KVM_NR_SHARED_MSRS 16
+
+struct kvm_shared_msrs_global {
+ int nr;
+ u32 msrs[KVM_NR_SHARED_MSRS];
+};
+
+struct kvm_shared_msrs {
+ struct kvm_user_return_notifier urn;
+ bool registered;
+ struct kvm_shared_msr_values {
+ u64 host;
+ u64 curr;
+ } values[KVM_NR_SHARED_MSRS];
+};
+
+static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
+static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
+
+struct kvm_stats_debugfs_item debugfs_entries[] = {
+ { "pf_fixed", VCPU_STAT(pf_fixed) },
+ { "pf_guest", VCPU_STAT(pf_guest) },
+ { "tlb_flush", VCPU_STAT(tlb_flush) },
+ { "invlpg", VCPU_STAT(invlpg) },
+ { "exits", VCPU_STAT(exits) },
+ { "io_exits", VCPU_STAT(io_exits) },
+ { "mmio_exits", VCPU_STAT(mmio_exits) },
+ { "signal_exits", VCPU_STAT(signal_exits) },
+ { "irq_window", VCPU_STAT(irq_window_exits) },
+ { "nmi_window", VCPU_STAT(nmi_window_exits) },
+ { "halt_exits", VCPU_STAT(halt_exits) },
+ { "halt_wakeup", VCPU_STAT(halt_wakeup) },
+ { "hypercalls", VCPU_STAT(hypercalls) },
+ { "request_irq", VCPU_STAT(request_irq_exits) },
+ { "irq_exits", VCPU_STAT(irq_exits) },
+ { "host_state_reload", VCPU_STAT(host_state_reload) },
+ { "efer_reload", VCPU_STAT(efer_reload) },
+ { "fpu_reload", VCPU_STAT(fpu_reload) },
+ { "insn_emulation", VCPU_STAT(insn_emulation) },
+ { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
+ { "irq_injections", VCPU_STAT(irq_injections) },
+ { "nmi_injections", VCPU_STAT(nmi_injections) },
+ { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
+ { "mmu_pte_write", VM_STAT(mmu_pte_write) },
+ { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
+ { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
+ { "mmu_flooded", VM_STAT(mmu_flooded) },
+ { "mmu_recycled", VM_STAT(mmu_recycled) },
+ { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
+ { "mmu_unsync", VM_STAT(mmu_unsync) },
+ { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
+ { "largepages", VM_STAT(lpages) },
+ { NULL }
+};
+
+u64 __read_mostly host_xcr0;
+
+static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
+{
+ int i;
+ for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
+ vcpu->arch.apf.gfns[i] = ~0;
+}
+
+static void kvm_on_user_return(struct kvm_user_return_notifier *urn)
+{
+ unsigned slot;
+ struct kvm_shared_msrs *locals
+ = container_of(urn, struct kvm_shared_msrs, urn);
+ struct kvm_shared_msr_values *values;
+
+ for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
+ values = &locals->values[slot];
+ if (values->host != values->curr) {
+ wrmsrl(shared_msrs_global.msrs[slot], values->host);
+ values->curr = values->host;
+ }
+ }
+ locals->registered = false;
+ kvm_user_return_notifier_unregister(urn);
+}
+
+static void shared_msr_update(unsigned slot, u32 msr)
+{
+ struct kvm_shared_msrs *smsr;
+ u64 value;
+
+ smsr = &__get_cpu_var(shared_msrs);
+ /* only read, and nobody should modify it at this time,
+ * so don't need lock */
+ if (slot >= shared_msrs_global.nr) {
+ printk(KERN_ERR "kvm: invalid MSR slot!");
+ return;
+ }
+ rdmsrl_safe(msr, &value);
+ smsr->values[slot].host = value;
+ smsr->values[slot].curr = value;
+}
+
+void kvm_define_shared_msr(unsigned slot, u32 msr)
+{
+ if (slot >= shared_msrs_global.nr)
+ shared_msrs_global.nr = slot + 1;
+ shared_msrs_global.msrs[slot] = msr;
+ /* we need ensured the shared_msr_global have been updated */
+ smp_wmb();
+}
+EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
+
+static void kvm_shared_msr_cpu_online(void)
+{
+ unsigned i;
+
+ for (i = 0; i < shared_msrs_global.nr; ++i)
+ shared_msr_update(i, shared_msrs_global.msrs[i]);
+}
+
+void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
+{
+ struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
+
+ if (((value ^ smsr->values[slot].curr) & mask) == 0)
+ return;
+ smsr->values[slot].curr = value;
+ wrmsrl(shared_msrs_global.msrs[slot], value);
+ if (!smsr->registered) {
+ smsr->urn.on_user_return = kvm_on_user_return;
+ kvm_user_return_notifier_register(&smsr->urn);
+ smsr->registered = true;
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
+
+static void drop_user_return_notifiers(void *ignore)
+{
+ struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
+
+ if (smsr->registered)
+ kvm_on_user_return(&smsr->urn);
+}
+
+u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
+{
+ if (irqchip_in_kernel(vcpu->kvm))
+ return vcpu->arch.apic_base;
+ else
+ return vcpu->arch.apic_base;
+}
+EXPORT_SYMBOL_GPL(kvm_get_apic_base);
+
+void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
+{
+ /* TODO: reserve bits check */
+ if (irqchip_in_kernel(vcpu->kvm))
+ kvm_lapic_set_base(vcpu, data);
+ else
+ vcpu->arch.apic_base = data;
+}
+EXPORT_SYMBOL_GPL(kvm_set_apic_base);
+
+#define EXCPT_BENIGN 0
+#define EXCPT_CONTRIBUTORY 1
+#define EXCPT_PF 2
+
+static int exception_class(int vector)
+{
+ switch (vector) {
+ case PF_VECTOR:
+ return EXCPT_PF;
+ case DE_VECTOR:
+ case TS_VECTOR:
+ case NP_VECTOR:
+ case SS_VECTOR:
+ case GP_VECTOR:
+ return EXCPT_CONTRIBUTORY;
+ default:
+ break;
+ }
+ return EXCPT_BENIGN;
+}
+
+static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
+ unsigned nr, bool has_error, u32 error_code,
+ bool reinject)
+{
+ u32 prev_nr;
+ int class1, class2;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ if (!vcpu->arch.exception.pending) {
+ queue:
+ vcpu->arch.exception.pending = true;
+ vcpu->arch.exception.has_error_code = has_error;
+ vcpu->arch.exception.nr = nr;
+ vcpu->arch.exception.error_code = error_code;
+ vcpu->arch.exception.reinject = reinject;
+ return;
+ }
+
+ /* to check exception */
+ prev_nr = vcpu->arch.exception.nr;
+ if (prev_nr == DF_VECTOR) {
+ /* triple fault -> shutdown */
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return;
+ }
+ class1 = exception_class(prev_nr);
+ class2 = exception_class(nr);
+ if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
+ || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
+ /* generate double fault per SDM Table 5-5 */
+ vcpu->arch.exception.pending = true;
+ vcpu->arch.exception.has_error_code = true;
+ vcpu->arch.exception.nr = DF_VECTOR;
+ vcpu->arch.exception.error_code = 0;
+ } else
+ /* replace previous exception with a new one in a hope
+ that instruction re-execution will regenerate lost
+ exception */
+ goto queue;
+}
+
+void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
+{
+ kvm_multiple_exception(vcpu, nr, false, 0, false);
+}
+EXPORT_SYMBOL_GPL(kvm_queue_exception);
+
+void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
+{
+ kvm_multiple_exception(vcpu, nr, false, 0, true);
+}
+EXPORT_SYMBOL_GPL(kvm_requeue_exception);
+
+void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
+{
+ if (err)
+ kvm_inject_gp(vcpu, 0);
+ else
+ kvm_x86_ops->skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
+
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+{
+ ++vcpu->stat.pf_guest;
+ vcpu->arch.cr2 = fault->address;
+ kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
+}
+
+void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+{
+ if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
+ vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
+ else
+ vcpu->arch.mmu.inject_page_fault(vcpu, fault);
+}
+
+void kvm_inject_nmi(struct kvm_vcpu *vcpu)
+{
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ vcpu->arch.nmi_pending = 1;
+}
+EXPORT_SYMBOL_GPL(kvm_inject_nmi);
+
+void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
+{
+ kvm_multiple_exception(vcpu, nr, true, error_code, false);
+}
+EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
+
+void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
+{
+ kvm_multiple_exception(vcpu, nr, true, error_code, true);
+}
+EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
+
+/*
+ * Checks if cpl <= required_cpl; if true, return true. Otherwise queue
+ * a #GP and return false.
+ */
+bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
+{
+ if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
+ return true;
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return false;
+}
+EXPORT_SYMBOL_GPL(kvm_require_cpl);
+
+/*
+ * This function will be used to read from the physical memory of the currently
+ * running guest. The difference to kvm_read_guest_page is that this function
+ * can read from guest physical or from the guest's guest physical memory.
+ */
+int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gfn_t ngfn, void *data, int offset, int len,
+ u32 access)
+{
+ gfn_t real_gfn;
+ gpa_t ngpa;
+
+ ngpa = gfn_to_gpa(ngfn);
+ real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
+ if (real_gfn == UNMAPPED_GVA)
+ return -EFAULT;
+
+ real_gfn = gpa_to_gfn(real_gfn);
+
+ return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
+
+int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
+ void *data, int offset, int len, u32 access)
+{
+ return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
+ data, offset, len, access);
+}
+
+/*
+ * Load the pae pdptrs. Return true is they are all valid.
+ */
+int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
+{
+ gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
+ unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
+ int i;
+ int ret;
+ u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
+
+ ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
+ offset * sizeof(u64), sizeof(pdpte),
+ PFERR_USER_MASK|PFERR_WRITE_MASK);
+ if (ret < 0) {
+ ret = 0;
+ goto out;
+ }
+ for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
+ if (is_present_gpte(pdpte[i]) &&
+ (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
+ ret = 0;
+ goto out;
+ }
+ }
+ ret = 1;
+
+ memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail);
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_dirty);
+out:
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(load_pdptrs);
+
+static bool pdptrs_changed(struct kvm_vcpu *vcpu)
+{
+ u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
+ bool changed = true;
+ int offset;
+ gfn_t gfn;
+ int r;
+
+ if (is_long_mode(vcpu) || !is_pae(vcpu))
+ return false;
+
+ if (!test_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail))
+ return true;
+
+ gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
+ offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
+ r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
+ PFERR_USER_MASK | PFERR_WRITE_MASK);
+ if (r < 0)
+ goto out;
+ changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
+out:
+
+ return changed;
+}
+
+int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ unsigned long old_cr0 = kvm_read_cr0(vcpu);
+ unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
+ X86_CR0_CD | X86_CR0_NW;
+
+ cr0 |= X86_CR0_ET;
+
+#ifdef CONFIG_X86_64
+ if (cr0 & 0xffffffff00000000UL)
+ return 1;
+#endif
+
+ cr0 &= ~CR0_RESERVED_BITS;
+
+ if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
+ return 1;
+
+ if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
+ return 1;
+
+ if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
+#ifdef CONFIG_X86_64
+ if ((vcpu->arch.efer & EFER_LME)) {
+ int cs_db, cs_l;
+
+ if (!is_pae(vcpu))
+ return 1;
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ if (cs_l)
+ return 1;
+ } else
+#endif
+ if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
+ kvm_read_cr3(vcpu)))
+ return 1;
+ }
+
+ kvm_x86_ops->set_cr0(vcpu, cr0);
+
+ if ((cr0 ^ old_cr0) & X86_CR0_PG)
+ kvm_clear_async_pf_completion_queue(vcpu);
+
+ if ((cr0 ^ old_cr0) & update_bits)
+ kvm_mmu_reset_context(vcpu);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_cr0);
+
+void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
+{
+ (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
+}
+EXPORT_SYMBOL_GPL(kvm_lmsw);
+
+int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+{
+ u64 xcr0;
+
+ /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
+ if (index != XCR_XFEATURE_ENABLED_MASK)
+ return 1;
+ xcr0 = xcr;
+ if (kvm_x86_ops->get_cpl(vcpu) != 0)
+ return 1;
+ if (!(xcr0 & XSTATE_FP))
+ return 1;
+ if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
+ return 1;
+ if (xcr0 & ~host_xcr0)
+ return 1;
+ vcpu->arch.xcr0 = xcr0;
+ vcpu->guest_xcr0_loaded = 0;
+ return 0;
+}
+
+int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+{
+ if (__kvm_set_xcr(vcpu, index, xcr)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_xcr);
+
+static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ return best && (best->ecx & bit(X86_FEATURE_XSAVE));
+}
+
+static void update_cpuid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ if (!best)
+ return;
+
+ /* Update OSXSAVE bit */
+ if (kvm_cpu_has_xsave && best->function == 0x1) {
+ best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
+ best->ecx |= bit(X86_FEATURE_OSXSAVE);
+ }
+}
+
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long old_cr4 = kvm_read_cr4(vcpu);
+ unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
+
+ if (cr4 & CR4_RESERVED_BITS)
+ return 1;
+
+ if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
+ return 1;
+
+ if (is_long_mode(vcpu)) {
+ if (!(cr4 & X86_CR4_PAE))
+ return 1;
+ } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
+ && ((cr4 ^ old_cr4) & pdptr_bits)
+ && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
+ kvm_read_cr3(vcpu)))
+ return 1;
+
+ if (cr4 & X86_CR4_VMXE)
+ return 1;
+
+ kvm_x86_ops->set_cr4(vcpu, cr4);
+
+ if ((cr4 ^ old_cr4) & pdptr_bits)
+ kvm_mmu_reset_context(vcpu);
+
+ if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
+ update_cpuid(vcpu);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_cr4);
+
+int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
+ kvm_mmu_sync_roots(vcpu);
+ kvm_mmu_flush_tlb(vcpu);
+ return 0;
+ }
+
+ if (is_long_mode(vcpu)) {
+ if (cr3 & CR3_L_MODE_RESERVED_BITS)
+ return 1;
+ } else {
+ if (is_pae(vcpu)) {
+ if (cr3 & CR3_PAE_RESERVED_BITS)
+ return 1;
+ if (is_paging(vcpu) &&
+ !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+ return 1;
+ }
+ /*
+ * We don't check reserved bits in nonpae mode, because
+ * this isn't enforced, and VMware depends on this.
+ */
+ }
+
+ /*
+ * Does the new cr3 value map to physical memory? (Note, we
+ * catch an invalid cr3 even in real-mode, because it would
+ * cause trouble later on when we turn on paging anyway.)
+ *
+ * A real CPU would silently accept an invalid cr3 and would
+ * attempt to use it - with largely undefined (and often hard
+ * to debug) behavior on the guest side.
+ */
+ if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
+ return 1;
+ vcpu->arch.cr3 = cr3;
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+ vcpu->arch.mmu.new_cr3(vcpu);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_cr3);
+
+int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+ if (cr8 & CR8_RESERVED_BITS)
+ return 1;
+ if (irqchip_in_kernel(vcpu->kvm))
+ kvm_lapic_set_tpr(vcpu, cr8);
+ else
+ vcpu->arch.cr8 = cr8;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_cr8);
+
+unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
+{
+ if (irqchip_in_kernel(vcpu->kvm))
+ return kvm_lapic_get_cr8(vcpu);
+ else
+ return vcpu->arch.cr8;
+}
+EXPORT_SYMBOL_GPL(kvm_get_cr8);
+
+static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+{
+ switch (dr) {
+ case 0 ... 3:
+ vcpu->arch.db[dr] = val;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+ vcpu->arch.eff_db[dr] = val;
+ break;
+ case 4:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return 1; /* #UD */
+ /* fall through */
+ case 6:
+ if (val & 0xffffffff00000000ULL)
+ return -1; /* #GP */
+ vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
+ break;
+ case 5:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return 1; /* #UD */
+ /* fall through */
+ default: /* 7 */
+ if (val & 0xffffffff00000000ULL)
+ return -1; /* #GP */
+ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
+ kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
+ vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
+ }
+ break;
+ }
+
+ return 0;
+}
+
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+{
+ int res;
+
+ res = __kvm_set_dr(vcpu, dr, val);
+ if (res > 0)
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ else if (res < 0)
+ kvm_inject_gp(vcpu, 0);
+
+ return res;
+}
+EXPORT_SYMBOL_GPL(kvm_set_dr);
+
+static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+{
+ switch (dr) {
+ case 0 ... 3:
+ *val = vcpu->arch.db[dr];
+ break;
+ case 4:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return 1;
+ /* fall through */
+ case 6:
+ *val = vcpu->arch.dr6;
+ break;
+ case 5:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return 1;
+ /* fall through */
+ default: /* 7 */
+ *val = vcpu->arch.dr7;
+ break;
+ }
+
+ return 0;
+}
+
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+{
+ if (_kvm_get_dr(vcpu, dr, val)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_get_dr);
+
+/*
+ * List of msr numbers which we expose to userspace through KVM_GET_MSRS
+ * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
+ *
+ * This list is modified at module load time to reflect the
+ * capabilities of the host cpu. This capabilities test skips MSRs that are
+ * kvm-specific. Those are put in the beginning of the list.
+ */
+
+#define KVM_SAVE_MSRS_BEGIN 8
+static u32 msrs_to_save[] = {
+ MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+ MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
+ HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
+ HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
+ MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+ MSR_STAR,
+#ifdef CONFIG_X86_64
+ MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
+#endif
+ MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+};
+
+static unsigned num_msrs_to_save;
+
+static u32 emulated_msrs[] = {
+ MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MCG_STATUS,
+ MSR_IA32_MCG_CTL,
+};
+
+static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ u64 old_efer = vcpu->arch.efer;
+
+ if (efer & efer_reserved_bits)
+ return 1;
+
+ if (is_paging(vcpu)
+ && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
+ return 1;
+
+ if (efer & EFER_FFXSR) {
+ struct kvm_cpuid_entry2 *feat;
+
+ feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+ if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
+ return 1;
+ }
+
+ if (efer & EFER_SVME) {
+ struct kvm_cpuid_entry2 *feat;
+
+ feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+ if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
+ return 1;
+ }
+
+ efer &= ~EFER_LMA;
+ efer |= vcpu->arch.efer & EFER_LMA;
+
+ kvm_x86_ops->set_efer(vcpu, efer);
+
+ vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
+
+ /* Update reserved bits */
+ if ((efer ^ old_efer) & EFER_NX)
+ kvm_mmu_reset_context(vcpu);
+
+ return 0;
+}
+
+void kvm_enable_efer_bits(u64 mask)
+{
+ efer_reserved_bits &= ~mask;
+}
+EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
+
+
+/*
+ * Writes msr value into into the appropriate "register".
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+ return kvm_x86_ops->set_msr(vcpu, msr_index, data);
+}
+
+/*
+ * Adapt set_msr() to msr_io()'s calling convention
+ */
+static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+ return kvm_set_msr(vcpu, index, *data);
+}
+
+static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
+{
+ int version;
+ int r;
+ struct pvclock_wall_clock wc;
+ struct timespec boot;
+
+ if (!wall_clock)
+ return;
+
+ r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
+ if (r)
+ return;
+
+ if (version & 1)
+ ++version; /* first time write, random junk */
+
+ ++version;
+
+ kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+
+ /*
+ * The guest calculates current wall clock time by adding
+ * system time (updated by kvm_guest_time_update below) to the
+ * wall clock specified here. guest system time equals host
+ * system time for us, thus we must fill in host boot time here.
+ */
+ kvm_getboottime(&boot);
+
+ wc.sec = boot.tv_sec;
+ wc.nsec = boot.tv_nsec;
+ wc.version = version;
+
+ kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
+
+ version++;
+ kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+}
+
+static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
+{
+ uint32_t quotient, remainder;
+
+ /* Don't try to replace with do_div(), this one calculates
+ * "(dividend << 32) / divisor" */
+ __asm__ ( "divl %4"
+ : "=a" (quotient), "=d" (remainder)
+ : "0" (0), "1" (dividend), "r" (divisor) );
+ return quotient;
+}
+
+static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
+ s8 *pshift, u32 *pmultiplier)
+{
+ uint64_t scaled64;
+ int32_t shift = 0;
+ uint64_t tps64;
+ uint32_t tps32;
+
+ tps64 = base_khz * 1000LL;
+ scaled64 = scaled_khz * 1000LL;
+ while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
+ tps64 >>= 1;
+ shift--;
+ }
+
+ tps32 = (uint32_t)tps64;
+ while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
+ if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
+ scaled64 >>= 1;
+ else
+ tps32 <<= 1;
+ shift++;
+ }
+
+ *pshift = shift;
+ *pmultiplier = div_frac(scaled64, tps32);
+
+ pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
+ __func__, base_khz, scaled_khz, shift, *pmultiplier);
+}
+
+static inline u64 get_kernel_ns(void)
+{
+ struct timespec ts;
+
+ WARN_ON(preemptible());
+ ktime_get_ts(&ts);
+ kvm_monotonic_to_bootbased(&ts);
+ return timespec_to_ns(&ts);
+}
+
+static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
+unsigned long max_tsc_khz;
+
+static inline int kvm_tsc_changes_freq(void)
+{
+ int cpu = get_cpu();
+ int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+ cpufreq_quick_get(cpu) != 0;
+ put_cpu();
+ return ret;
+}
+
+static inline u64 nsec_to_cycles(u64 nsec)
+{
+ u64 ret;
+
+ WARN_ON(preemptible());
+ if (kvm_tsc_changes_freq())
+ printk_once(KERN_WARNING
+ "kvm: unreliable cycle conversion on adjustable rate TSC\n");
+ ret = nsec * kvm___this_cpu_read(cpu_tsc_khz);
+ do_div(ret, USEC_PER_SEC);
+ return ret;
+}
+
+static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz)
+{
+ /* Compute a scale to convert nanoseconds in TSC cycles */
+ kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
+ &kvm->arch.virtual_tsc_shift,
+ &kvm->arch.virtual_tsc_mult);
+ kvm->arch.virtual_tsc_khz = this_tsc_khz;
+}
+
+static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
+{
+ u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
+ vcpu->kvm->arch.virtual_tsc_mult,
+ vcpu->kvm->arch.virtual_tsc_shift);
+ tsc += vcpu->arch.last_tsc_write;
+ return tsc;
+}
+
+void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u64 offset, ns, elapsed;
+ unsigned long flags;
+ s64 sdiff;
+
+ spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
+ offset = data - kvm_native_read_tsc();
+ ns = get_kernel_ns();
+ elapsed = ns - kvm->arch.last_tsc_nsec;
+ sdiff = data - kvm->arch.last_tsc_write;
+ if (sdiff < 0)
+ sdiff = -sdiff;
+
+ /*
+ * Special case: close write to TSC within 5 seconds of
+ * another CPU is interpreted as an attempt to synchronize
+ * The 5 seconds is to accomodate host load / swapping as
+ * well as any reset of TSC during the boot process.
+ *
+ * In that case, for a reliable TSC, we can match TSC offsets,
+ * or make a best guest using elapsed value.
+ */
+ if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) &&
+ elapsed < 5ULL * NSEC_PER_SEC) {
+ if (!kvm_check_tsc_unstable()) {
+ offset = kvm->arch.last_tsc_offset;
+ pr_debug("kvm: matched tsc offset for %llu\n", data);
+ } else {
+ u64 delta = nsec_to_cycles(elapsed);
+ offset += delta;
+ pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
+ }
+ ns = kvm->arch.last_tsc_nsec;
+ }
+ kvm->arch.last_tsc_nsec = ns;
+ kvm->arch.last_tsc_write = data;
+ kvm->arch.last_tsc_offset = offset;
+ kvm_x86_ops->write_tsc_offset(vcpu, offset);
+ spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+
+ /* Reset of TSC must disable overshoot protection below */
+ vcpu->arch.hv_clock.tsc_timestamp = 0;
+ vcpu->arch.last_tsc_write = data;
+ vcpu->arch.last_tsc_nsec = ns;
+}
+EXPORT_SYMBOL_GPL(kvm_write_tsc);
+
+static int kvm_guest_time_update(struct kvm_vcpu *v)
+{
+ unsigned long flags;
+ struct kvm_vcpu_arch *vcpu = &v->arch;
+ void *shared_kaddr;
+ unsigned long this_tsc_khz;
+ s64 kernel_ns, max_kernel_ns;
+ u64 tsc_timestamp;
+
+ /* Keep irq disabled to prevent changes to the clock */
+ local_irq_save(flags);
+ kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
+ kernel_ns = get_kernel_ns();
+ this_tsc_khz = kvm___this_cpu_read(cpu_tsc_khz);
+
+ if (unlikely(this_tsc_khz == 0)) {
+ local_irq_restore(flags);
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
+ return 1;
+ }
+
+ /*
+ * We may have to catch up the TSC to match elapsed wall clock
+ * time for two reasons, even if kvmclock is used.
+ * 1) CPU could have been running below the maximum TSC rate
+ * 2) Broken TSC compensation resets the base at each VCPU
+ * entry to avoid unknown leaps of TSC even when running
+ * again on the same CPU. This may cause apparent elapsed
+ * time to disappear, and the guest to stand still or run
+ * very slowly.
+ */
+ if (vcpu->tsc_catchup) {
+ u64 tsc = compute_guest_tsc(v, kernel_ns);
+ if (tsc > tsc_timestamp) {
+ kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
+ tsc_timestamp = tsc;
+ }
+ }
+
+ local_irq_restore(flags);
+
+ if (!vcpu->time_page)
+ return 0;
+
+ /*
+ * Time as measured by the TSC may go backwards when resetting the base
+ * tsc_timestamp. The reason for this is that the TSC resolution is
+ * higher than the resolution of the other clock scales. Thus, many
+ * possible measurments of the TSC correspond to one measurement of any
+ * other clock, and so a spread of values is possible. This is not a
+ * problem for the computation of the nanosecond clock; with TSC rates
+ * around 1GHZ, there can only be a few cycles which correspond to one
+ * nanosecond value, and any path through this code will inevitably
+ * take longer than that. However, with the kernel_ns value itself,
+ * the precision may be much lower, down to HZ granularity. If the
+ * first sampling of TSC against kernel_ns ends in the low part of the
+ * range, and the second in the high end of the range, we can get:
+ *
+ * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
+ *
+ * As the sampling errors potentially range in the thousands of cycles,
+ * it is possible such a time value has already been observed by the
+ * guest. To protect against this, we must compute the system time as
+ * observed by the guest and ensure the new system time is greater.
+ */
+ max_kernel_ns = 0;
+ if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
+ max_kernel_ns = vcpu->last_guest_tsc -
+ vcpu->hv_clock.tsc_timestamp;
+ max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
+ vcpu->hv_clock.tsc_to_system_mul,
+ vcpu->hv_clock.tsc_shift);
+ max_kernel_ns += vcpu->last_kernel_ns;
+ }
+
+ if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
+ kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
+ &vcpu->hv_clock.tsc_shift,
+ &vcpu->hv_clock.tsc_to_system_mul);
+ vcpu->hw_tsc_khz = this_tsc_khz;
+ }
+
+ if (max_kernel_ns > kernel_ns)
+ kernel_ns = max_kernel_ns;
+
+ /* With all the info we got, fill in the values */
+ vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
+ vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
+ vcpu->last_kernel_ns = kernel_ns;
+ vcpu->last_guest_tsc = tsc_timestamp;
+ vcpu->hv_clock.flags = 0;
+
+ /*
+ * The interface expects us to write an even number signaling that the
+ * update is finished. Since the guest won't see the intermediate
+ * state, we just increase by 2 at the end.
+ */
+ vcpu->hv_clock.version += 2;
+
+ shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
+
+ memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock));
+
+ kunmap_atomic(shared_kaddr, KM_USER0);
+
+ mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
+ return 0;
+}
+
+static bool msr_mtrr_valid(unsigned msr)
+{
+ switch (msr) {
+ case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
+ case MSR_MTRRfix64K_00000:
+ case MSR_MTRRfix16K_80000:
+ case MSR_MTRRfix16K_A0000:
+ case MSR_MTRRfix4K_C0000:
+ case MSR_MTRRfix4K_C8000:
+ case MSR_MTRRfix4K_D0000:
+ case MSR_MTRRfix4K_D8000:
+ case MSR_MTRRfix4K_E0000:
+ case MSR_MTRRfix4K_E8000:
+ case MSR_MTRRfix4K_F0000:
+ case MSR_MTRRfix4K_F8000:
+ case MSR_MTRRdefType:
+ case MSR_IA32_CR_PAT:
+ return true;
+ case 0x2f8:
+ return true;
+ }
+ return false;
+}
+
+static bool valid_pat_type(unsigned t)
+{
+ return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
+}
+
+static bool valid_mtrr_type(unsigned t)
+{
+ return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
+}
+
+static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ int i;
+
+ if (!msr_mtrr_valid(msr))
+ return false;
+
+ if (msr == MSR_IA32_CR_PAT) {
+ for (i = 0; i < 8; i++)
+ if (!valid_pat_type((data >> (i * 8)) & 0xff))
+ return false;
+ return true;
+ } else if (msr == MSR_MTRRdefType) {
+ if (data & ~0xcff)
+ return false;
+ return valid_mtrr_type(data & 0xff);
+ } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
+ for (i = 0; i < 8 ; i++)
+ if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
+ return false;
+ return true;
+ }
+
+ /* variable MTRRs */
+ return valid_mtrr_type(data & 0xff);
+}
+
+static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
+
+ if (!mtrr_valid(vcpu, msr, data))
+ return 1;
+
+ if (msr == MSR_MTRRdefType) {
+ vcpu->arch.mtrr_state.def_type = data;
+ vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
+ } else if (msr == MSR_MTRRfix64K_00000)
+ p[0] = data;
+ else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
+ p[1 + msr - MSR_MTRRfix16K_80000] = data;
+ else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
+ p[3 + msr - MSR_MTRRfix4K_C0000] = data;
+ else if (msr == MSR_IA32_CR_PAT)
+ vcpu->arch.pat = data;
+ else { /* Variable MTRRs */
+ int idx, is_mtrr_mask;
+ u64 *pt;
+
+ idx = (msr - 0x200) / 2;
+ is_mtrr_mask = msr - 0x200 - 2 * idx;
+ if (!is_mtrr_mask)
+ pt =
+ (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
+ else
+ pt =
+ (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
+ *pt = data;
+ }
+
+ kvm_mmu_reset_context(vcpu);
+ return 0;
+}
+
+static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+
+ switch (msr) {
+ case MSR_IA32_MCG_STATUS:
+ vcpu->arch.mcg_status = data;
+ break;
+ case MSR_IA32_MCG_CTL:
+ if (!(mcg_cap & MCG_CTL_P))
+ return 1;
+ if (data != 0 && data != ~(u64)0)
+ return -1;
+ vcpu->arch.mcg_ctl = data;
+ break;
+ default:
+ if (msr >= MSR_IA32_MC0_CTL &&
+ msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+ u32 offset = msr - MSR_IA32_MC0_CTL;
+ /* only 0 or all 1s can be written to IA32_MCi_CTL
+ * some Linux kernels though clear bit 10 in bank 4 to
+ * workaround a BIOS/GART TBL issue on AMD K8s, ignore
+ * this to avoid an uncatched #GP in the guest
+ */
+ if ((offset & 0x3) == 0 &&
+ data != 0 && (data | (1 << 10)) != ~(u64)0)
+ return -1;
+ vcpu->arch.mce_banks[offset] = data;
+ break;
+ }
+ return 1;
+ }
+ return 0;
+}
+
+static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct kvm *kvm = vcpu->kvm;
+ int lm = is_long_mode(vcpu);
+ u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
+ : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
+ u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
+ : kvm->arch.xen_hvm_config.blob_size_32;
+ u32 page_num = data & ~PAGE_MASK;
+ u64 page_addr = data & PAGE_MASK;
+ u8 *page;
+ int r;
+
+ r = -E2BIG;
+ if (page_num >= blob_size)
+ goto out;
+ r = -ENOMEM;
+ page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!page)
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
+ goto out_free;
+ if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
+ goto out_free;
+ r = 0;
+out_free:
+ kfree(page);
+out:
+ return r;
+}
+
+static bool kvm_hv_hypercall_enabled(struct kvm *kvm)
+{
+ return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
+}
+
+static bool kvm_hv_msr_partition_wide(u32 msr)
+{
+ bool r = false;
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ case HV_X64_MSR_HYPERCALL:
+ r = true;
+ break;
+ }
+
+ return r;
+}
+
+static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ kvm->arch.hv_guest_os_id = data;
+ /* setting guest os id to zero disables hypercall page */
+ if (!kvm->arch.hv_guest_os_id)
+ kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
+ break;
+ case HV_X64_MSR_HYPERCALL: {
+ u64 gfn;
+ unsigned long addr;
+ u8 instructions[4];
+
+ /* if guest os id is not set hypercall should remain disabled */
+ if (!kvm->arch.hv_guest_os_id)
+ break;
+ if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
+ kvm->arch.hv_hypercall = data;
+ break;
+ }
+ gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return 1;
+ kvm_x86_ops->patch_hypercall(vcpu, instructions);
+ ((unsigned char *)instructions)[3] = 0xc3; /* ret */
+ if (copy_to_user((void *)addr, instructions, 4))
+ return 1;
+ kvm->arch.hv_hypercall = data;
+ break;
+ }
+ default:
+ pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
+ "data 0x%llx\n", msr, data);
+ return 1;
+ }
+ return 0;
+}
+
+static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ switch (msr) {
+ case HV_X64_MSR_APIC_ASSIST_PAGE: {
+ unsigned long addr;
+
+ if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
+ vcpu->arch.hv_vapic = data;
+ break;
+ }
+ addr = gfn_to_hva(vcpu->kvm, data >>
+ HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
+ if (kvm_is_error_hva(addr))
+ return 1;
+ if (clear_user((void *)addr, PAGE_SIZE))
+ return 1;
+ vcpu->arch.hv_vapic = data;
+ break;
+ }
+ case HV_X64_MSR_EOI:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
+ case HV_X64_MSR_ICR:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
+ case HV_X64_MSR_TPR:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
+ default:
+ pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
+ "data 0x%llx\n", msr, data);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
+{
+ gpa_t gpa = data & ~0x3f;
+
+ /* Bits 2:5 are resrved, Should be zero */
+ if (data & 0x3c)
+ return 1;
+
+ vcpu->arch.apf.msr_val = data;
+
+ if (!(data & KVM_ASYNC_PF_ENABLED)) {
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_async_pf_hash_reset(vcpu);
+ return 0;
+ }
+
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
+ return 1;
+
+ vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
+ kvm_async_pf_wakeup_all(vcpu);
+ return 0;
+}
+
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ switch (msr) {
+ case MSR_EFER:
+ return set_efer(vcpu, data);
+ case MSR_K7_HWCR:
+ data &= ~(u64)0x40; /* ignore flush filter disable */
+ data &= ~(u64)0x100; /* ignore ignne emulation enable */
+ if (data != 0) {
+ pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
+ data);
+ return 1;
+ }
+ break;
+ case MSR_FAM10H_MMIO_CONF_BASE:
+ if (data != 0) {
+ pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
+ "0x%llx\n", data);
+ return 1;
+ }
+ break;
+ case MSR_AMD64_NB_CFG:
+ break;
+ case MSR_IA32_DEBUGCTLMSR:
+ if (!data) {
+ /* We support the non-activated case already */
+ break;
+ } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
+ /* Values other than LBR and BTF are vendor-specific,
+ thus reserved and should throw a #GP */
+ return 1;
+ }
+ pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
+ __func__, data);
+ break;
+ case MSR_IA32_UCODE_REV:
+ case MSR_IA32_UCODE_WRITE:
+ case MSR_VM_HSAVE_PA:
+ case MSR_AMD64_PATCH_LOADER:
+ break;
+ case 0x200 ... 0x2ff:
+ return set_msr_mtrr(vcpu, msr, data);
+ case MSR_IA32_APICBASE:
+ kvm_set_apic_base(vcpu, data);
+ break;
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
+ return kvm_x2apic_msr_write(vcpu, msr, data);
+ case MSR_IA32_MISC_ENABLE:
+ vcpu->arch.ia32_misc_enable_msr = data;
+ break;
+ case MSR_KVM_WALL_CLOCK_NEW:
+ case MSR_KVM_WALL_CLOCK:
+ vcpu->kvm->arch.wall_clock = data;
+ kvm_write_wall_clock(vcpu->kvm, data);
+ break;
+ case MSR_KVM_SYSTEM_TIME_NEW:
+ case MSR_KVM_SYSTEM_TIME: {
+ if (vcpu->arch.time_page) {
+ kvm_release_page_dirty(vcpu->arch.time_page);
+ vcpu->arch.time_page = NULL;
+ }
+
+ vcpu->arch.time = data;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+
+ /* we verify if the enable bit is set... */
+ if (!(data & 1))
+ break;
+
+ /* ...but clean it before doing the actual write */
+ vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
+
+ vcpu->arch.time_page =
+ gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
+
+ if (is_error_page(vcpu->arch.time_page)) {
+ kvm_release_page_clean(vcpu->arch.time_page);
+ vcpu->arch.time_page = NULL;
+ }
+ break;
+ }
+ case MSR_KVM_ASYNC_PF_EN:
+ if (kvm_pv_enable_async_pf(vcpu, data))
+ return 1;
+ break;
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
+ return set_msr_mce(vcpu, msr, data);
+
+ /* Performance counters are not protected by a CPUID bit,
+ * so we should check all of them in the generic path for the sake of
+ * cross vendor migration.
+ * Writing a zero into the event select MSRs disables them,
+ * which we perfectly emulate ;-). Any other value should be at least
+ * reported, some guests depend on them.
+ */
+ case MSR_P6_EVNTSEL0:
+ case MSR_P6_EVNTSEL1:
+ case MSR_K7_EVNTSEL0:
+ case MSR_K7_EVNTSEL1:
+ case MSR_K7_EVNTSEL2:
+ case MSR_K7_EVNTSEL3:
+ if (data != 0)
+ pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+ "0x%x data 0x%llx\n", msr, data);
+ break;
+ /* at least RHEL 4 unconditionally writes to the perfctr registers,
+ * so we ignore writes to make it happy.
+ */
+ case MSR_P6_PERFCTR0:
+ case MSR_P6_PERFCTR1:
+ case MSR_K7_PERFCTR0:
+ case MSR_K7_PERFCTR1:
+ case MSR_K7_PERFCTR2:
+ case MSR_K7_PERFCTR3:
+ pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+ "0x%x data 0x%llx\n", msr, data);
+ break;
+ case MSR_K7_CLK_CTL:
+ /*
+ * Ignore all writes to this no longer documented MSR.
+ * Writes are only relevant for old K7 processors,
+ * all pre-dating SVM, but a recommended workaround from
+ * AMD for these chips. It is possible to speicify the
+ * affected processor models on the command line, hence
+ * the need to ignore the workaround.
+ */
+ break;
+ case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+ if (kvm_hv_msr_partition_wide(msr)) {
+ int r;
+ mutex_lock(&vcpu->kvm->lock);
+ r = set_msr_hyperv_pw(vcpu, msr, data);
+ mutex_unlock(&vcpu->kvm->lock);
+ return r;
+ } else
+ return set_msr_hyperv(vcpu, msr, data);
+ break;
+ default:
+ if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
+ return xen_hvm_config(vcpu, data);
+ if (!ignore_msrs) {
+ pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
+ msr, data);
+ return 1;
+ } else {
+ pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
+ msr, data);
+ break;
+ }
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_msr_common);
+
+
+/*
+ * Reads an msr value (of 'msr_index') into 'pdata'.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+{
+ return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
+}
+
+static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
+
+ if (!msr_mtrr_valid(msr))
+ return 1;
+
+ if (msr == MSR_MTRRdefType)
+ *pdata = vcpu->arch.mtrr_state.def_type +
+ (vcpu->arch.mtrr_state.enabled << 10);
+ else if (msr == MSR_MTRRfix64K_00000)
+ *pdata = p[0];
+ else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
+ *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
+ else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
+ *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
+ else if (msr == MSR_IA32_CR_PAT)
+ *pdata = vcpu->arch.pat;
+ else { /* Variable MTRRs */
+ int idx, is_mtrr_mask;
+ u64 *pt;
+
+ idx = (msr - 0x200) / 2;
+ is_mtrr_mask = msr - 0x200 - 2 * idx;
+ if (!is_mtrr_mask)
+ pt =
+ (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
+ else
+ pt =
+ (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
+ *pdata = *pt;
+ }
+
+ return 0;
+}
+
+static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ u64 data;
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+
+ switch (msr) {
+ case MSR_IA32_P5_MC_ADDR:
+ case MSR_IA32_P5_MC_TYPE:
+ data = 0;
+ break;
+ case MSR_IA32_MCG_CAP:
+ data = vcpu->arch.mcg_cap;
+ break;
+ case MSR_IA32_MCG_CTL:
+ if (!(mcg_cap & MCG_CTL_P))
+ return 1;
+ data = vcpu->arch.mcg_ctl;
+ break;
+ case MSR_IA32_MCG_STATUS:
+ data = vcpu->arch.mcg_status;
+ break;
+ default:
+ if (msr >= MSR_IA32_MC0_CTL &&
+ msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+ u32 offset = msr - MSR_IA32_MC0_CTL;
+ data = vcpu->arch.mce_banks[offset];
+ break;
+ }
+ return 1;
+ }
+ *pdata = data;
+ return 0;
+}
+
+static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ u64 data = 0;
+ struct kvm *kvm = vcpu->kvm;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ data = kvm->arch.hv_guest_os_id;
+ break;
+ case HV_X64_MSR_HYPERCALL:
+ data = kvm->arch.hv_hypercall;
+ break;
+ default:
+ pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ return 1;
+ }
+
+ *pdata = data;
+ return 0;
+}
+
+static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ u64 data = 0;
+
+ switch (msr) {
+ case HV_X64_MSR_VP_INDEX: {
+ int r;
+ struct kvm_vcpu *v;
+ kvm_for_each_vcpu(r, v, vcpu->kvm)
+ if (v == vcpu)
+ data = r;
+ break;
+ }
+ case HV_X64_MSR_EOI:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
+ case HV_X64_MSR_ICR:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
+ case HV_X64_MSR_TPR:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
+ default:
+ pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ return 1;
+ }
+ *pdata = data;
+ return 0;
+}
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ u64 data;
+
+ switch (msr) {
+ case MSR_IA32_PLATFORM_ID:
+ case MSR_IA32_UCODE_REV:
+ case MSR_IA32_EBL_CR_POWERON:
+ case MSR_IA32_DEBUGCTLMSR:
+ case MSR_IA32_LASTBRANCHFROMIP:
+ case MSR_IA32_LASTBRANCHTOIP:
+ case MSR_IA32_LASTINTFROMIP:
+ case MSR_IA32_LASTINTTOIP:
+ case MSR_K8_SYSCFG:
+ case MSR_K7_HWCR:
+ case MSR_VM_HSAVE_PA:
+ case MSR_P6_PERFCTR0:
+ case MSR_P6_PERFCTR1:
+ case MSR_P6_EVNTSEL0:
+ case MSR_P6_EVNTSEL1:
+ case MSR_K7_EVNTSEL0:
+ case MSR_K7_PERFCTR0:
+ case MSR_K8_INT_PENDING_MSG:
+ case MSR_AMD64_NB_CFG:
+ case MSR_FAM10H_MMIO_CONF_BASE:
+ data = 0;
+ break;
+ case MSR_MTRRcap:
+ data = 0x500 | KVM_NR_VAR_MTRR;
+ break;
+ case 0x200 ... 0x2ff:
+ return get_msr_mtrr(vcpu, msr, pdata);
+ case 0xcd: /* fsb frequency */
+ data = 3;
+ break;
+ /*
+ * MSR_EBC_FREQUENCY_ID
+ * Conservative value valid for even the basic CPU models.
+ * Models 0,1: 000 in bits 23:21 indicating a bus speed of
+ * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
+ * and 266MHz for model 3, or 4. Set Core Clock
+ * Frequency to System Bus Frequency Ratio to 1 (bits
+ * 31:24) even though these are only valid for CPU
+ * models > 2, however guests may end up dividing or
+ * multiplying by zero otherwise.
+ */
+ case MSR_EBC_FREQUENCY_ID:
+ data = 1 << 24;
+ break;
+ case MSR_IA32_APICBASE:
+ data = kvm_get_apic_base(vcpu);
+ break;
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
+ return kvm_x2apic_msr_read(vcpu, msr, pdata);
+ break;
+ case MSR_IA32_MISC_ENABLE:
+ data = vcpu->arch.ia32_misc_enable_msr;
+ break;
+ case MSR_IA32_PERF_STATUS:
+ /* TSC increment by tick */
+ data = 1000ULL;
+ /* CPU multiplier */
+ data |= (((uint64_t)4ULL) << 40);
+ break;
+ case MSR_EFER:
+ data = vcpu->arch.efer;
+ break;
+ case MSR_KVM_WALL_CLOCK:
+ case MSR_KVM_WALL_CLOCK_NEW:
+ data = vcpu->kvm->arch.wall_clock;
+ break;
+ case MSR_KVM_SYSTEM_TIME:
+ case MSR_KVM_SYSTEM_TIME_NEW:
+ data = vcpu->arch.time;
+ break;
+ case MSR_KVM_ASYNC_PF_EN:
+ data = vcpu->arch.apf.msr_val;
+ break;
+ case MSR_IA32_P5_MC_ADDR:
+ case MSR_IA32_P5_MC_TYPE:
+ case MSR_IA32_MCG_CAP:
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
+ return get_msr_mce(vcpu, msr, pdata);
+ case MSR_K7_CLK_CTL:
+ /*
+ * Provide expected ramp-up count for K7. All other
+ * are set to zero, indicating minimum divisors for
+ * every field.
+ *
+ * This prevents guest kernels on AMD host with CPU
+ * type 6, model 8 and higher from exploding due to
+ * the rdmsr failing.
+ */
+ data = 0x20000000;
+ break;
+ case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+ if (kvm_hv_msr_partition_wide(msr)) {
+ int r;
+ mutex_lock(&vcpu->kvm->lock);
+ r = get_msr_hyperv_pw(vcpu, msr, pdata);
+ mutex_unlock(&vcpu->kvm->lock);
+ return r;
+ } else
+ return get_msr_hyperv(vcpu, msr, pdata);
+ break;
+ default:
+ if (!ignore_msrs) {
+ pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
+ return 1;
+ } else {
+ pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
+ data = 0;
+ }
+ break;
+ }
+ *pdata = data;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_get_msr_common);
+
+/*
+ * Read or write a bunch of msrs. All parameters are kernel addresses.
+ *
+ * @return number of msrs set successfully.
+ */
+static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
+ struct kvm_msr_entry *entries,
+ int (*do_msr)(struct kvm_vcpu *vcpu,
+ unsigned index, u64 *data))
+{
+ int i, idx;
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ for (i = 0; i < msrs->nmsrs; ++i)
+ if (do_msr(vcpu, entries[i].index, &entries[i].data))
+ break;
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ return i;
+}
+
+/*
+ * Read or write a bunch of msrs. Parameters are user addresses.
+ *
+ * @return number of msrs set successfully.
+ */
+static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *user_msrs,
+ int (*do_msr)(struct kvm_vcpu *vcpu,
+ unsigned index, u64 *data),
+ int writeback)
+{
+ struct kvm_msrs msrs;
+ struct kvm_msr_entry *entries;
+ int r, n;
+ unsigned size;
+
+ r = -EFAULT;
+ if (copy_from_user(&msrs, user_msrs, sizeof msrs))
+ goto out;
+
+ r = -E2BIG;
+ if (msrs.nmsrs >= MAX_IO_MSRS)
+ goto out;
+
+ r = -ENOMEM;
+ size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
+ entries = kmalloc(size, GFP_KERNEL);
+ if (!entries)
+ goto out;
+
+ r = -EFAULT;
+ if (copy_from_user(entries, user_msrs->entries, size))
+ goto out_free;
+
+ r = n = __msr_io(vcpu, &msrs, entries, do_msr);
+ if (r < 0)
+ goto out_free;
+
+ r = -EFAULT;
+ if (writeback && copy_to_user(user_msrs->entries, entries, size))
+ goto out_free;
+
+ r = n;
+
+out_free:
+ kfree(entries);
+out:
+ return r;
+}
+
+int kvm_dev_ioctl_check_extension(long ext)
+{
+ int r;
+
+ switch (ext) {
+ case KVM_CAP_IRQCHIP:
+ case KVM_CAP_HLT:
+ case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
+ case KVM_CAP_SET_TSS_ADDR:
+ case KVM_CAP_EXT_CPUID:
+ case KVM_CAP_CLOCKSOURCE:
+ case KVM_CAP_PIT:
+ case KVM_CAP_NOP_IO_DELAY:
+ case KVM_CAP_MP_STATE:
+#ifdef CONFIG_MMU_NOTIFIER
+ case KVM_CAP_SYNC_MMU:
+#endif
+ case KVM_CAP_USER_NMI:
+ case KVM_CAP_REINJECT_CONTROL:
+ case KVM_CAP_IRQ_INJECT_STATUS:
+ case KVM_CAP_ASSIGN_DEV_IRQ:
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,33)
+ case KVM_CAP_IRQFD:
+ case KVM_CAP_IOEVENTFD:
+#endif
+ case KVM_CAP_PIT2:
+ case KVM_CAP_PIT_STATE2:
+ case KVM_CAP_SET_IDENTITY_MAP_ADDR:
+ case KVM_CAP_XEN_HVM:
+ case KVM_CAP_ADJUST_CLOCK:
+ case KVM_CAP_VCPU_EVENTS:
+ case KVM_CAP_HYPERV:
+ case KVM_CAP_HYPERV_VAPIC:
+ case KVM_CAP_HYPERV_SPIN:
+ case KVM_CAP_PCI_SEGMENT:
+ case KVM_CAP_DEBUGREGS:
+ case KVM_CAP_X86_ROBUST_SINGLESTEP:
+ case KVM_CAP_XSAVE:
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,38)
+ case KVM_CAP_ASYNC_PF:
+#endif
+ r = 1;
+ break;
+ case KVM_CAP_COALESCED_MMIO:
+ r = KVM_COALESCED_MMIO_PAGE_OFFSET;
+ break;
+ case KVM_CAP_VAPIC:
+ r = !kvm_x86_ops->cpu_has_accelerated_tpr();
+ break;
+ case KVM_CAP_NR_VCPUS:
+ r = KVM_MAX_VCPUS;
+ break;
+ case KVM_CAP_NR_MEMSLOTS:
+ r = KVM_MEMORY_SLOTS;
+ break;
+ case KVM_CAP_PV_MMU: /* obsolete */
+ r = 0;
+ break;
+ case KVM_CAP_IOMMU:
+ r = iommu_found();
+ break;
+ case KVM_CAP_MCE:
+ r = KVM_MAX_MCE_BANKS;
+ break;
+ case KVM_CAP_XCRS:
+ r = kvm_cpu_has_xsave;
+ break;
+ default:
+ r = 0;
+ break;
+ }
+ return r;
+
+}
+
+long kvm_arch_dev_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ void *argp = (void *)arg;
+ long r;
+
+ switch (ioctl) {
+ case KVM_GET_MSR_INDEX_LIST: {
+ struct kvm_msr_list *user_msr_list = argp;
+ struct kvm_msr_list msr_list;
+ unsigned n;
+
+ r = -EFAULT;
+ if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
+ goto out;
+ n = msr_list.nmsrs;
+ msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
+ if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
+ goto out;
+ r = -E2BIG;
+ if (n < msr_list.nmsrs)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(user_msr_list->indices, &msrs_to_save,
+ num_msrs_to_save * sizeof(u32)))
+ goto out;
+ if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
+ &emulated_msrs,
+ ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_GET_SUPPORTED_CPUID: {
+ struct kvm_cpuid2 *cpuid_arg = argp;
+ struct kvm_cpuid2 cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ goto out;
+ r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
+ cpuid_arg->entries);
+ if (r)
+ goto out;
+
+ r = -EFAULT;
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_X86_GET_MCE_CAP_SUPPORTED: {
+ u64 mce_cap;
+
+ mce_cap = KVM_MCE_CAP_SUPPORTED;
+ r = -EFAULT;
+ if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
+ goto out;
+ r = 0;
+ break;
+ }
+ default:
+ r = -EINVAL;
+ }
+out:
+ return r;
+}
+
+static void wbinvd_ipi(void *garbage)
+{
+ wbinvd();
+}
+
+static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+ return vcpu->kvm->arch.iommu_domain &&
+ !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
+}
+
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ /* Address WBINVD may be executed by guest */
+ if (need_emulate_wbinvd(vcpu)) {
+ if (kvm_x86_ops->has_wbinvd_exit())
+ cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
+ else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
+ smp_call_function_single(vcpu->cpu,
+ wbinvd_ipi, NULL, 1);
+ }
+
+ kvm_x86_ops->vcpu_load(vcpu, cpu);
+ if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
+ /* Make sure TSC doesn't go backwards */
+ s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
+ kvm_native_read_tsc() - vcpu->arch.last_host_tsc;
+ if (tsc_delta < 0)
+ mark_tsc_unstable("KVM discovered backwards TSC");
+ if (kvm_check_tsc_unstable()) {
+ kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
+ vcpu->arch.tsc_catchup = 1;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ }
+ if (vcpu->cpu != cpu)
+ kvm_migrate_timers(vcpu);
+ vcpu->cpu = cpu;
+ }
+}
+
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops->vcpu_put(vcpu);
+ kvm_put_guest_fpu(vcpu);
+ vcpu->arch.last_host_tsc = kvm_native_read_tsc();
+}
+
+static int is_efer_nx(void)
+{
+ unsigned long long efer = 0;
+
+ rdmsrl_safe(MSR_EFER, &efer);
+ return efer & EFER_NX;
+}
+
+static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_cpuid_entry2 *e, *entry;
+
+ entry = NULL;
+ for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+ e = &vcpu->arch.cpuid_entries[i];
+ if (e->function == 0x80000001) {
+ entry = e;
+ break;
+ }
+ }
+ if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
+ entry->edx &= ~(1 << 20);
+ printk(KERN_INFO "kvm: guest NX capability removed\n");
+ }
+}
+
+/* when an old userspace process fills a new kernel module */
+static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid *cpuid,
+ struct kvm_cpuid_entry *entries)
+{
+ int r, i;
+ struct kvm_cpuid_entry *cpuid_entries;
+
+ r = -E2BIG;
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ goto out;
+ r = -ENOMEM;
+ cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
+ if (!cpuid_entries)
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(cpuid_entries, entries,
+ cpuid->nent * sizeof(struct kvm_cpuid_entry)))
+ goto out_free;
+ for (i = 0; i < cpuid->nent; i++) {
+ vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
+ vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
+ vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
+ vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
+ vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
+ vcpu->arch.cpuid_entries[i].index = 0;
+ vcpu->arch.cpuid_entries[i].flags = 0;
+ vcpu->arch.cpuid_entries[i].padding[0] = 0;
+ vcpu->arch.cpuid_entries[i].padding[1] = 0;
+ vcpu->arch.cpuid_entries[i].padding[2] = 0;
+ }
+ vcpu->arch.cpuid_nent = cpuid->nent;
+ cpuid_fix_nx_cap(vcpu);
+ r = 0;
+ kvm_apic_set_version(vcpu);
+ kvm_x86_ops->cpuid_update(vcpu);
+ update_cpuid(vcpu);
+
+out_free:
+ vfree(cpuid_entries);
+out:
+ return r;
+}
+
+static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 *entries)
+{
+ int r;
+
+ r = -E2BIG;
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
+ cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
+ goto out;
+ vcpu->arch.cpuid_nent = cpuid->nent;
+ kvm_apic_set_version(vcpu);
+ kvm_x86_ops->cpuid_update(vcpu);
+ update_cpuid(vcpu);
+ return 0;
+
+out:
+ return r;
+}
+
+static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 *entries)
+{
+ int r;
+
+ r = -E2BIG;
+ if (cpuid->nent < vcpu->arch.cpuid_nent)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
+ vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
+ goto out;
+ return 0;
+
+out:
+ cpuid->nent = vcpu->arch.cpuid_nent;
+ return r;
+}
+
+static void cpuid_mask(u32 *word, int wordnum)
+{
+ *word &= boot_cpu_data.x86_capability[wordnum];
+}
+
+static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+ u32 index)
+{
+ entry->function = function;
+ entry->index = index;
+ cpuid_count(entry->function, entry->index,
+ &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
+ entry->flags = 0;
+}
+
+#define F(x) bit(X86_FEATURE_##x)
+
+static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+ u32 index, int *nent, int maxnent)
+{
+ unsigned f_nx = is_efer_nx() ? F(NX) : 0;
+#ifdef CONFIG_X86_64
+ unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
+ ? F(GBPAGES) : 0;
+ unsigned f_lm = F(LM);
+#else
+ unsigned f_gbpages = 0;
+ unsigned f_lm = 0;
+#endif
+ unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
+
+ /* cpuid 1.edx */
+ const u32 kvm_supported_word0_x86_features =
+ F(FPU) | F(VME) | F(DE) | F(PSE) |
+ F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+ F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
+ F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+ F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
+ 0 /* Reserved, DS, ACPI */ | F(MMX) |
+ F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
+ 0 /* HTT, TM, Reserved, PBE */;
+ /* cpuid 0x80000001.edx */
+ const u32 kvm_supported_word1_x86_features =
+ F(FPU) | F(VME) | F(DE) | F(PSE) |
+ F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+ F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
+ F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+ F(PAT) | F(PSE36) | 0 /* Reserved */ |
+ f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
+ F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
+ 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
+ /* cpuid 1.ecx */
+ const u32 kvm_supported_word4_x86_features =
+ F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
+ 0 /* DS-CPL, VMX, SMX, EST */ |
+ 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
+ 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
+ 0 /* Reserved, DCA */ | F(XMM4_1) |
+ F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
+ 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
+ F(F16C);
+ /* cpuid 0x80000001.ecx */
+ const u32 kvm_supported_word6_x86_features =
+ F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
+ F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
+ F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
+ 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
+
+ /* all calls to cpuid_count() should be made on the same cpu */
+ get_cpu();
+ do_cpuid_1_ent(entry, function, index);
+ ++*nent;
+
+ switch (function) {
+ case 0:
+ entry->eax = min(entry->eax, (u32)0xd);
+ break;
+ case 1:
+ entry->edx &= kvm_supported_word0_x86_features;
+ cpuid_mask(&entry->edx, 0);
+ entry->ecx &= kvm_supported_word4_x86_features;
+ cpuid_mask(&entry->ecx, 4);
+ /* we support x2apic emulation even if host does not support
+ * it since we emulate x2apic in software */
+ entry->ecx |= F(X2APIC);
+ break;
+ /* function 2 entries are STATEFUL. That is, repeated cpuid commands
+ * may return different values. This forces us to get_cpu() before
+ * issuing the first command, and also to emulate this annoying behavior
+ * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
+ case 2: {
+ int t, times = entry->eax & 0xff;
+
+ entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+ entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+ for (t = 1; t < times && *nent < maxnent; ++t) {
+ do_cpuid_1_ent(&entry[t], function, 0);
+ entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+ ++*nent;
+ }
+ break;
+ }
+ /* function 4 and 0xb have additional index. */
+ case 4: {
+ int i, cache_type;
+
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ /* read more entries until cache_type is zero */
+ for (i = 1; *nent < maxnent; ++i) {
+ cache_type = entry[i - 1].eax & 0x1f;
+ if (!cache_type)
+ break;
+ do_cpuid_1_ent(&entry[i], function, i);
+ entry[i].flags |=
+ KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ ++*nent;
+ }
+ break;
+ }
+ case 0xb: {
+ int i, level_type;
+
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ /* read more entries until level_type is zero */
+ for (i = 1; *nent < maxnent; ++i) {
+ level_type = entry[i - 1].ecx & 0xff00;
+ if (!level_type)
+ break;
+ do_cpuid_1_ent(&entry[i], function, i);
+ entry[i].flags |=
+ KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ ++*nent;
+ }
+ break;
+ }
+ case 0xd: {
+ int i;
+
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ for (i = 1; *nent < maxnent; ++i) {
+ if (entry[i - 1].eax == 0 && i != 2)
+ break;
+ do_cpuid_1_ent(&entry[i], function, i);
+ entry[i].flags |=
+ KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ ++*nent;
+ }
+ break;
+ }
+ case KVM_CPUID_SIGNATURE: {
+ char signature[12] = "KVMKVMKVM\0\0";
+ u32 *sigptr = (u32 *)signature;
+ entry->eax = 0;
+ entry->ebx = sigptr[0];
+ entry->ecx = sigptr[1];
+ entry->edx = sigptr[2];
+ break;
+ }
+ case KVM_CPUID_FEATURES:
+ entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
+ (1 << KVM_FEATURE_NOP_IO_DELAY) |
+ (1 << KVM_FEATURE_CLOCKSOURCE2) |
+ (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
+ entry->ebx = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ break;
+ case 0x80000000:
+ entry->eax = min(entry->eax, 0x8000001a);
+ break;
+ case 0x80000001:
+ entry->edx &= kvm_supported_word1_x86_features;
+ cpuid_mask(&entry->edx, 1);
+ entry->ecx &= kvm_supported_word6_x86_features;
+ cpuid_mask(&entry->ecx, 6);
+ break;
+ }
+
+ kvm_x86_ops->set_supported_cpuid(function, entry);
+
+ put_cpu();
+}
+
+#undef F
+
+static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 *entries)
+{
+ struct kvm_cpuid_entry2 *cpuid_entries;
+ int limit, nent = 0, r = -E2BIG;
+ u32 func;
+
+ if (cpuid->nent < 1)
+ goto out;
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ cpuid->nent = KVM_MAX_CPUID_ENTRIES;
+ r = -ENOMEM;
+ cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
+ if (!cpuid_entries)
+ goto out;
+
+ do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
+ limit = cpuid_entries[0].eax;
+ for (func = 1; func <= limit && nent < cpuid->nent; ++func)
+ do_cpuid_ent(&cpuid_entries[nent], func, 0,
+ &nent, cpuid->nent);
+ r = -E2BIG;
+ if (nent >= cpuid->nent)
+ goto out_free;
+
+ do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
+ limit = cpuid_entries[nent - 1].eax;
+ for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
+ do_cpuid_ent(&cpuid_entries[nent], func, 0,
+ &nent, cpuid->nent);
+
+
+
+ r = -E2BIG;
+ if (nent >= cpuid->nent)
+ goto out_free;
+
+ do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
+ cpuid->nent);
+
+ r = -E2BIG;
+ if (nent >= cpuid->nent)
+ goto out_free;
+
+ do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent,
+ cpuid->nent);
+
+ r = -E2BIG;
+ if (nent >= cpuid->nent)
+ goto out_free;
+
+ r = -EFAULT;
+ if (copy_to_user(entries, cpuid_entries,
+ nent * sizeof(struct kvm_cpuid_entry2)))
+ goto out_free;
+ cpuid->nent = nent;
+ r = 0;
+
+out_free:
+ vfree(cpuid_entries);
+out:
+ return r;
+}
+
+static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s)
+{
+ memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
+
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s)
+{
+ memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
+ kvm_apic_post_state_restore(vcpu);
+ update_cr8_intercept(vcpu);
+
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
+ struct kvm_interrupt *irq)
+{
+ if (irq->irq < 0 || irq->irq >= 256)
+ return -EINVAL;
+ if (irqchip_in_kernel(vcpu->kvm))
+ return -ENXIO;
+
+ kvm_queue_interrupt(vcpu, irq->irq, false);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
+{
+ kvm_inject_nmi(vcpu);
+
+ return 0;
+}
+
+static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
+ struct kvm_tpr_access_ctl *tac)
+{
+ if (tac->flags)
+ return -EINVAL;
+ vcpu->arch.tpr_access_reporting = !!tac->enabled;
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
+ u64 mcg_cap)
+{
+ int r;
+ unsigned bank_num = mcg_cap & 0xff, bank;
+
+ r = -EINVAL;
+ if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
+ goto out;
+ if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
+ goto out;
+ r = 0;
+ vcpu->arch.mcg_cap = mcg_cap;
+ /* Init IA32_MCG_CTL to all 1s */
+ if (mcg_cap & MCG_CTL_P)
+ vcpu->arch.mcg_ctl = ~(u64)0;
+ /* Init IA32_MCi_CTL to all 1s */
+ for (bank = 0; bank < bank_num; bank++)
+ vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+out:
+ return r;
+}
+
+static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
+ struct kvm_x86_mce *mce)
+{
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+ u64 *banks = vcpu->arch.mce_banks;
+
+ if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
+ return -EINVAL;
+ /*
+ * if IA32_MCG_CTL is not all 1s, the uncorrected error
+ * reporting is disabled
+ */
+ if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
+ vcpu->arch.mcg_ctl != ~(u64)0)
+ return 0;
+ banks += 4 * mce->bank;
+ /*
+ * if IA32_MCi_CTL is not all 1s, the uncorrected error
+ * reporting is disabled for the bank
+ */
+ if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
+ return 0;
+ if (mce->status & MCI_STATUS_UC) {
+ if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
+ !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
+ printk(KERN_DEBUG "kvm: set_mce: "
+ "injects mce exception while "
+ "previous one is in progress!\n");
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return 0;
+ }
+ if (banks[1] & MCI_STATUS_VAL)
+ mce->status |= MCI_STATUS_OVER;
+ banks[2] = mce->addr;
+ banks[3] = mce->misc;
+ vcpu->arch.mcg_status = mce->mcg_status;
+ banks[1] = mce->status;
+ kvm_queue_exception(vcpu, MC_VECTOR);
+ } else if (!(banks[1] & MCI_STATUS_VAL)
+ || !(banks[1] & MCI_STATUS_UC)) {
+ if (banks[1] & MCI_STATUS_VAL)
+ mce->status |= MCI_STATUS_OVER;
+ banks[2] = mce->addr;
+ banks[3] = mce->misc;
+ banks[1] = mce->status;
+ } else
+ banks[1] |= MCI_STATUS_OVER;
+ return 0;
+}
+
+static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu_events *events)
+{
+ events->exception.injected =
+ vcpu->arch.exception.pending &&
+ !kvm_exception_is_soft(vcpu->arch.exception.nr);
+ events->exception.nr = vcpu->arch.exception.nr;
+ events->exception.has_error_code = vcpu->arch.exception.has_error_code;
+ events->exception.pad = 0;
+ events->exception.error_code = vcpu->arch.exception.error_code;
+
+ events->interrupt.injected =
+ vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
+ events->interrupt.nr = vcpu->arch.interrupt.nr;
+ events->interrupt.soft = 0;
+ events->interrupt.shadow =
+ kvm_x86_ops->get_interrupt_shadow(vcpu,
+ KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
+
+ events->nmi.injected = vcpu->arch.nmi_injected;
+ events->nmi.pending = vcpu->arch.nmi_pending;
+ events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
+ events->nmi.pad = 0;
+
+ events->sipi_vector = vcpu->arch.sipi_vector;
+
+ events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
+ | KVM_VCPUEVENT_VALID_SIPI_VECTOR
+ | KVM_VCPUEVENT_VALID_SHADOW);
+ memset(&events->reserved, 0, sizeof(events->reserved));
+}
+
+static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu_events *events)
+{
+ if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
+ | KVM_VCPUEVENT_VALID_SIPI_VECTOR
+ | KVM_VCPUEVENT_VALID_SHADOW))
+ return -EINVAL;
+
+ vcpu->arch.exception.pending = events->exception.injected;
+ vcpu->arch.exception.nr = events->exception.nr;
+ vcpu->arch.exception.has_error_code = events->exception.has_error_code;
+ vcpu->arch.exception.error_code = events->exception.error_code;
+
+ vcpu->arch.interrupt.pending = events->interrupt.injected;
+ vcpu->arch.interrupt.nr = events->interrupt.nr;
+ vcpu->arch.interrupt.soft = events->interrupt.soft;
+ if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
+ kvm_pic_clear_isr_ack(vcpu->kvm);
+ if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
+ kvm_x86_ops->set_interrupt_shadow(vcpu,
+ events->interrupt.shadow);
+
+ vcpu->arch.nmi_injected = events->nmi.injected;
+ if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
+ vcpu->arch.nmi_pending = events->nmi.pending;
+ kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
+
+ if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
+ vcpu->arch.sipi_vector = events->sipi_vector;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ return 0;
+}
+
+static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
+ struct kvm_debugregs *dbgregs)
+{
+ memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
+ dbgregs->dr6 = vcpu->arch.dr6;
+ dbgregs->dr7 = vcpu->arch.dr7;
+ dbgregs->flags = 0;
+ memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
+}
+
+static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
+ struct kvm_debugregs *dbgregs)
+{
+ if (dbgregs->flags)
+ return -EINVAL;
+
+ memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
+ vcpu->arch.dr6 = dbgregs->dr6;
+ vcpu->arch.dr7 = dbgregs->dr7;
+
+ return 0;
+}
+
+static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+ struct kvm_xsave *guest_xsave)
+{
+ if (kvm_cpu_has_xsave)
+ memcpy(guest_xsave->region,
+ &vcpu->arch.guest_fpu.state->xsave,
+ kvm_xstate_size);
+ else {
+ memcpy(guest_xsave->region,
+ &vcpu->arch.guest_fpu.state->fxsave,
+ sizeof(struct kvm_i387_fxsave_struct));
+ *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
+ XSTATE_FPSSE;
+ }
+}
+
+static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
+ struct kvm_xsave *guest_xsave)
+{
+ u64 xstate_bv =
+ *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
+
+ if (kvm_cpu_has_xsave)
+ memcpy(&vcpu->arch.guest_fpu.state->xsave,
+ guest_xsave->region, kvm_xstate_size);
+ else {
+ if (xstate_bv & ~XSTATE_FPSSE)
+ return -EINVAL;
+ memcpy(&vcpu->arch.guest_fpu.state->fxsave,
+ guest_xsave->region, sizeof(struct kvm_i387_fxsave_struct));
+ }
+ return 0;
+}
+
+static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
+ struct kvm_xcrs *guest_xcrs)
+{
+ if (!kvm_cpu_has_xsave) {
+ guest_xcrs->nr_xcrs = 0;
+ return;
+ }
+
+ guest_xcrs->nr_xcrs = 1;
+ guest_xcrs->flags = 0;
+ guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
+ guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
+}
+
+static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
+ struct kvm_xcrs *guest_xcrs)
+{
+ int i, r = 0;
+
+ if (!kvm_cpu_has_xsave)
+ return -EINVAL;
+
+ if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
+ return -EINVAL;
+
+ for (i = 0; i < guest_xcrs->nr_xcrs; i++)
+ /* Only support XCR0 currently */
+ if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) {
+ r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
+ guest_xcrs->xcrs[0].value);
+ break;
+ }
+ if (r)
+ r = -EINVAL;
+ return r;
+}
+
+long kvm_arch_vcpu_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ struct kvm_vcpu *vcpu = filp->private_data;
+ void *argp = (void *)arg;
+ int r;
+ union {
+ struct kvm_lapic_state *lapic;
+ struct kvm_xsave *xsave;
+ struct kvm_xcrs *xcrs;
+ void *buffer;
+ } u;
+
+ u.buffer = NULL;
+ switch (ioctl) {
+ case KVM_GET_LAPIC: {
+ r = -EINVAL;
+ if (!vcpu->arch.apic)
+ goto out;
+ u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+
+ r = -ENOMEM;
+ if (!u.lapic)
+ goto out;
+ r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_LAPIC: {
+ r = -EINVAL;
+ if (!vcpu->arch.apic)
+ goto out;
+ u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.lapic)
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state)))
+ goto out;
+ r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_INTERRUPT: {
+ struct kvm_interrupt irq;
+
+ r = -EFAULT;
+ if (copy_from_user(&irq, argp, sizeof irq))
+ goto out;
+ r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_NMI: {
+ r = kvm_vcpu_ioctl_nmi(vcpu);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_CPUID: {
+ struct kvm_cpuid *cpuid_arg = argp;
+ struct kvm_cpuid cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ goto out;
+ r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_SET_CPUID2: {
+ struct kvm_cpuid2 *cpuid_arg = argp;
+ struct kvm_cpuid2 cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ goto out;
+ r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
+ cpuid_arg->entries);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_GET_CPUID2: {
+ struct kvm_cpuid2 *cpuid_arg = argp;
+ struct kvm_cpuid2 cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ goto out;
+ r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
+ cpuid_arg->entries);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_GET_MSRS:
+ r = msr_io(vcpu, argp, kvm_get_msr, 1);
+ break;
+ case KVM_SET_MSRS:
+ r = msr_io(vcpu, argp, do_set_msr, 0);
+ break;
+ case KVM_TPR_ACCESS_REPORTING: {
+ struct kvm_tpr_access_ctl tac;
+
+ r = -EFAULT;
+ if (copy_from_user(&tac, argp, sizeof tac))
+ goto out;
+ r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &tac, sizeof tac))
+ goto out;
+ r = 0;
+ break;
+ };
+ case KVM_SET_VAPIC_ADDR: {
+ struct kvm_vapic_addr va;
+
+ r = -EINVAL;
+ if (!irqchip_in_kernel(vcpu->kvm))
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(&va, argp, sizeof va))
+ goto out;
+ r = 0;
+ kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
+ break;
+ }
+ case KVM_X86_SETUP_MCE: {
+ u64 mcg_cap;
+
+ r = -EFAULT;
+ if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
+ goto out;
+ r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
+ break;
+ }
+ case KVM_X86_SET_MCE: {
+ struct kvm_x86_mce mce;
+
+ r = -EFAULT;
+ if (copy_from_user(&mce, argp, sizeof mce))
+ goto out;
+ r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
+ break;
+ }
+ case KVM_GET_VCPU_EVENTS: {
+ struct kvm_vcpu_events events;
+
+ kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_VCPU_EVENTS: {
+ struct kvm_vcpu_events events;
+
+ r = -EFAULT;
+ if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
+ break;
+
+ r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
+ break;
+ }
+ case KVM_GET_DEBUGREGS: {
+ struct kvm_debugregs dbgregs;
+
+ kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, &dbgregs,
+ sizeof(struct kvm_debugregs)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_DEBUGREGS: {
+ struct kvm_debugregs dbgregs;
+
+ r = -EFAULT;
+ if (copy_from_user(&dbgregs, argp,
+ sizeof(struct kvm_debugregs)))
+ break;
+
+ r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
+ break;
+ }
+ case KVM_GET_XSAVE: {
+ u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.xsave)
+ break;
+
+ kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_XSAVE: {
+ u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.xsave)
+ break;
+
+ r = -EFAULT;
+ if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave)))
+ break;
+
+ r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
+ break;
+ }
+ case KVM_GET_XCRS: {
+ u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.xcrs)
+ break;
+
+ kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, u.xcrs,
+ sizeof(struct kvm_xcrs)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_XCRS: {
+ u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.xcrs)
+ break;
+
+ r = -EFAULT;
+ if (copy_from_user(u.xcrs, argp,
+ sizeof(struct kvm_xcrs)))
+ break;
+
+ r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
+ break;
+ }
+ default:
+ r = -EINVAL;
+ }
+out:
+ kfree(u.buffer);
+ return r;
+}
+
+static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
+{
+ int ret;
+
+ if (addr > (unsigned int)(-3 * PAGE_SIZE))
+ return -1;
+ ret = kvm_x86_ops->set_tss_addr(kvm, addr);
+ return ret;
+}
+
+static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
+ u64 ident_addr)
+{
+ kvm->arch.ept_identity_map_addr = ident_addr;
+ return 0;
+}
+
+static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
+ u32 kvm_nr_mmu_pages)
+{
+ if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
+ return -EINVAL;
+
+ mutex_lock(&kvm->slots_lock);
+ spin_lock(&kvm->mmu_lock);
+
+ kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
+ kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
+
+ spin_unlock(&kvm->mmu_lock);
+ mutex_unlock(&kvm->slots_lock);
+ return 0;
+}
+
+static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
+{
+ return kvm->arch.n_max_mmu_pages;
+}
+
+static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+ int r;
+
+ r = 0;
+ switch (chip->chip_id) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ memcpy(&chip->chip.pic,
+ &pic_irqchip(kvm)->pics[0],
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ memcpy(&chip->chip.pic,
+ &pic_irqchip(kvm)->pics[1],
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ return r;
+}
+
+static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+ int r;
+
+ r = 0;
+ switch (chip->chip_id) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ spin_lock(&pic_irqchip(kvm)->lock);
+ memcpy(&pic_irqchip(kvm)->pics[0],
+ &chip->chip.pic,
+ sizeof(struct kvm_pic_state));
+ spin_unlock(&pic_irqchip(kvm)->lock);
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ spin_lock(&pic_irqchip(kvm)->lock);
+ memcpy(&pic_irqchip(kvm)->pics[1],
+ &chip->chip.pic,
+ sizeof(struct kvm_pic_state));
+ spin_unlock(&pic_irqchip(kvm)->lock);
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ kvm_pic_update_irq(pic_irqchip(kvm));
+ return r;
+}
+
+static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ int r = 0;
+
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
+ memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ return r;
+}
+
+static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ int r = 0;
+
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
+ memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
+ kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ return r;
+}
+
+static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ int r = 0;
+
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
+ memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
+ sizeof(ps->channels));
+ ps->flags = kvm->arch.vpit->pit_state.flags;
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ memset(&ps->reserved, 0, sizeof(ps->reserved));
+ return r;
+}
+
+static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ int r = 0, start = 0;
+ u32 prev_legacy, cur_legacy;
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
+ prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ if (!prev_legacy && cur_legacy)
+ start = 1;
+ memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
+ sizeof(kvm->arch.vpit->pit_state.channels));
+ kvm->arch.vpit->pit_state.flags = ps->flags;
+ kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ return r;
+}
+
+static int kvm_vm_ioctl_reinject(struct kvm *kvm,
+ struct kvm_reinject_control *control)
+{
+ if (!kvm->arch.vpit)
+ return -ENXIO;
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
+ kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ return 0;
+}
+
+/*
+ * Get (and clear) the dirty memory log for a memory slot.
+ */
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+ struct kvm_dirty_log *log)
+{
+ int r, i;
+ struct kvm_memory_slot *memslot;
+ unsigned long n;
+ unsigned long is_dirty = 0;
+
+ mutex_lock(&kvm->slots_lock);
+
+ r = -EINVAL;
+ if (log->slot >= KVM_MEMORY_SLOTS)
+ goto out;
+
+ memslot = &kvm->memslots->memslots[log->slot];
+ r = -ENOENT;
+ if (!memslot->dirty_bitmap)
+ goto out;
+
+ n = kvm_dirty_bitmap_bytes(memslot);
+
+ for (i = 0; !is_dirty && i < n/sizeof(long); i++)
+ is_dirty = memslot->dirty_bitmap[i];
+
+ /* If nothing is dirty, don't bother messing with page tables. */
+ if (is_dirty) {
+ struct kvm_memslots *slots, *old_slots;
+ unsigned long *dirty_bitmap;
+
+ dirty_bitmap = memslot->dirty_bitmap_head;
+ if (memslot->dirty_bitmap == dirty_bitmap)
+ dirty_bitmap += n / sizeof(long);
+ memset(dirty_bitmap, 0, n);
+
+ r = -ENOMEM;
+ slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+ if (!slots)
+ goto out;
+ memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
+ slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
+ slots->generation++;
+
+ old_slots = kvm->memslots;
+ rcu_assign_pointer(kvm->memslots, slots);
+ kvm_synchronize_srcu_expedited(&kvm->srcu);
+ dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
+ kfree(old_slots);
+
+ spin_lock(&kvm->mmu_lock);
+ kvm_mmu_slot_remove_write_access(kvm, log->slot);
+ spin_unlock(&kvm->mmu_lock);
+
+ r = -EFAULT;
+ if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
+ goto out;
+ } else {
+ r = -EFAULT;
+ if (clear_user(log->dirty_bitmap, n))
+ goto out;
+ }
+
+ r = 0;
+out:
+ mutex_unlock(&kvm->slots_lock);
+ return r;
+}
+
+long kvm_arch_vm_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ struct kvm *kvm = filp->private_data;
+ void *argp = (void *)arg;
+ int r = -ENOTTY;
+ /*
+ * This union makes it completely explicit to gcc-3.x
+ * that these two variables' stack usage should be
+ * combined, not added together.
+ */
+ union {
+ struct kvm_pit_state ps;
+ struct kvm_pit_state2 ps2;
+ struct kvm_pit_config pit_config;
+ } u;
+
+ switch (ioctl) {
+ case KVM_SET_TSS_ADDR:
+ r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
+ if (r < 0)
+ goto out;
+ break;
+ case KVM_SET_IDENTITY_MAP_ADDR: {
+ u64 ident_addr;
+
+ r = -EFAULT;
+ if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
+ goto out;
+ r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
+ if (r < 0)
+ goto out;
+ break;
+ }
+ case KVM_SET_NR_MMU_PAGES:
+ r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
+ if (r)
+ goto out;
+ break;
+ case KVM_GET_NR_MMU_PAGES:
+ r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
+ break;
+ case KVM_CREATE_IRQCHIP: {
+ struct kvm_pic *vpic;
+
+ mutex_lock(&kvm->lock);
+ r = -EEXIST;
+ if (kvm->arch.vpic)
+ goto create_irqchip_unlock;
+ r = -ENOMEM;
+ vpic = kvm_create_pic(kvm);
+ if (vpic) {
+ r = kvm_ioapic_init(kvm);
+ if (r) {
+ mutex_lock(&kvm->slots_lock);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
+ &vpic->dev);
+ mutex_unlock(&kvm->slots_lock);
+ kfree(vpic);
+ goto create_irqchip_unlock;
+ }
+ } else
+ goto create_irqchip_unlock;
+ smp_wmb();
+ kvm->arch.vpic = vpic;
+ smp_wmb();
+ r = kvm_setup_default_irq_routing(kvm);
+ if (r) {
+ mutex_lock(&kvm->slots_lock);
+ mutex_lock(&kvm->irq_lock);
+ kvm_ioapic_destroy(kvm);
+ kvm_destroy_pic(kvm);
+ mutex_unlock(&kvm->irq_lock);
+ mutex_unlock(&kvm->slots_lock);
+ }
+ create_irqchip_unlock:
+ mutex_unlock(&kvm->lock);
+ break;
+ }
+ case KVM_CREATE_PIT:
+ u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
+ goto create_pit;
+ case KVM_CREATE_PIT2:
+ r = -EFAULT;
+ if (copy_from_user(&u.pit_config, argp,
+ sizeof(struct kvm_pit_config)))
+ goto out;
+ create_pit:
+ mutex_lock(&kvm->slots_lock);
+ r = -EEXIST;
+ if (kvm->arch.vpit)
+ goto create_pit_unlock;
+ r = -ENOMEM;
+ kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
+ if (kvm->arch.vpit)
+ r = 0;
+ create_pit_unlock:
+ mutex_unlock(&kvm->slots_lock);
+ break;
+ case KVM_IRQ_LINE_STATUS:
+ case KVM_IRQ_LINE: {
+ struct kvm_irq_level irq_event;
+
+ r = -EFAULT;
+ if (copy_from_user(&irq_event, argp, sizeof irq_event))
+ goto out;
+ r = -ENXIO;
+ if (irqchip_in_kernel(kvm)) {
+ __s32 status;
+ status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+ irq_event.irq, irq_event.level);
+ if (ioctl == KVM_IRQ_LINE_STATUS) {
+ r = -EFAULT;
+ irq_event.status = status;
+ if (copy_to_user(argp, &irq_event,
+ sizeof irq_event))
+ goto out;
+ }
+ r = 0;
+ }
+ break;
+ }
+ case KVM_GET_IRQCHIP: {
+ /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
+ struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
+
+ r = -ENOMEM;
+ if (!chip)
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(chip, argp, sizeof *chip))
+ goto get_irqchip_out;
+ r = -ENXIO;
+ if (!irqchip_in_kernel(kvm))
+ goto get_irqchip_out;
+ r = kvm_vm_ioctl_get_irqchip(kvm, chip);
+ if (r)
+ goto get_irqchip_out;
+ r = -EFAULT;
+ if (copy_to_user(argp, chip, sizeof *chip))
+ goto get_irqchip_out;
+ r = 0;
+ get_irqchip_out:
+ kfree(chip);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_SET_IRQCHIP: {
+ /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
+ struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
+
+ r = -ENOMEM;
+ if (!chip)
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(chip, argp, sizeof *chip))
+ goto set_irqchip_out;
+ r = -ENXIO;
+ if (!irqchip_in_kernel(kvm))
+ goto set_irqchip_out;
+ r = kvm_vm_ioctl_set_irqchip(kvm, chip);
+ if (r)
+ goto set_irqchip_out;
+ r = 0;
+ set_irqchip_out:
+ kfree(chip);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_GET_PIT: {
+ r = -EFAULT;
+ if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
+ goto out;
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto out;
+ r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_PIT: {
+ r = -EFAULT;
+ if (copy_from_user(&u.ps, argp, sizeof u.ps))
+ goto out;
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto out;
+ r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_GET_PIT2: {
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto out;
+ r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_PIT2: {
+ r = -EFAULT;
+ if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
+ goto out;
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto out;
+ r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_REINJECT_CONTROL: {
+ struct kvm_reinject_control control;
+ r = -EFAULT;
+ if (copy_from_user(&control, argp, sizeof(control)))
+ goto out;
+ r = kvm_vm_ioctl_reinject(kvm, &control);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_XEN_HVM_CONFIG: {
+ r = -EFAULT;
+ if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
+ sizeof(struct kvm_xen_hvm_config)))
+ goto out;
+ r = -EINVAL;
+ if (kvm->arch.xen_hvm_config.flags)
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_CLOCK: {
+ struct kvm_clock_data user_ns;
+ u64 now_ns;
+ s64 delta;
+
+ r = -EFAULT;
+ if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
+ goto out;
+
+ r = -EINVAL;
+ if (user_ns.flags)
+ goto out;
+
+ r = 0;
+ local_irq_disable();
+ now_ns = get_kernel_ns();
+ delta = user_ns.clock - now_ns;
+ local_irq_enable();
+ kvm->arch.kvmclock_offset = delta;
+ break;
+ }
+ case KVM_GET_CLOCK: {
+ struct kvm_clock_data user_ns;
+ u64 now_ns;
+
+ local_irq_disable();
+ now_ns = get_kernel_ns();
+ user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
+ local_irq_enable();
+ user_ns.flags = 0;
+ memset(&user_ns.pad, 0, sizeof(user_ns.pad));
+
+ r = -EFAULT;
+ if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
+ goto out;
+ r = 0;
+ break;
+ }
+
+ default:
+ ;
+ }
+out:
+ return r;
+}
+
+static void kvm_init_msr_list(void)
+{
+ u32 dummy[2];
+ unsigned i, j;
+
+ /* skip the first msrs in the list. KVM-specific */
+ for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
+ if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
+ continue;
+ if (j < i)
+ msrs_to_save[j] = msrs_to_save[i];
+ j++;
+ }
+ num_msrs_to_save = j;
+}
+
+static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
+ const void *v)
+{
+ if (vcpu->arch.apic &&
+ !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
+ return 0;
+
+ return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
+}
+
+static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
+{
+ if (vcpu->arch.apic &&
+ !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
+ return 0;
+
+ return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
+}
+
+static void kvm_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ kvm_x86_ops->set_segment(vcpu, var, seg);
+}
+
+void kvm_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ kvm_x86_ops->get_segment(vcpu, var, seg);
+}
+
+static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
+{
+ return gpa;
+}
+
+static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
+{
+ gpa_t t_gpa;
+ struct x86_exception exception;
+
+ BUG_ON(!mmu_is_nested(vcpu));
+
+ /* NPT walks are always user-walks */
+ access |= PFERR_USER_MASK;
+ t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
+
+ return t_gpa;
+}
+
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+}
+
+ gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ access |= PFERR_FETCH_MASK;
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+}
+
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ access |= PFERR_WRITE_MASK;
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+}
+
+/* uses this to access any guest's mapped memory without checking CPL */
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
+{
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
+}
+
+static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, u32 access,
+ struct x86_exception *exception)
+{
+ void *data = val;
+ int r = X86EMUL_CONTINUE;
+
+ while (bytes) {
+ gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
+ exception);
+ unsigned offset = addr & (PAGE_SIZE-1);
+ unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
+ int ret;
+
+ if (gpa == UNMAPPED_GVA)
+ return X86EMUL_PROPAGATE_FAULT;
+ ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
+ if (ret < 0) {
+ r = X86EMUL_IO_NEEDED;
+ goto out;
+ }
+
+ bytes -= toread;
+ data += toread;
+ addr += toread;
+ }
+out:
+ return r;
+}
+
+/* used for instruction fetching */
+static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu,
+ struct x86_exception *exception)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
+ access | PFERR_FETCH_MASK,
+ exception);
+}
+
+static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu,
+ struct x86_exception *exception)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
+ exception);
+}
+
+static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu,
+ struct x86_exception *exception)
+{
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
+}
+
+static int kvm_write_guest_virt_system(gva_t addr, void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu,
+ struct x86_exception *exception)
+{
+ void *data = val;
+ int r = X86EMUL_CONTINUE;
+
+ while (bytes) {
+ gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
+ PFERR_WRITE_MASK,
+ exception);
+ unsigned offset = addr & (PAGE_SIZE-1);
+ unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
+ int ret;
+
+ if (gpa == UNMAPPED_GVA)
+ return X86EMUL_PROPAGATE_FAULT;
+ ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
+ if (ret < 0) {
+ r = X86EMUL_IO_NEEDED;
+ goto out;
+ }
+
+ bytes -= towrite;
+ data += towrite;
+ addr += towrite;
+ }
+out:
+ return r;
+}
+
+static int emulator_read_emulated(unsigned long addr,
+ void *val,
+ unsigned int bytes,
+ struct x86_exception *exception,
+ struct kvm_vcpu *vcpu)
+{
+ gpa_t gpa;
+
+ if (vcpu->mmio_read_completed) {
+ memcpy(val, vcpu->mmio_data, bytes);
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
+ vcpu->mmio_phys_addr, *(u64 *)val);
+ vcpu->mmio_read_completed = 0;
+ return X86EMUL_CONTINUE;
+ }
+
+ gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception);
+
+ if (gpa == UNMAPPED_GVA)
+ return X86EMUL_PROPAGATE_FAULT;
+
+ /* For APIC access vmexit */
+ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ goto mmio;
+
+ if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception)
+ == X86EMUL_CONTINUE)
+ return X86EMUL_CONTINUE;
+
+mmio:
+ /*
+ * Is this MMIO handled locally?
+ */
+ if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) {
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val);
+ return X86EMUL_CONTINUE;
+ }
+
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
+
+ vcpu->mmio_needed = 1;
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+ vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
+ vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+ vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
+
+ return X86EMUL_IO_NEEDED;
+}
+
+int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const void *val, int bytes)
+{
+ int ret;
+
+ ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
+ if (ret < 0)
+ return 0;
+ kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
+ return 1;
+}
+
+static int emulator_write_emulated_onepage(unsigned long addr,
+ const void *val,
+ unsigned int bytes,
+ struct x86_exception *exception,
+ struct kvm_vcpu *vcpu)
+{
+ gpa_t gpa;
+
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
+
+ if (gpa == UNMAPPED_GVA)
+ return X86EMUL_PROPAGATE_FAULT;
+
+ /* For APIC access vmexit */
+ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ goto mmio;
+
+ if (emulator_write_phys(vcpu, gpa, val, bytes))
+ return X86EMUL_CONTINUE;
+
+mmio:
+ trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
+ /*
+ * Is this MMIO handled locally?
+ */
+ if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
+ return X86EMUL_CONTINUE;
+
+ vcpu->mmio_needed = 1;
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+ vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
+ vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+ vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
+ memcpy(vcpu->run->mmio.data, val, bytes);
+
+ return X86EMUL_CONTINUE;
+}
+
+int emulator_write_emulated(unsigned long addr,
+ const void *val,
+ unsigned int bytes,
+ struct x86_exception *exception,
+ struct kvm_vcpu *vcpu)
+{
+ /* Crossing a page boundary? */
+ if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
+ int rc, now;
+
+ now = -addr & ~PAGE_MASK;
+ rc = emulator_write_emulated_onepage(addr, val, now, exception,
+ vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ addr += now;
+ val += now;
+ bytes -= now;
+ }
+ return emulator_write_emulated_onepage(addr, val, bytes, exception,
+ vcpu);
+}
+
+#define CMPXCHG_TYPE(t, ptr, old, new) \
+ (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
+
+#ifdef CONFIG_X86_64
+# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
+#else
+# define CMPXCHG64(ptr, old, new) \
+ (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
+#endif
+
+static int emulator_cmpxchg_emulated(unsigned long addr,
+ const void *old,
+ const void *new,
+ unsigned int bytes,
+ struct x86_exception *exception,
+ struct kvm_vcpu *vcpu)
+{
+ gpa_t gpa;
+ struct page *page;
+ char *kaddr;
+ bool exchanged;
+
+ /* guests cmpxchg8b have to be emulated atomically */
+ if (bytes > 8 || (bytes & (bytes - 1)))
+ goto emul_write;
+
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
+
+ if (gpa == UNMAPPED_GVA ||
+ (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ goto emul_write;
+
+ if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
+ goto emul_write;
+
+ page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ if (is_error_page(page)) {
+ kvm_release_page_clean(page);
+ goto emul_write;
+ }
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ kaddr += offset_in_page(gpa);
+ switch (bytes) {
+ case 1:
+ exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
+ break;
+ case 2:
+ exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
+ break;
+ case 4:
+ exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
+ break;
+ case 8:
+ exchanged = CMPXCHG64(kaddr, old, new);
+ break;
+ default:
+ BUG();
+ }
+ kunmap_atomic(kaddr, KM_USER0);
+ kvm_release_page_dirty(page);
+
+ if (!exchanged)
+ return X86EMUL_CMPXCHG_FAILED;
+
+ kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1);
+
+ return X86EMUL_CONTINUE;
+
+emul_write:
+ printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
+
+ return emulator_write_emulated(addr, new, bytes, exception, vcpu);
+}
+
+static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
+{
+ /* TODO: String I/O for in kernel device */
+ int r;
+
+ if (vcpu->arch.pio.in)
+ r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
+ vcpu->arch.pio.size, pd);
+ else
+ r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
+ vcpu->arch.pio.port, vcpu->arch.pio.size,
+ pd);
+ return r;
+}
+
+
+static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
+ unsigned int count, struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.pio.count)
+ goto data_avail;
+
+ trace_kvm_pio(0, port, size, count);
+
+ vcpu->arch.pio.port = port;
+ vcpu->arch.pio.in = 1;
+ vcpu->arch.pio.count = count;
+ vcpu->arch.pio.size = size;
+
+ if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
+ data_avail:
+ memcpy(val, vcpu->arch.pio_data, size * count);
+ vcpu->arch.pio.count = 0;
+ return 1;
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_IO;
+ vcpu->run->io.direction = KVM_EXIT_IO_IN;
+ vcpu->run->io.size = size;
+ vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+ vcpu->run->io.count = count;
+ vcpu->run->io.port = port;
+
+ return 0;
+}
+
+static int emulator_pio_out_emulated(int size, unsigned short port,
+ const void *val, unsigned int count,
+ struct kvm_vcpu *vcpu)
+{
+ trace_kvm_pio(1, port, size, count);
+
+ vcpu->arch.pio.port = port;
+ vcpu->arch.pio.in = 0;
+ vcpu->arch.pio.count = count;
+ vcpu->arch.pio.size = size;
+
+ memcpy(vcpu->arch.pio_data, val, size * count);
+
+ if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
+ vcpu->arch.pio.count = 0;
+ return 1;
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_IO;
+ vcpu->run->io.direction = KVM_EXIT_IO_OUT;
+ vcpu->run->io.size = size;
+ vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+ vcpu->run->io.count = count;
+ vcpu->run->io.port = port;
+
+ return 0;
+}
+
+static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ return kvm_x86_ops->get_segment_base(vcpu, seg);
+}
+
+int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
+{
+ kvm_mmu_invlpg(vcpu, address);
+ return X86EMUL_CONTINUE;
+}
+
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+ if (!need_emulate_wbinvd(vcpu))
+ return X86EMUL_CONTINUE;
+
+ if (kvm_x86_ops->has_wbinvd_exit()) {
+ int cpu = get_cpu();
+
+ cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
+ smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
+ wbinvd_ipi, NULL, 1);
+ put_cpu();
+ cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
+ } else
+ wbinvd();
+ return X86EMUL_CONTINUE;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
+
+int emulate_clts(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+ kvm_x86_ops->fpu_activate(vcpu);
+ return X86EMUL_CONTINUE;
+}
+
+int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu)
+{
+ return _kvm_get_dr(vcpu, dr, dest);
+}
+
+int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu)
+{
+
+ return __kvm_set_dr(vcpu, dr, value);
+}
+
+static u64 mk_cr_64(u64 curr_cr, u32 new_val)
+{
+ return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
+}
+
+static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
+{
+ unsigned long value;
+
+ switch (cr) {
+ case 0:
+ value = kvm_read_cr0(vcpu);
+ break;
+ case 2:
+ value = vcpu->arch.cr2;
+ break;
+ case 3:
+ value = kvm_read_cr3(vcpu);
+ break;
+ case 4:
+ value = kvm_read_cr4(vcpu);
+ break;
+ case 8:
+ value = kvm_get_cr8(vcpu);
+ break;
+ default:
+ vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+ return 0;
+ }
+
+ return value;
+}
+
+static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
+{
+ int res = 0;
+
+ switch (cr) {
+ case 0:
+ res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
+ break;
+ case 2:
+ vcpu->arch.cr2 = val;
+ break;
+ case 3:
+ res = kvm_set_cr3(vcpu, val);
+ break;
+ case 4:
+ res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
+ break;
+ case 8:
+ res = kvm_set_cr8(vcpu, val);
+ break;
+ default:
+ vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+ res = -1;
+ }
+
+ return res;
+}
+
+static int emulator_get_cpl(struct kvm_vcpu *vcpu)
+{
+ return kvm_x86_ops->get_cpl(vcpu);
+}
+
+static void emulator_get_gdt(struct kvm_desc_ptr *dt, struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops->get_gdt(vcpu, dt);
+}
+
+static void emulator_get_idt(struct kvm_desc_ptr *dt, struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops->get_idt(vcpu, dt);
+}
+
+static unsigned long emulator_get_cached_segment_base(int seg,
+ struct kvm_vcpu *vcpu)
+{
+ return get_segment_base(vcpu, seg);
+}
+
+static bool emulator_get_cached_descriptor(struct kvm_desc_struct *desc, int seg,
+ struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment var;
+
+ kvm_get_segment(vcpu, &var, seg);
+
+ if (var.unusable)
+ return false;
+
+ if (var.g)
+ var.limit >>= 12;
+ kvm_set_desc_limit(desc, var.limit);
+ kvm_set_desc_base(desc, (unsigned long)var.base);
+ desc->type = var.type;
+ desc->s = var.s;
+ desc->dpl = var.dpl;
+ desc->p = var.present;
+ desc->avl = var.avl;
+ desc->l = var.l;
+ desc->d = var.db;
+ desc->g = var.g;
+
+ return true;
+}
+
+static void emulator_set_cached_descriptor(struct kvm_desc_struct *desc, int seg,
+ struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment var;
+
+ /* needed to preserve selector */
+ kvm_get_segment(vcpu, &var, seg);
+
+ var.base = kvm_get_desc_base(desc);
+ var.limit = kvm_get_desc_limit(desc);
+ if (desc->g)
+ var.limit = (var.limit << 12) | 0xfff;
+ var.type = desc->type;
+ var.present = desc->p;
+ var.dpl = desc->dpl;
+ var.db = desc->d;
+ var.s = desc->s;
+ var.l = desc->l;
+ var.g = desc->g;
+ var.avl = desc->avl;
+ var.present = desc->p;
+ var.unusable = !var.present;
+ var.padding = 0;
+
+ kvm_set_segment(vcpu, &var, seg);
+ return;
+}
+
+static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment kvm_seg;
+
+ kvm_get_segment(vcpu, &kvm_seg, seg);
+ return kvm_seg.selector;
+}
+
+static void emulator_set_segment_selector(u16 sel, int seg,
+ struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment kvm_seg;
+
+ kvm_get_segment(vcpu, &kvm_seg, seg);
+ kvm_seg.selector = sel;
+ kvm_set_segment(vcpu, &kvm_seg, seg);
+}
+
+static struct x86_emulate_ops emulate_ops = {
+ .read_std = kvm_read_guest_virt_system,
+ .write_std = kvm_write_guest_virt_system,
+ .fetch = kvm_fetch_guest_virt,
+ .read_emulated = emulator_read_emulated,
+ .write_emulated = emulator_write_emulated,
+ .cmpxchg_emulated = emulator_cmpxchg_emulated,
+ .pio_in_emulated = emulator_pio_in_emulated,
+ .pio_out_emulated = emulator_pio_out_emulated,
+ .get_cached_descriptor = emulator_get_cached_descriptor,
+ .set_cached_descriptor = emulator_set_cached_descriptor,
+ .get_segment_selector = emulator_get_segment_selector,
+ .set_segment_selector = emulator_set_segment_selector,
+ .get_cached_segment_base = emulator_get_cached_segment_base,
+ .get_gdt = emulator_get_gdt,
+ .get_idt = emulator_get_idt,
+ .get_cr = emulator_get_cr,
+ .set_cr = emulator_set_cr,
+ .cpl = emulator_get_cpl,
+ .get_dr = emulator_get_dr,
+ .set_dr = emulator_set_dr,
+ .set_msr = kvm_set_msr,
+ .get_msr = kvm_get_msr,
+};
+
+static void cache_all_regs(struct kvm_vcpu *vcpu)
+{
+ kvm_register_read(vcpu, VCPU_REGS_RAX);
+ kvm_register_read(vcpu, VCPU_REGS_RSP);
+ kvm_register_read(vcpu, VCPU_REGS_RIP);
+ vcpu->arch.regs_dirty = ~0;
+}
+
+static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
+{
+ u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
+ /*
+ * an sti; sti; sequence only disable interrupts for the first
+ * instruction. So, if the last instruction, be it emulated or
+ * not, left the system with the INT_STI flag enabled, it
+ * means that the last instruction is an sti. We should not
+ * leave the flag on in this case. The same goes for mov ss
+ */
+ if (!(int_shadow & mask))
+ kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
+}
+
+static void inject_emulated_exception(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ if (ctxt->exception.vector == PF_VECTOR)
+ kvm_propagate_fault(vcpu, &ctxt->exception);
+ else if (ctxt->exception.error_code_valid)
+ kvm_queue_exception_e(vcpu, ctxt->exception.vector,
+ ctxt->exception.error_code);
+ else
+ kvm_queue_exception(vcpu, ctxt->exception.vector);
+}
+
+static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
+{
+ struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+ int cs_db, cs_l;
+
+ cache_all_regs(vcpu);
+
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+
+ vcpu->arch.emulate_ctxt.vcpu = vcpu;
+ vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+ vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
+ vcpu->arch.emulate_ctxt.mode =
+ (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
+ (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
+ ? X86EMUL_MODE_VM86 : cs_l
+ ? X86EMUL_MODE_PROT64 : cs_db
+ ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+ memset(c, 0, sizeof(struct decode_cache));
+ memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+}
+
+int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq)
+{
+ struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+ int ret;
+
+ init_emulate_ctxt(vcpu);
+
+ vcpu->arch.emulate_ctxt.decode.op_bytes = 2;
+ vcpu->arch.emulate_ctxt.decode.ad_bytes = 2;
+ vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip;
+ ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq);
+
+ if (ret != X86EMUL_CONTINUE)
+ return EMULATE_FAIL;
+
+ vcpu->arch.emulate_ctxt.eip = c->eip;
+ memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+ kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
+ kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+
+ if (irq == NMI_VECTOR)
+ vcpu->arch.nmi_pending = false;
+ else
+ vcpu->arch.interrupt.pending = false;
+
+ return EMULATE_DONE;
+}
+EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
+
+static int handle_emulation_failure(struct kvm_vcpu *vcpu)
+{
+ int r = EMULATE_DONE;
+
+ ++vcpu->stat.insn_emulation_fail;
+ trace_kvm_emulate_insn_failed(vcpu);
+ if (!is_guest_mode(vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ r = EMULATE_FAIL;
+ }
+ kvm_queue_exception(vcpu, UD_VECTOR);
+
+ return r;
+}
+
+static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ gpa_t gpa;
+
+ if (tdp_enabled)
+ return false;
+
+ /*
+ * if emulation was due to access to shadowed page table
+ * and it failed try to unshadow page and re-entetr the
+ * guest to let CPU execute the instruction.
+ */
+ if (kvm_mmu_unprotect_page_virt(vcpu, gva))
+ return true;
+
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+
+ if (gpa == UNMAPPED_GVA)
+ return true; /* let cpu generate fault */
+
+ if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
+ return true;
+
+ return false;
+}
+
+int x86_emulate_instruction(struct kvm_vcpu *vcpu,
+ unsigned long cr2,
+ int emulation_type,
+ void *insn,
+ int insn_len)
+{
+ int r;
+ struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+
+ kvm_clear_exception_queue(vcpu);
+ vcpu->arch.mmio_fault_cr2 = cr2;
+ /*
+ * TODO: fix emulate.c to use guest_read/write_register
+ * instead of direct ->regs accesses, can save hundred cycles
+ * on Intel for instructions that don't read/change RSP, for
+ * for example.
+ */
+ cache_all_regs(vcpu);
+
+ if (!(emulation_type & EMULTYPE_NO_DECODE)) {
+ init_emulate_ctxt(vcpu);
+ vcpu->arch.emulate_ctxt.interruptibility = 0;
+ vcpu->arch.emulate_ctxt.have_exception = false;
+ vcpu->arch.emulate_ctxt.perm_ok = false;
+
+ r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
+ if (r == X86EMUL_PROPAGATE_FAULT)
+ goto done;
+
+ trace_kvm_emulate_insn_start(vcpu);
+
+ /* Only allow emulation of specific instructions on #UD
+ * (namely VMMCALL, sysenter, sysexit, syscall)*/
+ if (emulation_type & EMULTYPE_TRAP_UD) {
+ if (!c->twobyte)
+ return EMULATE_FAIL;
+ switch (c->b) {
+ case 0x01: /* VMMCALL */
+ if (c->modrm_mod != 3 || c->modrm_rm != 1)
+ return EMULATE_FAIL;
+ break;
+ case 0x34: /* sysenter */
+ case 0x35: /* sysexit */
+ if (c->modrm_mod != 0 || c->modrm_rm != 0)
+ return EMULATE_FAIL;
+ break;
+ case 0x05: /* syscall */
+ if (c->modrm_mod != 0 || c->modrm_rm != 0)
+ return EMULATE_FAIL;
+ break;
+ default:
+ return EMULATE_FAIL;
+ }
+
+ if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
+ return EMULATE_FAIL;
+ }
+
+ ++vcpu->stat.insn_emulation;
+ if (r) {
+ if (reexecute_instruction(vcpu, cr2))
+ return EMULATE_DONE;
+ if (emulation_type & EMULTYPE_SKIP)
+ return EMULATE_FAIL;
+ return handle_emulation_failure(vcpu);
+ }
+ }
+
+ if (emulation_type & EMULTYPE_SKIP) {
+ kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
+ return EMULATE_DONE;
+ }
+
+ /* this is needed for vmware backdor interface to work since it
+ changes registers values during IO operation */
+ memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+
+restart:
+ r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);
+
+ if (r == EMULATION_FAILED) {
+ if (reexecute_instruction(vcpu, cr2))
+ return EMULATE_DONE;
+
+ return handle_emulation_failure(vcpu);
+ }
+
+done:
+ if (vcpu->arch.emulate_ctxt.have_exception) {
+ inject_emulated_exception(vcpu);
+ r = EMULATE_DONE;
+ } else if (vcpu->arch.pio.count) {
+ if (!vcpu->arch.pio.in)
+ vcpu->arch.pio.count = 0;
+ r = EMULATE_DO_MMIO;
+ } else if (vcpu->mmio_needed) {
+ if (vcpu->mmio_is_write)
+ vcpu->mmio_needed = 0;
+ r = EMULATE_DO_MMIO;
+ } else if (r == EMULATION_RESTART)
+ goto restart;
+ else
+ r = EMULATE_DONE;
+
+ toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
+ kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+ kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(x86_emulate_instruction);
+
+int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
+{
+ unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu);
+ /* do not return to emulator after return from userspace */
+ vcpu->arch.pio.count = 0;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
+
+static void tsc_bad(void *info)
+{
+ kvm___this_cpu_write(cpu_tsc_khz, 0);
+}
+
+static void tsc_khz_changed(void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ unsigned long khz = 0;
+
+ if (data)
+ khz = freq->new;
+ else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ khz = cpufreq_quick_get(raw_smp_processor_id());
+ if (!khz)
+ khz = tsc_khz;
+ kvm___this_cpu_write(cpu_tsc_khz, khz);
+}
+
+static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ struct kvm *kvm;
+ struct kvm_vcpu *vcpu;
+ int i, send_ipi = 0;
+
+ /*
+ * We allow guests to temporarily run on slowing clocks,
+ * provided we notify them after, or to run on accelerating
+ * clocks, provided we notify them before. Thus time never
+ * goes backwards.
+ *
+ * However, we have a problem. We can't atomically update
+ * the frequency of a given CPU from this function; it is
+ * merely a notifier, which can be called from any CPU.
+ * Changing the TSC frequency at arbitrary points in time
+ * requires a recomputation of local variables related to
+ * the TSC for each VCPU. We must flag these local variables
+ * to be updated and be sure the update takes place with the
+ * new frequency before any guests proceed.
+ *
+ * Unfortunately, the combination of hotplug CPU and frequency
+ * change creates an intractable locking scenario; the order
+ * of when these callouts happen is undefined with respect to
+ * CPU hotplug, and they can race with each other. As such,
+ * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
+ * undefined; you can actually have a CPU frequency change take
+ * place in between the computation of X and the setting of the
+ * variable. To protect against this problem, all updates of
+ * the per_cpu tsc_khz variable are done in an interrupt
+ * protected IPI, and all callers wishing to update the value
+ * must wait for a synchronous IPI to complete (which is trivial
+ * if the caller is on the CPU already). This establishes the
+ * necessary total order on variable updates.
+ *
+ * Note that because a guest time update may take place
+ * anytime after the setting of the VCPU's request bit, the
+ * correct TSC value must be set before the request. However,
+ * to ensure the update actually makes it to any guest which
+ * starts running in hardware virtualization between the set
+ * and the acquisition of the spinlock, we must also ping the
+ * CPU after setting the request bit.
+ *
+ */
+
+ if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
+ return 0;
+ if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
+ return 0;
+
+ smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
+
+ spin_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (vcpu->cpu != freq->cpu)
+ continue;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ if (vcpu->cpu != smp_processor_id())
+ send_ipi = 1;
+ }
+ }
+ spin_unlock(&kvm_lock);
+
+ if (freq->old < freq->new && send_ipi) {
+ /*
+ * We upscale the frequency. Must make the guest
+ * doesn't see old kvmclock values while running with
+ * the new frequency, otherwise we risk the guest sees
+ * time go backwards.
+ *
+ * In case we update the frequency for another cpu
+ * (which might be in guest context) send an interrupt
+ * to kick the cpu out of guest context. Next time
+ * guest context is entered kvmclock will be updated,
+ * so the guest will not see stale values.
+ */
+ smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
+ }
+ return 0;
+}
+
+static struct notifier_block kvmclock_cpufreq_notifier_block = {
+ .notifier_call = kvmclock_cpufreq_notifier
+};
+
+static int kvmclock_cpu_notifier(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
+ break;
+ case CPU_DOWN_PREPARE:
+ smp_call_function_single(cpu, tsc_bad, NULL, 1);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block kvmclock_cpu_notifier_block = {
+ .notifier_call = kvmclock_cpu_notifier,
+ .priority = -INT_MAX
+};
+
+static void kvm_timer_init(void)
+{
+ int cpu;
+
+ max_tsc_khz = tsc_khz;
+ register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+#ifdef CONFIG_CPU_FREQ
+ struct cpufreq_policy policy;
+ memset(&policy, 0, sizeof(policy));
+ cpu = get_cpu();
+ cpufreq_get_policy(&policy, cpu);
+ if (policy.cpuinfo.max_freq)
+ max_tsc_khz = policy.cpuinfo.max_freq;
+ put_cpu();
+#endif
+ cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ }
+ pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
+ for_each_online_cpu(cpu)
+ smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
+}
+
+static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+
+static int kvm_is_in_guest(void)
+{
+ return percpu_read(current_vcpu) != NULL;
+}
+
+static int kvm_is_user_mode(void)
+{
+ int user_mode = 3;
+
+ if (percpu_read(current_vcpu))
+ user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu));
+
+ return user_mode != 0;
+}
+
+static unsigned long kvm_get_guest_ip(void)
+{
+ unsigned long ip = 0;
+
+ if (percpu_read(current_vcpu))
+ ip = kvm_rip_read(percpu_read(current_vcpu));
+
+ return ip;
+}
+
+static struct perf_guest_info_callbacks kvm_guest_cbs = {
+ .is_in_guest = kvm_is_in_guest,
+ .is_user_mode = kvm_is_user_mode,
+ .get_guest_ip = kvm_get_guest_ip,
+};
+
+void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
+{
+ percpu_write(current_vcpu, vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
+
+void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
+{
+ percpu_write(current_vcpu, NULL);
+}
+EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
+
+int kvm_arch_init(void *opaque)
+{
+ int r;
+ struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
+
+ if (kvm_x86_ops) {
+ printk(KERN_ERR "kvm: already loaded the other module\n");
+ r = -EEXIST;
+ goto out;
+ }
+
+ if (!ops->cpu_has_kvm_support()) {
+ printk(KERN_ERR "kvm: no hardware support\n");
+ r = -EOPNOTSUPP;
+ goto out;
+ }
+ if (ops->disabled_by_bios()) {
+ printk(KERN_ERR "kvm: disabled by bios\n");
+#ifndef KVM_TBOOT_ENABLED_WORKS
+
+ printk(KERN_ERR "kvm: if TXT is enabled in the BIOS, disable it\n");
+#endif
+ r = -EOPNOTSUPP;
+ goto out;
+ }
+
+ r = kvm_mmu_module_init();
+ if (r)
+ goto out;
+
+ kvm_init_msr_list();
+
+ kvm_xstate_size_init();
+
+ kvm_x86_ops = ops;
+ kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
+ kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
+ PT_DIRTY_MASK, PT64_NX_MASK, 0);
+
+ kvm_timer_init();
+
+ perf_register_guest_info_callbacks(&kvm_guest_cbs);
+
+ if (kvm_cpu_has_xsave)
+ host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+
+ return 0;
+
+out:
+ return r;
+}
+
+void kvm_arch_exit(void)
+{
+ perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
+
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
+ kvm_x86_ops = NULL;
+ kvm_mmu_module_exit();
+}
+
+int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.halt_exits;
+ if (irqchip_in_kernel(vcpu->kvm)) {
+ vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
+ return 1;
+ } else {
+ vcpu->run->exit_reason = KVM_EXIT_HLT;
+ return 0;
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_halt);
+
+static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
+ unsigned long a1)
+{
+ if (is_long_mode(vcpu))
+ return a0;
+ else
+ return a0 | ((gpa_t)a1 << 32);
+}
+
+int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
+{
+ u64 param, ingpa, outgpa, ret;
+ uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
+ bool fast, longmode;
+ int cs_db, cs_l;
+
+ /*
+ * hypercall generates UD from non zero cpl and real mode
+ * per HYPER-V spec
+ */
+ if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 0;
+ }
+
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ longmode = is_long_mode(vcpu) && cs_l == 1;
+
+ if (!longmode) {
+ param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
+ (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
+ ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
+ (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
+ outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
+ (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
+ }
+#ifdef CONFIG_X86_64
+ else {
+ param = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
+ }
+#endif
+
+ code = param & 0xffff;
+ fast = (param >> 16) & 0x1;
+ rep_cnt = (param >> 32) & 0xfff;
+ rep_idx = (param >> 48) & 0xfff;
+
+ trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
+
+ switch (code) {
+ case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
+ kvm_vcpu_on_spin(vcpu);
+ break;
+ default:
+ res = HV_STATUS_INVALID_HYPERCALL_CODE;
+ break;
+ }
+
+ ret = res | (((u64)rep_done & 0xfff) << 32);
+ if (longmode) {
+ kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
+ } else {
+ kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
+ }
+
+ return 1;
+}
+
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
+{
+ unsigned long nr, a0, a1, a2, a3, ret;
+ int r = 1;
+
+ if (kvm_hv_hypercall_enabled(vcpu->kvm))
+ return kvm_hv_hypercall(vcpu);
+
+ nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
+ a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
+
+ trace_kvm_hypercall(nr, a0, a1, a2, a3);
+
+ if (!is_long_mode(vcpu)) {
+ nr &= 0xFFFFFFFF;
+ a0 &= 0xFFFFFFFF;
+ a1 &= 0xFFFFFFFF;
+ a2 &= 0xFFFFFFFF;
+ a3 &= 0xFFFFFFFF;
+ }
+
+ if (kvm_x86_ops->get_cpl(vcpu) != 0) {
+ ret = -KVM_EPERM;
+ goto out;
+ }
+
+ switch (nr) {
+ case KVM_HC_VAPIC_POLL_IRQ:
+ ret = 0;
+ break;
+ case KVM_HC_MMU_OP:
+ r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
+ break;
+ default:
+ ret = -KVM_ENOSYS;
+ break;
+ }
+out:
+ kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
+ ++vcpu->stat.hypercalls;
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
+
+int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
+{
+ char instruction[3];
+ unsigned long rip = kvm_rip_read(vcpu);
+
+ /*
+ * Blow out the MMU to ensure that no other VCPU has an active mapping
+ * to ensure that the updated hypercall appears atomically across all
+ * VCPUs.
+ */
+ kvm_mmu_zap_all(vcpu->kvm);
+
+ kvm_x86_ops->patch_hypercall(vcpu, instruction);
+
+ return emulator_write_emulated(rip, instruction, 3, NULL, vcpu);
+}
+
+void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
+{
+ struct kvm_desc_ptr dt = { limit, base };
+
+ kvm_x86_ops->set_gdt(vcpu, &dt);
+}
+
+void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
+{
+ struct kvm_desc_ptr dt = { limit, base };
+
+ kvm_x86_ops->set_idt(vcpu, &dt);
+}
+
+static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
+{
+ struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
+ int j, nent = vcpu->arch.cpuid_nent;
+
+ e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
+ /* when no next entry is found, the current entry[i] is reselected */
+ for (j = i + 1; ; j = (j + 1) % nent) {
+ struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
+ if (ej->function == e->function) {
+ ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+ return j;
+ }
+ }
+ return 0; /* silence gcc, even though control never reaches here */
+}
+
+/* find an entry with matching function, matching index (if needed), and that
+ * should be read next (if it's stateful) */
+static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
+ u32 function, u32 index)
+{
+ if (e->function != function)
+ return 0;
+ if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
+ return 0;
+ if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
+ !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
+ return 0;
+ return 1;
+}
+
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+ u32 function, u32 index)
+{
+ int i;
+ struct kvm_cpuid_entry2 *best = NULL;
+
+ for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+ struct kvm_cpuid_entry2 *e;
+
+ e = &vcpu->arch.cpuid_entries[i];
+ if (is_matching_cpuid_entry(e, function, index)) {
+ if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
+ move_to_next_stateful_cpuid_entry(vcpu, i);
+ best = e;
+ break;
+ }
+ /*
+ * Both basic or both extended?
+ */
+ if (((e->function ^ function) & 0x80000000) == 0)
+ if (!best || e->function > best->function)
+ best = e;
+ }
+ return best;
+}
+EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
+
+int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
+ if (!best || best->eax < 0x80000008)
+ goto not_found;
+ best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
+ if (best)
+ return best->eax & 0xff;
+not_found:
+ return 36;
+}
+
+void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+{
+ u32 function, index;
+ struct kvm_cpuid_entry2 *best;
+
+ function = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ index = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
+ best = kvm_find_cpuid_entry(vcpu, function, index);
+ if (best) {
+ kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
+ }
+ kvm_x86_ops->skip_emulated_instruction(vcpu);
+ trace_kvm_cpuid(function,
+ kvm_register_read(vcpu, VCPU_REGS_RAX),
+ kvm_register_read(vcpu, VCPU_REGS_RBX),
+ kvm_register_read(vcpu, VCPU_REGS_RCX),
+ kvm_register_read(vcpu, VCPU_REGS_RDX));
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
+
+/*
+ * Check if userspace requested an interrupt window, and that the
+ * interrupt window is open.
+ *
+ * No need to exit to userspace if we already have an interrupt queued.
+ */
+static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
+{
+ return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
+ vcpu->run->request_interrupt_window &&
+ kvm_arch_interrupt_allowed(vcpu));
+}
+
+static void post_kvm_run_save(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+
+ kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
+ kvm_run->cr8 = kvm_get_cr8(vcpu);
+ kvm_run->apic_base = kvm_get_apic_base(vcpu);
+ if (irqchip_in_kernel(vcpu->kvm))
+ kvm_run->ready_for_interrupt_injection = 1;
+ else
+ kvm_run->ready_for_interrupt_injection =
+ kvm_arch_interrupt_allowed(vcpu) &&
+ !kvm_cpu_has_interrupt(vcpu) &&
+ !kvm_event_needs_reinjection(vcpu);
+}
+
+static void vapic_enter(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ struct page *page;
+
+ if (!apic || !apic->vapic_addr)
+ return;
+
+ page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
+
+ vcpu->arch.apic->vapic_page = page;
+}
+
+static void vapic_exit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ int idx;
+
+ if (!apic || !apic->vapic_addr)
+ return;
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_release_page_dirty(apic->vapic_page);
+ mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+}
+
+static void update_cr8_intercept(struct kvm_vcpu *vcpu)
+{
+ int max_irr, tpr;
+
+ if (!kvm_x86_ops->update_cr8_intercept)
+ return;
+
+ if (!vcpu->arch.apic)
+ return;
+
+ if (!vcpu->arch.apic->vapic_addr)
+ max_irr = kvm_lapic_find_highest_irr(vcpu);
+ else
+ max_irr = -1;
+
+ if (max_irr != -1)
+ max_irr >>= 4;
+
+ tpr = kvm_lapic_get_cr8(vcpu);
+
+ kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
+}
+
+static void inject_pending_event(struct kvm_vcpu *vcpu)
+{
+ /* try to reinject previous events if any */
+ if (vcpu->arch.exception.pending) {
+ trace_kvm_inj_exception(vcpu->arch.exception.nr,
+ vcpu->arch.exception.has_error_code,
+ vcpu->arch.exception.error_code);
+ kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
+ vcpu->arch.exception.has_error_code,
+ vcpu->arch.exception.error_code,
+ vcpu->arch.exception.reinject);
+ return;
+ }
+
+ if (vcpu->arch.nmi_injected) {
+ kvm_x86_ops->set_nmi(vcpu);
+ return;
+ }
+
+ if (vcpu->arch.interrupt.pending) {
+ kvm_x86_ops->set_irq(vcpu);
+ return;
+ }
+
+ /* try to inject new event if pending */
+ if (vcpu->arch.nmi_pending) {
+ if (kvm_x86_ops->nmi_allowed(vcpu)) {
+ vcpu->arch.nmi_pending = false;
+ vcpu->arch.nmi_injected = true;
+ kvm_x86_ops->set_nmi(vcpu);
+ }
+ } else if (kvm_cpu_has_interrupt(vcpu)) {
+ if (kvm_x86_ops->interrupt_allowed(vcpu)) {
+ kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
+ false);
+ kvm_x86_ops->set_irq(vcpu);
+ }
+ }
+}
+
+static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
+{
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
+ !vcpu->guest_xcr0_loaded) {
+ /* kvm_set_xcr() also depends on this */
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
+ vcpu->guest_xcr0_loaded = 1;
+ }
+}
+
+static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->guest_xcr0_loaded) {
+ if (vcpu->arch.xcr0 != host_xcr0)
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
+ vcpu->guest_xcr0_loaded = 0;
+ }
+}
+
+static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+{
+ int r;
+ bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
+ vcpu->run->request_interrupt_window;
+
+ if (vcpu->requests) {
+ if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
+ kvm_mmu_unload(vcpu);
+ if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
+ __kvm_migrate_timers(vcpu);
+ if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
+ r = kvm_guest_time_update(vcpu);
+ if (unlikely(r))
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
+ kvm_mmu_sync_roots(vcpu);
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
+ kvm_x86_ops->tlb_flush(vcpu);
+ if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
+ r = 0;
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ r = 0;
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
+ vcpu->fpu_active = 0;
+ kvm_x86_ops->fpu_deactivate(vcpu);
+ }
+ if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
+ /* Page is swapped out. Do synthetic halt */
+ vcpu->arch.apf.halted = true;
+ r = 1;
+ goto out;
+ }
+ }
+
+ r = kvm_mmu_reload(vcpu);
+ if (unlikely(r))
+ goto out;
+
+ if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
+ inject_pending_event(vcpu);
+
+ /* enable NMI/IRQ window open exits if needed */
+ if (vcpu->arch.nmi_pending)
+ kvm_x86_ops->enable_nmi_window(vcpu);
+ else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+ kvm_x86_ops->enable_irq_window(vcpu);
+
+ if (kvm_lapic_enabled(vcpu)) {
+ update_cr8_intercept(vcpu);
+ kvm_lapic_sync_to_vapic(vcpu);
+ }
+ }
+
+ preempt_disable();
+
+ kvm_x86_ops->prepare_guest_switch(vcpu);
+ if (vcpu->fpu_active)
+ kvm_load_guest_fpu(vcpu);
+ kvm_load_guest_xcr0(vcpu);
+
+ atomic_set(&vcpu->guest_mode, 1);
+ smp_wmb();
+
+ local_irq_disable();
+
+ if (!atomic_read(&vcpu->guest_mode) || vcpu->requests
+ || need_resched() || signal_pending(current)) {
+ atomic_set(&vcpu->guest_mode, 0);
+ smp_wmb();
+ local_irq_enable();
+ preempt_enable();
+ kvm_x86_ops->cancel_injection(vcpu);
+ r = 1;
+ goto out;
+ }
+
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+
+ kvm_guest_enter();
+
+ if (unlikely(vcpu->arch.switch_db_regs)) {
+ set_debugreg(0, 7);
+ set_debugreg(vcpu->arch.eff_db[0], 0);
+ set_debugreg(vcpu->arch.eff_db[1], 1);
+ set_debugreg(vcpu->arch.eff_db[2], 2);
+ set_debugreg(vcpu->arch.eff_db[3], 3);
+ }
+
+ trace_kvm_entry(vcpu->vcpu_id);
+ kvm_x86_ops->run(vcpu);
+
+ /*
+ * If the guest has used debug registers, at least dr7
+ * will be disabled while returning to the host.
+ * If we don't have active breakpoints in the host, we don't
+ * care about the messed up debug address registers. But if
+ * we have some of them active, restore the old state.
+ */
+ if (hw_breakpoint_active())
+ hw_breakpoint_restore();
+
+ kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+
+ atomic_set(&vcpu->guest_mode, 0);
+ smp_wmb();
+ local_irq_enable();
+
+ ++vcpu->stat.exits;
+
+ /*
+ * We must have an instruction between local_irq_enable() and
+ * kvm_guest_exit(), so the timer interrupt isn't delayed by
+ * the interrupt shadow. The stat.exits increment will do nicely.
+ * But we need to prevent reordering, hence this barrier():
+ */
+ barrier();
+
+ kvm_guest_exit();
+
+ preempt_enable();
+
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ /*
+ * Profile KVM exit RIPs:
+ */
+ if (unlikely(prof_on == KVM_PROFILING)) {
+ unsigned long rip = kvm_rip_read(vcpu);
+ profile_hit(KVM_PROFILING, (void *)rip);
+ }
+
+
+ kvm_lapic_sync_from_vapic(vcpu);
+
+ r = kvm_x86_ops->handle_exit(vcpu);
+out:
+ return r;
+}
+
+
+static int __vcpu_run(struct kvm_vcpu *vcpu)
+{
+ int r;
+ struct kvm *kvm = vcpu->kvm;
+
+ if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
+ pr_debug("vcpu %d received sipi with vector # %x\n",
+ vcpu->vcpu_id, vcpu->arch.sipi_vector);
+ kvm_lapic_reset(vcpu);
+ r = kvm_arch_vcpu_reset(vcpu);
+ if (r)
+ return r;
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ }
+
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+ vapic_enter(vcpu);
+
+ r = 1;
+ while (r > 0) {
+ if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
+ !vcpu->arch.apf.halted)
+ r = vcpu_enter_guest(vcpu);
+ else {
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ kvm_vcpu_block(vcpu);
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+ if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
+ {
+ switch(vcpu->arch.mp_state) {
+ case KVM_MP_STATE_HALTED:
+ vcpu->arch.mp_state =
+ KVM_MP_STATE_RUNNABLE;
+ case KVM_MP_STATE_RUNNABLE:
+ vcpu->arch.apf.halted = false;
+ break;
+ case KVM_MP_STATE_SIPI_RECEIVED:
+ default:
+ r = -EINTR;
+ break;
+ }
+ }
+ }
+
+ if (r <= 0)
+ break;
+
+ clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+ if (kvm_cpu_has_pending_timer(vcpu))
+ kvm_inject_pending_timer_irqs(vcpu);
+
+ if (dm_request_for_irq_injection(vcpu)) {
+ r = -EINTR;
+ vcpu->run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.request_irq_exits;
+ }
+
+ kvm_check_async_pf_completion(vcpu);
+
+ if (signal_pending(current)) {
+ r = -EINTR;
+ vcpu->run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.signal_exits;
+ }
+ if (need_resched()) {
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ kvm_resched(vcpu);
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+ }
+ }
+
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+
+ vapic_exit(vcpu);
+
+ return r;
+}
+
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ int r;
+ sigset_t sigsaved;
+
+ if (!tsk_used_math(current) && kvm_init_fpu(current))
+ return -ENOMEM;
+
+ if (vcpu->sigset_active)
+ sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+
+ if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
+ kvm_vcpu_block(vcpu);
+ clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
+ r = -EAGAIN;
+ goto out;
+ }
+
+ /* re-sync apic's tpr */
+ if (!irqchip_in_kernel(vcpu->kvm)) {
+ if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
+ r = -EINVAL;
+ goto out;
+ }
+ }
+
+ if (vcpu->arch.pio.count || vcpu->mmio_needed) {
+ if (vcpu->mmio_needed) {
+ memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
+ vcpu->mmio_read_completed = 1;
+ vcpu->mmio_needed = 0;
+ }
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ if (r != EMULATE_DONE) {
+ r = 0;
+ goto out;
+ }
+ }
+ if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
+ kvm_register_write(vcpu, VCPU_REGS_RAX,
+ kvm_run->hypercall.ret);
+
+ r = __vcpu_run(vcpu);
+
+out:
+ post_kvm_run_save(vcpu);
+ if (vcpu->sigset_active)
+ sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+ return r;
+}
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+ regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
+ regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+ regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+#ifdef CONFIG_X86_64
+ regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
+ regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
+ regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
+ regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
+ regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
+ regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
+ regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
+ regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
+#endif
+
+ regs->rip = kvm_rip_read(vcpu);
+ regs->rflags = kvm_get_rflags(vcpu);
+
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
+ kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
+ kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
+ kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
+ kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
+#ifdef CONFIG_X86_64
+ kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
+ kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
+ kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
+ kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
+ kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
+ kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
+ kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
+ kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
+#endif
+
+ kvm_rip_write(vcpu, regs->rip);
+ kvm_set_rflags(vcpu, regs->rflags);
+
+ vcpu->arch.exception.pending = false;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ return 0;
+}
+
+void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+ struct kvm_segment cs;
+
+ kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ *db = cs.db;
+ *l = cs.l;
+}
+EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ struct kvm_desc_ptr dt;
+
+ kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
+ kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
+ kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
+ kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
+ kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
+ kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
+
+ kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
+ kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+
+ kvm_x86_ops->get_idt(vcpu, &dt);
+ sregs->idt.limit = dt.size;
+ sregs->idt.base = dt.address;
+ kvm_x86_ops->get_gdt(vcpu, &dt);
+ sregs->gdt.limit = dt.size;
+ sregs->gdt.base = dt.address;
+
+ sregs->cr0 = kvm_read_cr0(vcpu);
+ sregs->cr2 = vcpu->arch.cr2;
+ sregs->cr3 = kvm_read_cr3(vcpu);
+ sregs->cr4 = kvm_read_cr4(vcpu);
+ sregs->cr8 = kvm_get_cr8(vcpu);
+ sregs->efer = vcpu->arch.efer;
+ sregs->apic_base = kvm_get_apic_base(vcpu);
+
+ memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
+
+ if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
+ set_bit(vcpu->arch.interrupt.nr,
+ (unsigned long *)sregs->interrupt_bitmap);
+
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state)
+{
+ mp_state->mp_state = vcpu->arch.mp_state;
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state)
+{
+ vcpu->arch.mp_state = mp_state->mp_state;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return 0;
+}
+
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
+ bool has_error_code, u32 error_code)
+{
+ struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+ int ret;
+
+ init_emulate_ctxt(vcpu);
+
+ ret = emulator_task_switch(&vcpu->arch.emulate_ctxt,
+ tss_selector, reason, has_error_code,
+ error_code);
+
+ if (ret)
+ return EMULATE_FAIL;
+
+ memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+ kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
+ kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return EMULATE_DONE;
+}
+EXPORT_SYMBOL_GPL(kvm_task_switch);
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ int mmu_reset_needed = 0;
+ int pending_vec, max_bits;
+ struct kvm_desc_ptr dt;
+
+ dt.size = sregs->idt.limit;
+ dt.address = sregs->idt.base;
+ kvm_x86_ops->set_idt(vcpu, &dt);
+ dt.size = sregs->gdt.limit;
+ dt.address = sregs->gdt.base;
+ kvm_x86_ops->set_gdt(vcpu, &dt);
+
+ vcpu->arch.cr2 = sregs->cr2;
+ mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
+ vcpu->arch.cr3 = sregs->cr3;
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+
+ kvm_set_cr8(vcpu, sregs->cr8);
+
+ mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
+ kvm_x86_ops->set_efer(vcpu, sregs->efer);
+ kvm_set_apic_base(vcpu, sregs->apic_base);
+
+ mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
+ kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
+ vcpu->arch.cr0 = sregs->cr0;
+
+ mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
+ kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
+ if (sregs->cr4 & X86_CR4_OSXSAVE)
+ update_cpuid(vcpu);
+ if (!is_long_mode(vcpu) && is_pae(vcpu)) {
+ load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
+ mmu_reset_needed = 1;
+ }
+
+ if (mmu_reset_needed)
+ kvm_mmu_reset_context(vcpu);
+
+ max_bits = (sizeof sregs->interrupt_bitmap) << 3;
+ pending_vec = find_first_bit(
+ (const unsigned long *)sregs->interrupt_bitmap, max_bits);
+ if (pending_vec < max_bits) {
+ kvm_queue_interrupt(vcpu, pending_vec, false);
+ pr_debug("Set back pending irq %d\n", pending_vec);
+ if (irqchip_in_kernel(vcpu->kvm))
+ kvm_pic_clear_isr_ack(vcpu->kvm);
+ }
+
+ kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
+ kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
+ kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
+ kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
+ kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
+ kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
+
+ kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
+ kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+
+ update_cr8_intercept(vcpu);
+
+ /* Older userspace won't unhalt the vcpu on reset. */
+ if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
+ sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
+ !is_protmode(vcpu))
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug *dbg)
+{
+ unsigned long rflags;
+ int i, r;
+
+ if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
+ r = -EBUSY;
+ if (vcpu->arch.exception.pending)
+ goto out;
+ if (dbg->control & KVM_GUESTDBG_INJECT_DB)
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ else
+ kvm_queue_exception(vcpu, BP_VECTOR);
+ }
+
+ /*
+ * Read rflags as long as potentially injected trace flags are still
+ * filtered out.
+ */
+ rflags = kvm_get_rflags(vcpu);
+
+ vcpu->guest_debug = dbg->control;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
+ vcpu->guest_debug = 0;
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
+ for (i = 0; i < KVM_NR_DB_REGS; ++i)
+ vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
+ vcpu->arch.switch_db_regs =
+ (dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
+ } else {
+ for (i = 0; i < KVM_NR_DB_REGS; i++)
+ vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+ vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
+ }
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
+ get_segment_base(vcpu, VCPU_SREG_CS);
+
+ /*
+ * Trigger an rflags update that will inject or remove the trace
+ * flags.
+ */
+ kvm_set_rflags(vcpu, rflags);
+
+ kvm_x86_ops->set_guest_debug(vcpu, dbg);
+
+ r = 0;
+
+out:
+
+ return r;
+}
+
+/*
+ * Translate a guest virtual address to a guest physical address.
+ */
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+ struct kvm_translation *tr)
+{
+ unsigned long vaddr = tr->linear_address;
+ gpa_t gpa;
+ int idx;
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ tr->physical_address = gpa;
+ tr->valid = gpa != UNMAPPED_GVA;
+ tr->writeable = 1;
+ tr->usermode = 0;
+
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+ struct kvm_i387_fxsave_struct *fxsave =
+ &vcpu->arch.guest_fpu.state->fxsave;
+
+ memcpy(fpu->fpr, fxsave->st_space, 128);
+ fpu->fcw = fxsave->cwd;
+ fpu->fsw = fxsave->swd;
+ fpu->ftwx = fxsave->twd;
+ fpu->last_opcode = fxsave->fop;
+ fpu->last_ip = fxsave->rip;
+ fpu->last_dp = fxsave->rdp;
+ memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
+
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+ struct kvm_i387_fxsave_struct *fxsave =
+ &vcpu->arch.guest_fpu.state->fxsave;
+
+ memcpy(fxsave->st_space, fpu->fpr, 128);
+ fxsave->cwd = fpu->fcw;
+ fxsave->swd = fpu->fsw;
+ fxsave->twd = fpu->ftwx;
+ fxsave->fop = fpu->last_opcode;
+ fxsave->rip = fpu->last_ip;
+ fxsave->rdp = fpu->last_dp;
+ memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
+
+ return 0;
+}
+
+int fx_init(struct kvm_vcpu *vcpu)
+{
+ int err;
+
+ err = kvm_fpu_alloc(&vcpu->arch.guest_fpu);
+ if (err)
+ return err;
+
+ kvm_fpu_finit(&vcpu->arch.guest_fpu);
+
+ /*
+ * Ensure guest xcr0 is valid for loading
+ */
+ vcpu->arch.xcr0 = XSTATE_FP;
+
+ vcpu->arch.cr0 |= X86_CR0_ET;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fx_init);
+
+static void fx_free(struct kvm_vcpu *vcpu)
+{
+ kvm_fpu_free(&vcpu->arch.guest_fpu);
+}
+
+void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->guest_fpu_loaded)
+ return;
+
+ /*
+ * Restore all possible states in the guest,
+ * and assume host would use all available bits.
+ * Guest xcr0 would be loaded later.
+ */
+ kvm_put_guest_xcr0(vcpu);
+ vcpu->guest_fpu_loaded = 1;
+ unlazy_fpu(current);
+ kvm_fpu_restore_checking(&vcpu->arch.guest_fpu);
+ trace_kvm_fpu(1);
+}
+
+void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
+{
+ kvm_put_guest_xcr0(vcpu);
+
+ if (!vcpu->guest_fpu_loaded)
+ return;
+
+ vcpu->guest_fpu_loaded = 0;
+ kvm_fpu_save_init(&vcpu->arch.guest_fpu);
+ ++vcpu->stat.fpu_reload;
+ kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+ trace_kvm_fpu(0);
+}
+
+void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.time_page) {
+ kvm_release_page_dirty(vcpu->arch.time_page);
+ vcpu->arch.time_page = NULL;
+ }
+
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
+ fx_free(vcpu);
+ kvm_x86_ops->vcpu_free(vcpu);
+}
+
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
+ unsigned int id)
+{
+ if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
+ printk_once(KERN_WARNING
+ "kvm: SMP vm created on host with unstable TSC; "
+ "guest TSC will not be reliable\n");
+ return kvm_x86_ops->vcpu_create(kvm, id);
+}
+
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ vcpu->arch.mtrr_state.have_fixed = 1;
+ vcpu_load(vcpu);
+ r = kvm_arch_vcpu_reset(vcpu);
+ if (r == 0)
+ r = kvm_mmu_setup(vcpu);
+ vcpu_put(vcpu);
+ if (r < 0)
+ goto free_vcpu;
+
+ return 0;
+free_vcpu:
+ kvm_x86_ops->vcpu_free(vcpu);
+ return r;
+}
+
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.apf.msr_val = 0;
+
+ vcpu_load(vcpu);
+ kvm_mmu_unload(vcpu);
+ vcpu_put(vcpu);
+
+ fx_free(vcpu);
+ kvm_x86_ops->vcpu_free(vcpu);
+}
+
+int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.nmi_pending = false;
+ vcpu->arch.nmi_injected = false;
+
+ vcpu->arch.switch_db_regs = 0;
+ memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
+ vcpu->arch.dr6 = DR6_FIXED_1;
+ vcpu->arch.dr7 = DR7_FIXED_1;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ vcpu->arch.apf.msr_val = 0;
+
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_async_pf_hash_reset(vcpu);
+ vcpu->arch.apf.halted = false;
+
+ return kvm_x86_ops->vcpu_reset(vcpu);
+}
+
+int kvm_arch_hardware_enable(void *garbage)
+{
+ struct kvm *kvm;
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ kvm_shared_msr_cpu_online();
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ if (vcpu->cpu == smp_processor_id())
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ return kvm_x86_ops->hardware_enable(garbage);
+}
+
+void kvm_arch_hardware_disable(void *garbage)
+{
+ kvm_x86_ops->hardware_disable(garbage);
+ drop_user_return_notifiers(garbage);
+}
+
+int kvm_arch_hardware_setup(void)
+{
+ return kvm_x86_ops->hardware_setup();
+}
+
+void kvm_arch_hardware_unsetup(void)
+{
+ kvm_x86_ops->hardware_unsetup();
+}
+
+void kvm_arch_check_processor_compat(void *rtn)
+{
+ kvm_x86_ops->check_processor_compatibility(rtn);
+}
+
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ struct page *page;
+ struct kvm *kvm;
+ int r;
+
+ BUG_ON(vcpu->kvm == NULL);
+ kvm = vcpu->kvm;
+
+ vcpu->arch.emulate_ctxt.ops = &emulate_ops;
+ vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+ vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ vcpu->arch.mmu.translate_gpa = translate_gpa;
+ vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
+ if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ else
+ vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
+
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page) {
+ r = -ENOMEM;
+ goto fail;
+ }
+ vcpu->arch.pio_data = page_address(page);
+
+ if (!kvm->arch.virtual_tsc_khz)
+ kvm_arch_set_tsc_khz(kvm, max_tsc_khz);
+
+ r = kvm_mmu_create(vcpu);
+ if (r < 0)
+ goto fail_free_pio_data;
+
+ if (irqchip_in_kernel(kvm)) {
+ r = kvm_create_lapic(vcpu);
+ if (r < 0)
+ goto fail_mmu_destroy;
+ }
+
+ vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
+ GFP_KERNEL);
+ if (!vcpu->arch.mce_banks) {
+ r = -ENOMEM;
+ goto fail_free_lapic;
+ }
+ vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
+
+ if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
+ goto fail_free_mce_banks;
+
+ kvm_async_pf_hash_reset(vcpu);
+
+ return 0;
+fail_free_mce_banks:
+ kfree(vcpu->arch.mce_banks);
+fail_free_lapic:
+ kvm_free_lapic(vcpu);
+fail_mmu_destroy:
+ kvm_mmu_destroy(vcpu);
+fail_free_pio_data:
+ free_page((unsigned long)vcpu->arch.pio_data);
+fail:
+ return r;
+}
+
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+ int idx;
+
+ kfree(vcpu->arch.mce_banks);
+ kvm_free_lapic(vcpu);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_mmu_destroy(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ free_page((unsigned long)vcpu->arch.pio_data);
+}
+
+int kvm_arch_init_vm(struct kvm *kvm)
+{
+ INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
+
+ /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
+ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
+
+ spin_lock_init(&kvm->arch.tsc_write_lock);
+
+ return 0;
+}
+
+static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
+{
+ vcpu_load(vcpu);
+ kvm_mmu_unload(vcpu);
+ vcpu_put(vcpu);
+}
+
+static void kvm_free_vcpus(struct kvm *kvm)
+{
+ unsigned int i;
+ struct kvm_vcpu *vcpu;
+
+ /*
+ * Unpin any mmu pages first.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_unload_vcpu_mmu(vcpu);
+ }
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_arch_vcpu_free(vcpu);
+
+ mutex_lock(&kvm->lock);
+ for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+ kvm->vcpus[i] = NULL;
+
+ atomic_set(&kvm->online_vcpus, 0);
+ mutex_unlock(&kvm->lock);
+}
+
+void kvm_arch_sync_events(struct kvm *kvm)
+{
+ kvm_free_all_assigned_devices(kvm);
+ kvm_free_pit(kvm);
+}
+
+void kvm_arch_destroy_vm(struct kvm *kvm)
+{
+ kvm_iommu_unmap_guest(kvm);
+ kfree(kvm->arch.vpic);
+ kfree(kvm->arch.vioapic);
+ kvm_free_vcpus(kvm);
+ if (kvm->arch.apic_access_page)
+ put_page(kvm->arch.apic_access_page);
+ if (kvm->arch.ept_identity_pagetable)
+ put_page(kvm->arch.ept_identity_pagetable);
+}
+
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
+ struct kvm_memory_slot old,
+ struct kvm_userspace_memory_region *mem,
+ int user_alloc)
+{
+ int npages = memslot->npages;
+ int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
+
+ /* Prevent internal slot pages from being moved by fork()/COW. */
+ if (memslot->id >= KVM_MEMORY_SLOTS)
+ map_flags = MAP_SHARED | MAP_ANONYMOUS;
+
+ /*To keep backward compatibility with older userspace,
+ *x86 needs to hanlde !user_alloc case.
+ */
+ if (!user_alloc) {
+ if (npages && !old.rmap) {
+ unsigned long userspace_addr;
+
+ down_write(&current->mm->mmap_sem);
+ userspace_addr = do_mmap(NULL, 0,
+ npages * PAGE_SIZE,
+ PROT_READ | PROT_WRITE,
+ map_flags,
+ 0);
+ up_write(&current->mm->mmap_sem);
+
+ if (IS_ERR((void *)userspace_addr))
+ return PTR_ERR((void *)userspace_addr);
+
+ memslot->userspace_addr = userspace_addr;
+ }
+ }
+
+
+ return 0;
+}
+
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ struct kvm_memory_slot old,
+ int user_alloc)
+{
+
+ int npages = mem->memory_size >> PAGE_SHIFT;
+
+ if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
+ int ret;
+
+ down_write(&current->mm->mmap_sem);
+ ret = do_munmap(current->mm, old.userspace_addr,
+ old.npages * PAGE_SIZE);
+ up_write(&current->mm->mmap_sem);
+ if (ret < 0)
+ printk(KERN_WARNING
+ "kvm_vm_ioctl_set_memory_region: "
+ "failed to munmap memory\n");
+ }
+
+ spin_lock(&kvm->mmu_lock);
+ if (!kvm->arch.n_requested_mmu_pages) {
+ unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
+ kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+ }
+
+ kvm_mmu_slot_remove_write_access(kvm, mem->slot);
+ spin_unlock(&kvm->mmu_lock);
+}
+
+void kvm_arch_flush_shadow(struct kvm *kvm)
+{
+ kvm_mmu_zap_all(kvm);
+ kvm_reload_remote_mmus(kvm);
+}
+
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+{
+ return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
+ !vcpu->arch.apf.halted)
+ || !list_empty_careful(&vcpu->async_pf.done)
+ || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
+ || vcpu->arch.nmi_pending ||
+ (kvm_arch_interrupt_allowed(vcpu) &&
+ kvm_cpu_has_interrupt(vcpu));
+}
+
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+ int me;
+ int cpu = vcpu->cpu;
+
+ if (waitqueue_active(&vcpu->wq)) {
+ wake_up_interruptible(&vcpu->wq);
+ ++vcpu->stat.halt_wakeup;
+ }
+
+ me = get_cpu();
+ if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
+ if (atomic_xchg(&vcpu->guest_mode, 0))
+ kvm_smp_send_reschedule(cpu);
+ put_cpu();
+}
+
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+ return kvm_x86_ops->interrupt_allowed(vcpu);
+}
+
+bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
+{
+ unsigned long current_rip = kvm_rip_read(vcpu) +
+ get_segment_base(vcpu, VCPU_SREG_CS);
+
+ return current_rip == linear_rip;
+}
+EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
+
+unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
+{
+ unsigned long rflags;
+
+ rflags = kvm_x86_ops->get_rflags(vcpu);
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ rflags &= ~X86_EFLAGS_TF;
+ return rflags;
+}
+EXPORT_SYMBOL_GPL(kvm_get_rflags);
+
+void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
+ kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
+ rflags |= X86_EFLAGS_TF;
+ kvm_x86_ops->set_rflags(vcpu, rflags);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_set_rflags);
+
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
+{
+ int r;
+
+ if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
+ is_error_page(work->page))
+ return;
+
+ r = kvm_mmu_reload(vcpu);
+ if (unlikely(r))
+ return;
+
+ if (!vcpu->arch.mmu.direct_map &&
+ work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
+ return;
+
+ vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
+}
+
+static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
+{
+ return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
+}
+
+static inline u32 kvm_async_pf_next_probe(u32 key)
+{
+ return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
+}
+
+static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ u32 key = kvm_async_pf_hash_fn(gfn);
+
+ while (vcpu->arch.apf.gfns[key] != ~0)
+ key = kvm_async_pf_next_probe(key);
+
+ vcpu->arch.apf.gfns[key] = gfn;
+}
+
+static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ int i;
+ u32 key = kvm_async_pf_hash_fn(gfn);
+
+ for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
+ (vcpu->arch.apf.gfns[key] != gfn &&
+ vcpu->arch.apf.gfns[key] != ~0); i++)
+ key = kvm_async_pf_next_probe(key);
+
+ return key;
+}
+
+bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
+}
+
+static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ u32 i, j, k;
+
+ i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
+ while (true) {
+ vcpu->arch.apf.gfns[i] = ~0;
+ do {
+ j = kvm_async_pf_next_probe(j);
+ if (vcpu->arch.apf.gfns[j] == ~0)
+ return;
+ k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
+ /*
+ * k lies cyclically in ]i,j]
+ * | i.k.j |
+ * |....j i.k.| or |.k..j i...|
+ */
+ } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
+ vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
+ i = j;
+ }
+}
+
+static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
+{
+
+ return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
+ sizeof(val));
+}
+
+void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+ struct x86_exception fault;
+
+ trace_kvm_async_pf_not_present(work->arch.token, work->gva);
+ kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
+
+ if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
+ (vcpu->arch.apf.send_user_only &&
+ kvm_x86_ops->get_cpl(vcpu) == 0))
+ kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+ else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
+ fault.vector = PF_VECTOR;
+ fault.error_code_valid = true;
+ fault.error_code = 0;
+ fault.nested_page_fault = false;
+ fault.address = work->arch.token;
+ kvm_inject_page_fault(vcpu, &fault);
+ }
+}
+
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+ struct x86_exception fault;
+
+ trace_kvm_async_pf_ready(work->arch.token, work->gva);
+ if (is_error_page(work->page))
+ work->arch.token = ~0; /* broadcast wakeup */
+ else
+ kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
+
+ if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
+ !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
+ fault.vector = PF_VECTOR;
+ fault.error_code_valid = true;
+ fault.error_code = 0;
+ fault.nested_page_fault = false;
+ fault.address = work->arch.token;
+ kvm_inject_page_fault(vcpu, &fault);
+ }
+ vcpu->arch.apf.halted = false;
+}
+
+bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
+ return true;
+ else
+ return !kvm_event_needs_reinjection(vcpu) &&
+ kvm_x86_ops->interrupt_allowed(vcpu);
+}
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
diff --git a/linux/x86/x86.h b/linux/x86/x86.h
new file mode 100644
index 0000000..c600da8
--- /dev/null
+++ b/linux/x86/x86.h
@@ -0,0 +1,84 @@
+#ifndef ARCH_X86_KVM_X86_H
+#define ARCH_X86_KVM_X86_H
+
+#include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
+
+static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.exception.pending = false;
+}
+
+static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
+ bool soft)
+{
+ vcpu->arch.interrupt.pending = true;
+ vcpu->arch.interrupt.soft = soft;
+ vcpu->arch.interrupt.nr = vector;
+}
+
+static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.interrupt.pending = false;
+}
+
+static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending ||
+ vcpu->arch.nmi_injected;
+}
+
+static inline bool kvm_exception_is_soft(unsigned int nr)
+{
+ return (nr == BP_VECTOR) || (nr == OF_VECTOR);
+}
+
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+ u32 function, u32 index);
+
+static inline bool is_protmode(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, X86_CR0_PE);
+}
+
+static inline int is_long_mode(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ return vcpu->arch.efer & EFER_LMA;
+#else
+ return 0;
+#endif
+}
+
+static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
+}
+
+static inline int is_pae(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
+}
+
+static inline int is_pse(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, X86_CR4_PSE);
+}
+
+static inline int is_paging(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
+}
+
+static inline u32 bit(int bitno)
+{
+ return 1 << (bitno & 31);
+}
+
+void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
+void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
+int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq);
+
+void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
+
+#endif