author      Josh Wilsdon <jwilsdon@joyent.com>      2011-04-06 17:32:34 -0700
committer   Josh Wilsdon <jwilsdon@joyent.com>      2011-04-06 17:42:30 -0700
commit      44f65dde684a09c2319449bac768974120ed5d7f (patch)
tree        81e3a1a603f3e30b8c511ee15c56b10f44f51c07 /linux
parent      c5e99aab98c3a8ddb8e0e2953c1a3e534d67ca4f (diff)
download    illumos-kvm-44f65dde684a09c2319449bac768974120ed5d7f.tar.gz
[HVM-29] Apparently I still made a mess because of that .gitignore. This version should actually match the kvm-kmod-2.6.34.
Diffstat (limited to 'linux')
-rw-r--r--  linux/include/asm-ia64/kvm.h            304
-rw-r--r--  linux/include/asm-ia64/kvm_host.h       639
-rw-r--r--  linux/include/asm-ia64/kvm_para.h        71
-rw-r--r--  linux/include/asm-x86/hyperv.h           11
-rw-r--r--  linux/include/asm-x86/kvm.h              39
-rw-r--r--  linux/include/asm-x86/kvm_emulate.h     116
-rw-r--r--  linux/include/asm-x86/kvm_host.h        327
-rw-r--r--  linux/include/asm-x86/kvm_para.h         43
-rw-r--r--  linux/include/asm-x86/svm.h              62
-rw-r--r--  linux/include/asm-x86/vmx.h              32
-rw-r--r--  linux/include/linux/kvm.h                52
-rw-r--r--  linux/include/linux/kvm_host.h          176
-rw-r--r--  linux/include/linux/kvm_para.h            7
-rw-r--r--  linux/include/linux/kvm_types.h          11
-rw-r--r--  linux/include/trace/events/kvm.h        122
-rw-r--r--  linux/usr/include/asm-x86/hyperv.h       11
-rw-r--r--  linux/usr/include/asm-x86/kvm.h          39
-rw-r--r--  linux/usr/include/asm-x86/kvm_para.h     27
-rw-r--r--  linux/usr/include/linux/kvm.h            52
-rw-r--r--  linux/usr/include/linux/kvm_para.h        2
-rw-r--r--  linux/x86/assigned-dev.c                144
-rw-r--r--  linux/x86/coalesced_mmio.c                7
-rw-r--r--  linux/x86/emulate.c                    3556
-rw-r--r--  linux/x86/eventfd.c                     104
-rw-r--r--  linux/x86/i8254.c                       169
-rw-r--r--  linux/x86/i8254.h                         4
-rw-r--r--  linux/x86/i8259.c                       115
-rw-r--r--  linux/x86/ioapic.c                        6
-rw-r--r--  linux/x86/iommu.c                       129
-rw-r--r--  linux/x86/irq.c                           9
-rw-r--r--  linux/x86/irq.h                           9
-rw-r--r--  linux/x86/irq_comm.c                     22
-rw-r--r--  linux/x86/kvm_cache_regs.h               39
-rw-r--r--  linux/x86/kvm_main.c                    607
-rw-r--r--  linux/x86/kvm_timer.h                     4
-rw-r--r--  linux/x86/lapic.c                        48
-rw-r--r--  linux/x86/mmu.c                        2103
-rw-r--r--  linux/x86/mmu.h                           9
-rw-r--r--  linux/x86/mmu_audit.c                   344
-rw-r--r--  linux/x86/mmutrace.h                     95
-rw-r--r--  linux/x86/paging_tmpl.h                 563
-rw-r--r--  linux/x86/svm.c                        2097
-rw-r--r--  linux/x86/timer.c                        19
-rw-r--r--  linux/x86/trace.h                       178
-rw-r--r--  linux/x86/vmx.c                        1012
-rw-r--r--  linux/x86/x86.c                        3733
-rw-r--r--  linux/x86/x86.h                          16
47 files changed, 5600 insertions(+), 11684 deletions(-)
diff --git a/linux/include/asm-ia64/kvm.h b/linux/include/asm-ia64/kvm.h
deleted file mode 100644
index ce31fac..0000000
--- a/linux/include/asm-ia64/kvm.h
+++ /dev/null
@@ -1,304 +0,0 @@
-#ifndef KVM_UNIFDEF_H
-#define KVM_UNIFDEF_H
-
-#ifdef __i386__
-#ifndef CONFIG_X86_32
-#define CONFIG_X86_32 1
-#endif
-#endif
-
-#ifdef __x86_64__
-#ifndef CONFIG_X86_64
-#define CONFIG_X86_64 1
-#endif
-#endif
-
-#if defined(__i386__) || defined (__x86_64__)
-#ifndef CONFIG_X86
-#define CONFIG_X86 1
-#endif
-#endif
-
-#ifdef __ia64__
-#ifndef CONFIG_IA64
-#define CONFIG_IA64 1
-#endif
-#endif
-
-#ifdef __PPC__
-#ifndef CONFIG_PPC
-#define CONFIG_PPC 1
-#endif
-#endif
-
-#ifdef __s390__
-#ifndef CONFIG_S390
-#define CONFIG_S390 1
-#endif
-#endif
-
-#endif
-#ifndef __ASM_IA64_KVM_H
-#define __ASM_IA64_KVM_H
-
-/*
- * kvm structure definitions for ia64
- *
- * Copyright (C) 2007 Xiantao Zhang <xiantao.zhang@intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
- */
-
-#include <asm/types.h>
-#include <linux/ioctl.h>
-
-/* Select x86 specific features in <linux/kvm.h> */
-#define __KVM_HAVE_IOAPIC
-#define __KVM_HAVE_DEVICE_ASSIGNMENT
-
-/* Architectural interrupt line count. */
-#define KVM_NR_INTERRUPTS 256
-
-#define KVM_IOAPIC_NUM_PINS 48
-
-struct kvm_ioapic_state {
- __u64 base_address;
- __u32 ioregsel;
- __u32 id;
- __u32 irr;
- __u32 pad;
- union {
- __u64 bits;
- struct {
- __u8 vector;
- __u8 delivery_mode:3;
- __u8 dest_mode:1;
- __u8 delivery_status:1;
- __u8 polarity:1;
- __u8 remote_irr:1;
- __u8 trig_mode:1;
- __u8 mask:1;
- __u8 reserve:7;
- __u8 reserved[4];
- __u8 dest_id;
- } fields;
- } redirtbl[KVM_IOAPIC_NUM_PINS];
-};
-
-#define KVM_IRQCHIP_PIC_MASTER 0
-#define KVM_IRQCHIP_PIC_SLAVE 1
-#define KVM_IRQCHIP_IOAPIC 2
-#define KVM_NR_IRQCHIPS 3
-
-#define KVM_CONTEXT_SIZE 8*1024
-
-struct kvm_fpreg {
- union {
- unsigned long bits[2];
- long double __dummy; /* force 16-byte alignment */
- } u;
-};
-
-union context {
- /* 8K size */
- char dummy[KVM_CONTEXT_SIZE];
- struct {
- unsigned long psr;
- unsigned long pr;
- unsigned long caller_unat;
- unsigned long pad;
- unsigned long gr[32];
- unsigned long ar[128];
- unsigned long br[8];
- unsigned long cr[128];
- unsigned long rr[8];
- unsigned long ibr[8];
- unsigned long dbr[8];
- unsigned long pkr[8];
- struct kvm_fpreg fr[128];
- };
-};
-
-struct thash_data {
- union {
- struct {
- unsigned long p : 1; /* 0 */
- unsigned long rv1 : 1; /* 1 */
- unsigned long ma : 3; /* 2-4 */
- unsigned long a : 1; /* 5 */
- unsigned long d : 1; /* 6 */
- unsigned long pl : 2; /* 7-8 */
- unsigned long ar : 3; /* 9-11 */
- unsigned long ppn : 38; /* 12-49 */
- unsigned long rv2 : 2; /* 50-51 */
- unsigned long ed : 1; /* 52 */
- unsigned long ig1 : 11; /* 53-63 */
- };
- struct {
- unsigned long __rv1 : 53; /* 0-52 */
- unsigned long contiguous : 1; /*53 */
- unsigned long tc : 1; /* 54 TR or TC */
- unsigned long cl : 1;
- /* 55 I side or D side cache line */
- unsigned long len : 4; /* 56-59 */
- unsigned long io : 1; /* 60 entry is for io or not */
- unsigned long nomap : 1;
- /* 61 entry cann't be inserted into machine TLB.*/
- unsigned long checked : 1;
- /* 62 for VTLB/VHPT sanity check */
- unsigned long invalid : 1;
- /* 63 invalid entry */
- };
- unsigned long page_flags;
- }; /* same for VHPT and TLB */
-
- union {
- struct {
- unsigned long rv3 : 2;
- unsigned long ps : 6;
- unsigned long key : 24;
- unsigned long rv4 : 32;
- };
- unsigned long itir;
- };
- union {
- struct {
- unsigned long ig2 : 12;
- unsigned long vpn : 49;
- unsigned long vrn : 3;
- };
- unsigned long ifa;
- unsigned long vadr;
- struct {
- unsigned long tag : 63;
- unsigned long ti : 1;
- };
- unsigned long etag;
- };
- union {
- struct thash_data *next;
- unsigned long rid;
- unsigned long gpaddr;
- };
-};
-
-#define NITRS 8
-#define NDTRS 8
-
-struct saved_vpd {
- unsigned long vhpi;
- unsigned long vgr[16];
- unsigned long vbgr[16];
- unsigned long vnat;
- unsigned long vbnat;
- unsigned long vcpuid[5];
- unsigned long vpsr;
- unsigned long vpr;
- union {
- unsigned long vcr[128];
- struct {
- unsigned long dcr;
- unsigned long itm;
- unsigned long iva;
- unsigned long rsv1[5];
- unsigned long pta;
- unsigned long rsv2[7];
- unsigned long ipsr;
- unsigned long isr;
- unsigned long rsv3;
- unsigned long iip;
- unsigned long ifa;
- unsigned long itir;
- unsigned long iipa;
- unsigned long ifs;
- unsigned long iim;
- unsigned long iha;
- unsigned long rsv4[38];
- unsigned long lid;
- unsigned long ivr;
- unsigned long tpr;
- unsigned long eoi;
- unsigned long irr[4];
- unsigned long itv;
- unsigned long pmv;
- unsigned long cmcv;
- unsigned long rsv5[5];
- unsigned long lrr0;
- unsigned long lrr1;
- unsigned long rsv6[46];
- };
- };
-};
-
-struct kvm_regs {
- struct saved_vpd vpd;
- /*Arch-regs*/
- int mp_state;
- unsigned long vmm_rr;
- /* TR and TC. */
- struct thash_data itrs[NITRS];
- struct thash_data dtrs[NDTRS];
- /* Bit is set if there is a tr/tc for the region. */
- unsigned char itr_regions;
- unsigned char dtr_regions;
- unsigned char tc_regions;
-
- char irq_check;
- unsigned long saved_itc;
- unsigned long itc_check;
- unsigned long timer_check;
- unsigned long timer_pending;
- unsigned long last_itc;
-
- unsigned long vrr[8];
- unsigned long ibr[8];
- unsigned long dbr[8];
- unsigned long insvc[4]; /* Interrupt in service. */
- unsigned long xtp;
-
- unsigned long metaphysical_rr0; /* from kvm_arch (so is pinned) */
- unsigned long metaphysical_rr4; /* from kvm_arch (so is pinned) */
- unsigned long metaphysical_saved_rr0; /* from kvm_arch */
- unsigned long metaphysical_saved_rr4; /* from kvm_arch */
- unsigned long fp_psr; /*used for lazy float register */
- unsigned long saved_gp;
- /*for phycial emulation */
-
- union context saved_guest;
-
- unsigned long reserved[64]; /* for future use */
-};
-
-struct kvm_sregs {
-};
-
-struct kvm_fpu {
-};
-
-#define KVM_IA64_VCPU_STACK_SHIFT 16
-#define KVM_IA64_VCPU_STACK_SIZE (1UL << KVM_IA64_VCPU_STACK_SHIFT)
-
-struct kvm_ia64_vcpu_stack {
- unsigned char stack[KVM_IA64_VCPU_STACK_SIZE];
-};
-
-struct kvm_debug_exit_arch {
-};
-
-/* for KVM_SET_GUEST_DEBUG */
-struct kvm_guest_debug_arch {
-};
-
-#endif
diff --git a/linux/include/asm-ia64/kvm_host.h b/linux/include/asm-ia64/kvm_host.h
deleted file mode 100644
index 91e5de5..0000000
--- a/linux/include/asm-ia64/kvm_host.h
+++ /dev/null
@@ -1,639 +0,0 @@
-#ifndef KVM_UNIFDEF_H
-#define KVM_UNIFDEF_H
-
-#ifdef __i386__
-#ifndef CONFIG_X86_32
-#define CONFIG_X86_32 1
-#endif
-#endif
-
-#ifdef __x86_64__
-#ifndef CONFIG_X86_64
-#define CONFIG_X86_64 1
-#endif
-#endif
-
-#if defined(__i386__) || defined (__x86_64__)
-#ifndef CONFIG_X86
-#define CONFIG_X86 1
-#endif
-#endif
-
-#ifdef __ia64__
-#ifndef CONFIG_IA64
-#define CONFIG_IA64 1
-#endif
-#endif
-
-#ifdef __PPC__
-#ifndef CONFIG_PPC
-#define CONFIG_PPC 1
-#endif
-#endif
-
-#ifdef __s390__
-#ifndef CONFIG_S390
-#define CONFIG_S390 1
-#endif
-#endif
-
-#endif
-/*
- * kvm_host.h: used for kvm module, and hold ia64-specific sections.
- *
- * Copyright (C) 2007, Intel Corporation.
- *
- * Xiantao Zhang <xiantao.zhang@intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
- */
-
-#ifndef __ASM_KVM_HOST_H
-#define __ASM_KVM_HOST_H
-
-#define KVM_MEMORY_SLOTS 32
-/* memory slots that does not exposed to userspace */
-#define KVM_PRIVATE_MEM_SLOTS 4
-
-#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
-
-/* define exit reasons from vmm to kvm*/
-#define EXIT_REASON_VM_PANIC 0
-#define EXIT_REASON_MMIO_INSTRUCTION 1
-#define EXIT_REASON_PAL_CALL 2
-#define EXIT_REASON_SAL_CALL 3
-#define EXIT_REASON_SWITCH_RR6 4
-#define EXIT_REASON_VM_DESTROY 5
-#define EXIT_REASON_EXTERNAL_INTERRUPT 6
-#define EXIT_REASON_IPI 7
-#define EXIT_REASON_PTC_G 8
-#define EXIT_REASON_DEBUG 20
-
-/*Define vmm address space and vm data space.*/
-#define KVM_VMM_SIZE (__IA64_UL_CONST(16)<<20)
-#define KVM_VMM_SHIFT 24
-#define KVM_VMM_BASE 0xD000000000000000
-#define VMM_SIZE (__IA64_UL_CONST(8)<<20)
-
-/*
- * Define vm_buffer, used by PAL Services, base address.
- * Note: vm_buffer is in the VMM-BLOCK, the size must be < 8M
- */
-#define KVM_VM_BUFFER_BASE (KVM_VMM_BASE + VMM_SIZE)
-#define KVM_VM_BUFFER_SIZE (__IA64_UL_CONST(8)<<20)
-
-/*
- * kvm guest's data area looks as follow:
- *
- * +----------------------+ ------- KVM_VM_DATA_SIZE
- * | vcpu[n]'s data | | ___________________KVM_STK_OFFSET
- * | | | / |
- * | .......... | | /vcpu's struct&stack |
- * | .......... | | /---------------------|---- 0
- * | vcpu[5]'s data | | / vpd |
- * | vcpu[4]'s data | |/-----------------------|
- * | vcpu[3]'s data | / vtlb |
- * | vcpu[2]'s data | /|------------------------|
- * | vcpu[1]'s data |/ | vhpt |
- * | vcpu[0]'s data |____________________________|
- * +----------------------+ |
- * | memory dirty log | |
- * +----------------------+ |
- * | vm's data struct | |
- * +----------------------+ |
- * | | |
- * | | |
- * | | |
- * | | |
- * | | |
- * | | |
- * | | |
- * | vm's p2m table | |
- * | | |
- * | | |
- * | | | |
- * vm's data->| | | |
- * +----------------------+ ------- 0
- * To support large memory, needs to increase the size of p2m.
- * To support more vcpus, needs to ensure it has enough space to
- * hold vcpus' data.
- */
-
-#define KVM_VM_DATA_SHIFT 26
-#define KVM_VM_DATA_SIZE (__IA64_UL_CONST(1) << KVM_VM_DATA_SHIFT)
-#define KVM_VM_DATA_BASE (KVM_VMM_BASE + KVM_VM_DATA_SIZE)
-
-#define KVM_P2M_BASE KVM_VM_DATA_BASE
-#define KVM_P2M_SIZE (__IA64_UL_CONST(24) << 20)
-
-#define VHPT_SHIFT 16
-#define VHPT_SIZE (__IA64_UL_CONST(1) << VHPT_SHIFT)
-#define VHPT_NUM_ENTRIES (__IA64_UL_CONST(1) << (VHPT_SHIFT-5))
-
-#define VTLB_SHIFT 16
-#define VTLB_SIZE (__IA64_UL_CONST(1) << VTLB_SHIFT)
-#define VTLB_NUM_ENTRIES (1UL << (VHPT_SHIFT-5))
-
-#define VPD_SHIFT 16
-#define VPD_SIZE (__IA64_UL_CONST(1) << VPD_SHIFT)
-
-#define VCPU_STRUCT_SHIFT 16
-#define VCPU_STRUCT_SIZE (__IA64_UL_CONST(1) << VCPU_STRUCT_SHIFT)
-
-/*
- * This must match KVM_IA64_VCPU_STACK_{SHIFT,SIZE} arch/ia64/include/asm/kvm.h
- */
-#define KVM_STK_SHIFT 16
-#define KVM_STK_OFFSET (__IA64_UL_CONST(1)<< KVM_STK_SHIFT)
-
-#define KVM_VM_STRUCT_SHIFT 19
-#define KVM_VM_STRUCT_SIZE (__IA64_UL_CONST(1) << KVM_VM_STRUCT_SHIFT)
-
-#define KVM_MEM_DIRY_LOG_SHIFT 19
-#define KVM_MEM_DIRTY_LOG_SIZE (__IA64_UL_CONST(1) << KVM_MEM_DIRY_LOG_SHIFT)
-
-#ifndef __ASSEMBLY__
-
-/*Define the max vcpus and memory for Guests.*/
-#define KVM_MAX_VCPUS (KVM_VM_DATA_SIZE - KVM_P2M_SIZE - KVM_VM_STRUCT_SIZE -\
- KVM_MEM_DIRTY_LOG_SIZE) / sizeof(struct kvm_vcpu_data)
-#define KVM_MAX_MEM_SIZE (KVM_P2M_SIZE >> 3 << PAGE_SHIFT)
-
-#define VMM_LOG_LEN 256
-
-#include <asm/types.h>
-#include <linux/mm.h>
-#include <linux/kvm.h>
-#include <linux/kvm_para.h>
-#include <linux/kvm_types.h>
-
-#include <asm/pal.h>
-#include <asm/sal.h>
-#include <asm/page.h>
-
-struct kvm_vcpu_data {
- char vcpu_vhpt[VHPT_SIZE];
- char vcpu_vtlb[VTLB_SIZE];
- char vcpu_vpd[VPD_SIZE];
- char vcpu_struct[VCPU_STRUCT_SIZE];
-};
-
-struct kvm_vm_data {
- char kvm_p2m[KVM_P2M_SIZE];
- char kvm_vm_struct[KVM_VM_STRUCT_SIZE];
- char kvm_mem_dirty_log[KVM_MEM_DIRTY_LOG_SIZE];
- struct kvm_vcpu_data vcpu_data[KVM_MAX_VCPUS];
-};
-
-#define VCPU_BASE(n) (KVM_VM_DATA_BASE + \
- offsetof(struct kvm_vm_data, vcpu_data[n]))
-#define KVM_VM_BASE (KVM_VM_DATA_BASE + \
- offsetof(struct kvm_vm_data, kvm_vm_struct))
-#define KVM_MEM_DIRTY_LOG_BASE KVM_VM_DATA_BASE + \
- offsetof(struct kvm_vm_data, kvm_mem_dirty_log)
-
-#define VHPT_BASE(n) (VCPU_BASE(n) + offsetof(struct kvm_vcpu_data, vcpu_vhpt))
-#define VTLB_BASE(n) (VCPU_BASE(n) + offsetof(struct kvm_vcpu_data, vcpu_vtlb))
-#define VPD_BASE(n) (VCPU_BASE(n) + offsetof(struct kvm_vcpu_data, vcpu_vpd))
-#define VCPU_STRUCT_BASE(n) (VCPU_BASE(n) + \
- offsetof(struct kvm_vcpu_data, vcpu_struct))
-
-/*IO section definitions*/
-#define IOREQ_READ 1
-#define IOREQ_WRITE 0
-
-#define STATE_IOREQ_NONE 0
-#define STATE_IOREQ_READY 1
-#define STATE_IOREQ_INPROCESS 2
-#define STATE_IORESP_READY 3
-
-/*Guest Physical address layout.*/
-#define GPFN_MEM (0UL << 60) /* Guest pfn is normal mem */
-#define GPFN_FRAME_BUFFER (1UL << 60) /* VGA framebuffer */
-#define GPFN_LOW_MMIO (2UL << 60) /* Low MMIO range */
-#define GPFN_PIB (3UL << 60) /* PIB base */
-#define GPFN_IOSAPIC (4UL << 60) /* IOSAPIC base */
-#define GPFN_LEGACY_IO (5UL << 60) /* Legacy I/O base */
-#define GPFN_GFW (6UL << 60) /* Guest Firmware */
-#define GPFN_PHYS_MMIO (7UL << 60) /* Directed MMIO Range */
-
-#define GPFN_IO_MASK (7UL << 60) /* Guest pfn is I/O type */
-#define GPFN_INV_MASK (1UL << 63) /* Guest pfn is invalid */
-#define INVALID_MFN (~0UL)
-#define MEM_G (1UL << 30)
-#define MEM_M (1UL << 20)
-#define MMIO_START (3 * MEM_G)
-#define MMIO_SIZE (512 * MEM_M)
-#define VGA_IO_START 0xA0000UL
-#define VGA_IO_SIZE 0x20000
-#define LEGACY_IO_START (MMIO_START + MMIO_SIZE)
-#define LEGACY_IO_SIZE (64 * MEM_M)
-#define IO_SAPIC_START 0xfec00000UL
-#define IO_SAPIC_SIZE 0x100000
-#define PIB_START 0xfee00000UL
-#define PIB_SIZE 0x200000
-#define GFW_START (4 * MEM_G - 16 * MEM_M)
-#define GFW_SIZE (16 * MEM_M)
-
-/*Deliver mode, defined for ioapic.c*/
-#define dest_Fixed IOSAPIC_FIXED
-#define dest_LowestPrio IOSAPIC_LOWEST_PRIORITY
-
-#define NMI_VECTOR 2
-#define ExtINT_VECTOR 0
-#define NULL_VECTOR (-1)
-#define IA64_SPURIOUS_INT_VECTOR 0x0f
-
-#define VCPU_LID(v) (((u64)(v)->vcpu_id) << 24)
-
-/*
- *Delivery mode
- */
-#define SAPIC_DELIV_SHIFT 8
-#define SAPIC_FIXED 0x0
-#define SAPIC_LOWEST_PRIORITY 0x1
-#define SAPIC_PMI 0x2
-#define SAPIC_NMI 0x4
-#define SAPIC_INIT 0x5
-#define SAPIC_EXTINT 0x7
-
-/*
- * vcpu->requests bit members for arch
- */
-#define KVM_REQ_PTC_G 32
-#define KVM_REQ_RESUME 33
-
-#define KVM_HPAGE_GFN_SHIFT(x) 0
-#define KVM_NR_PAGE_SIZES 1
-#define KVM_PAGES_PER_HPAGE(x) 1
-
-struct kvm;
-struct kvm_vcpu;
-
-struct kvm_mmio_req {
- uint64_t addr; /* physical address */
- uint64_t size; /* size in bytes */
- uint64_t data; /* data (or paddr of data) */
- uint8_t state:4;
- uint8_t dir:1; /* 1=read, 0=write */
-};
-
-/*Pal data struct */
-struct kvm_pal_call{
- /*In area*/
- uint64_t gr28;
- uint64_t gr29;
- uint64_t gr30;
- uint64_t gr31;
- /*Out area*/
- struct ia64_pal_retval ret;
-};
-
-/* Sal data structure */
-struct kvm_sal_call{
- /*In area*/
- uint64_t in0;
- uint64_t in1;
- uint64_t in2;
- uint64_t in3;
- uint64_t in4;
- uint64_t in5;
- uint64_t in6;
- uint64_t in7;
- struct sal_ret_values ret;
-};
-
-/*Guest change rr6*/
-struct kvm_switch_rr6 {
- uint64_t old_rr;
- uint64_t new_rr;
-};
-
-union ia64_ipi_a{
- unsigned long val;
- struct {
- unsigned long rv : 3;
- unsigned long ir : 1;
- unsigned long eid : 8;
- unsigned long id : 8;
- unsigned long ib_base : 44;
- };
-};
-
-union ia64_ipi_d {
- unsigned long val;
- struct {
- unsigned long vector : 8;
- unsigned long dm : 3;
- unsigned long ig : 53;
- };
-};
-
-/*ipi check exit data*/
-struct kvm_ipi_data{
- union ia64_ipi_a addr;
- union ia64_ipi_d data;
-};
-
-/*global purge data*/
-struct kvm_ptc_g {
- unsigned long vaddr;
- unsigned long rr;
- unsigned long ps;
- struct kvm_vcpu *vcpu;
-};
-
-/*Exit control data */
-struct exit_ctl_data{
- uint32_t exit_reason;
- uint32_t vm_status;
- union {
- struct kvm_mmio_req ioreq;
- struct kvm_pal_call pal_data;
- struct kvm_sal_call sal_data;
- struct kvm_switch_rr6 rr_data;
- struct kvm_ipi_data ipi_data;
- struct kvm_ptc_g ptc_g_data;
- } u;
-};
-
-union pte_flags {
- unsigned long val;
- struct {
- unsigned long p : 1; /*0 */
- unsigned long : 1; /* 1 */
- unsigned long ma : 3; /* 2-4 */
- unsigned long a : 1; /* 5 */
- unsigned long d : 1; /* 6 */
- unsigned long pl : 2; /* 7-8 */
- unsigned long ar : 3; /* 9-11 */
- unsigned long ppn : 38; /* 12-49 */
- unsigned long : 2; /* 50-51 */
- unsigned long ed : 1; /* 52 */
- };
-};
-
-union ia64_pta {
- unsigned long val;
- struct {
- unsigned long ve : 1;
- unsigned long reserved0 : 1;
- unsigned long size : 6;
- unsigned long vf : 1;
- unsigned long reserved1 : 6;
- unsigned long base : 49;
- };
-};
-
-struct thash_cb {
- /* THASH base information */
- struct thash_data *hash; /* hash table pointer */
- union ia64_pta pta;
- int num;
-};
-
-struct kvm_vcpu_stat {
-};
-
-struct kvm_vcpu_arch {
- int launched;
- int last_exit;
- int last_run_cpu;
- int vmm_tr_slot;
- int vm_tr_slot;
- int sn_rtc_tr_slot;
-
-#define KVM_MP_STATE_RUNNABLE 0
-#define KVM_MP_STATE_UNINITIALIZED 1
-#define KVM_MP_STATE_INIT_RECEIVED 2
-#define KVM_MP_STATE_HALTED 3
- int mp_state;
-
-#define MAX_PTC_G_NUM 3
- int ptc_g_count;
- struct kvm_ptc_g ptc_g_data[MAX_PTC_G_NUM];
-
- /*halt timer to wake up sleepy vcpus*/
- struct hrtimer hlt_timer;
- long ht_active;
-
- struct kvm_lapic *apic; /* kernel irqchip context */
- struct vpd *vpd;
-
- /* Exit data for vmm_transition*/
- struct exit_ctl_data exit_data;
-
- cpumask_t cache_coherent_map;
-
- unsigned long vmm_rr;
- unsigned long host_rr6;
- unsigned long psbits[8];
- unsigned long cr_iipa;
- unsigned long cr_isr;
- unsigned long vsa_base;
- unsigned long dirty_log_lock_pa;
- unsigned long __gp;
- /* TR and TC. */
- struct thash_data itrs[NITRS];
- struct thash_data dtrs[NDTRS];
- /* Bit is set if there is a tr/tc for the region. */
- unsigned char itr_regions;
- unsigned char dtr_regions;
- unsigned char tc_regions;
- /* purge all */
- unsigned long ptce_base;
- unsigned long ptce_count[2];
- unsigned long ptce_stride[2];
- /* itc/itm */
- unsigned long last_itc;
- long itc_offset;
- unsigned long itc_check;
- unsigned long timer_check;
- unsigned int timer_pending;
- unsigned int timer_fired;
-
- unsigned long vrr[8];
- unsigned long ibr[8];
- unsigned long dbr[8];
- unsigned long insvc[4]; /* Interrupt in service. */
- unsigned long xtp;
-
- unsigned long metaphysical_rr0; /* from kvm_arch (so is pinned) */
- unsigned long metaphysical_rr4; /* from kvm_arch (so is pinned) */
- unsigned long metaphysical_saved_rr0; /* from kvm_arch */
- unsigned long metaphysical_saved_rr4; /* from kvm_arch */
- unsigned long fp_psr; /*used for lazy float register */
- unsigned long saved_gp;
- /*for phycial emulation */
- int mode_flags;
- struct thash_cb vtlb;
- struct thash_cb vhpt;
- char irq_check;
- char irq_new_pending;
-
- unsigned long opcode;
- unsigned long cause;
- char log_buf[VMM_LOG_LEN];
- union context host;
- union context guest;
-};
-
-struct kvm_vm_stat {
- u64 remote_tlb_flush;
-};
-
-struct kvm_sal_data {
- unsigned long boot_ip;
- unsigned long boot_gp;
-};
-
-struct kvm_arch {
- spinlock_t dirty_log_lock;
-
- unsigned long vm_base;
- unsigned long metaphysical_rr0;
- unsigned long metaphysical_rr4;
- unsigned long vmm_init_rr;
-
- int is_sn2;
-
- struct kvm_ioapic *vioapic;
- struct kvm_vm_stat stat;
- struct kvm_sal_data rdv_sal_data;
-
- struct list_head assigned_dev_head;
- struct iommu_domain *iommu_domain;
- int iommu_flags;
-
- unsigned long irq_sources_bitmap;
- unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
-};
-
-union cpuid3_t {
- u64 value;
- struct {
- u64 number : 8;
- u64 revision : 8;
- u64 model : 8;
- u64 family : 8;
- u64 archrev : 8;
- u64 rv : 24;
- };
-};
-
-struct kvm_pt_regs {
- /* The following registers are saved by SAVE_MIN: */
- unsigned long b6; /* scratch */
- unsigned long b7; /* scratch */
-
- unsigned long ar_csd; /* used by cmp8xchg16 (scratch) */
- unsigned long ar_ssd; /* reserved for future use (scratch) */
-
- unsigned long r8; /* scratch (return value register 0) */
- unsigned long r9; /* scratch (return value register 1) */
- unsigned long r10; /* scratch (return value register 2) */
- unsigned long r11; /* scratch (return value register 3) */
-
- unsigned long cr_ipsr; /* interrupted task's psr */
- unsigned long cr_iip; /* interrupted task's instruction pointer */
- unsigned long cr_ifs; /* interrupted task's function state */
-
- unsigned long ar_unat; /* interrupted task's NaT register (preserved) */
- unsigned long ar_pfs; /* prev function state */
- unsigned long ar_rsc; /* RSE configuration */
- /* The following two are valid only if cr_ipsr.cpl > 0: */
- unsigned long ar_rnat; /* RSE NaT */
- unsigned long ar_bspstore; /* RSE bspstore */
-
- unsigned long pr; /* 64 predicate registers (1 bit each) */
- unsigned long b0; /* return pointer (bp) */
- unsigned long loadrs; /* size of dirty partition << 16 */
-
- unsigned long r1; /* the gp pointer */
- unsigned long r12; /* interrupted task's memory stack pointer */
- unsigned long r13; /* thread pointer */
-
- unsigned long ar_fpsr; /* floating point status (preserved) */
- unsigned long r15; /* scratch */
-
- /* The remaining registers are NOT saved for system calls. */
- unsigned long r14; /* scratch */
- unsigned long r2; /* scratch */
- unsigned long r3; /* scratch */
- unsigned long r16; /* scratch */
- unsigned long r17; /* scratch */
- unsigned long r18; /* scratch */
- unsigned long r19; /* scratch */
- unsigned long r20; /* scratch */
- unsigned long r21; /* scratch */
- unsigned long r22; /* scratch */
- unsigned long r23; /* scratch */
- unsigned long r24; /* scratch */
- unsigned long r25; /* scratch */
- unsigned long r26; /* scratch */
- unsigned long r27; /* scratch */
- unsigned long r28; /* scratch */
- unsigned long r29; /* scratch */
- unsigned long r30; /* scratch */
- unsigned long r31; /* scratch */
- unsigned long ar_ccv; /* compare/exchange value (scratch) */
-
- /*
- * Floating point registers that the kernel considers scratch:
- */
- struct ia64_fpreg f6; /* scratch */
- struct ia64_fpreg f7; /* scratch */
- struct ia64_fpreg f8; /* scratch */
- struct ia64_fpreg f9; /* scratch */
- struct ia64_fpreg f10; /* scratch */
- struct ia64_fpreg f11; /* scratch */
-
- unsigned long r4; /* preserved */
- unsigned long r5; /* preserved */
- unsigned long r6; /* preserved */
- unsigned long r7; /* preserved */
- unsigned long eml_unat; /* used for emulating instruction */
- unsigned long pad0; /* alignment pad */
-};
-
-static inline struct kvm_pt_regs *vcpu_regs(struct kvm_vcpu *v)
-{
- return (struct kvm_pt_regs *) ((unsigned long) v + KVM_STK_OFFSET) - 1;
-}
-
-typedef int kvm_vmm_entry(void);
-typedef void kvm_tramp_entry(union context *host, union context *guest);
-
-struct kvm_vmm_info{
- struct module *module;
- kvm_vmm_entry *vmm_entry;
- kvm_tramp_entry *tramp_entry;
- unsigned long vmm_ivt;
- unsigned long patch_mov_ar;
- unsigned long patch_mov_ar_sn2;
-};
-
-int kvm_highest_pending_irq(struct kvm_vcpu *vcpu);
-int kvm_emulate_halt(struct kvm_vcpu *vcpu);
-int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
-void kvm_sal_emul(struct kvm_vcpu *vcpu);
-
-#define __KVM_HAVE_ARCH_VM_ALLOC 1
-struct kvm *kvm_arch_alloc_vm(void);
-void kvm_arch_free_vm(struct kvm *kvm);
-
-#endif /* __ASSEMBLY__*/
-
-#endif
diff --git a/linux/include/asm-ia64/kvm_para.h b/linux/include/asm-ia64/kvm_para.h
deleted file mode 100644
index 2e2f499..0000000
--- a/linux/include/asm-ia64/kvm_para.h
+++ /dev/null
@@ -1,71 +0,0 @@
-#ifndef KVM_UNIFDEF_H
-#define KVM_UNIFDEF_H
-
-#ifdef __i386__
-#ifndef CONFIG_X86_32
-#define CONFIG_X86_32 1
-#endif
-#endif
-
-#ifdef __x86_64__
-#ifndef CONFIG_X86_64
-#define CONFIG_X86_64 1
-#endif
-#endif
-
-#if defined(__i386__) || defined (__x86_64__)
-#ifndef CONFIG_X86
-#define CONFIG_X86 1
-#endif
-#endif
-
-#ifdef __ia64__
-#ifndef CONFIG_IA64
-#define CONFIG_IA64 1
-#endif
-#endif
-
-#ifdef __PPC__
-#ifndef CONFIG_PPC
-#define CONFIG_PPC 1
-#endif
-#endif
-
-#ifdef __s390__
-#ifndef CONFIG_S390
-#define CONFIG_S390 1
-#endif
-#endif
-
-#endif
-#ifndef __IA64_KVM_PARA_H
-#define __IA64_KVM_PARA_H
-
-/*
- * Copyright (C) 2007 Xiantao Zhang <xiantao.zhang@intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
- */
-
-#ifdef __KERNEL__
-
-static inline unsigned int kvm_arch_para_features(void)
-{
- return 0;
-}
-
-#endif
-
-#endif
diff --git a/linux/include/asm-x86/hyperv.h b/linux/include/asm-x86/hyperv.h
index 8b44b46..e33dcc8 100644
--- a/linux/include/asm-x86/hyperv.h
+++ b/linux/include/asm-x86/hyperv.h
@@ -38,8 +38,8 @@
#endif
#endif
-#ifndef _ASM_X86_HYPERV_H
-#define _ASM_X86_HYPERV_H
+#ifndef _ASM_X86_KVM_HYPERV_H
+#define _ASM_X86_KVM_HYPERV_H
#include <linux/types.h>
@@ -54,10 +54,6 @@
#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
-#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000
-#define HYPERV_CPUID_MIN 0x40000005
-#define HYPERV_CPUID_MAX 0x4000ffff
-
/*
* Feature identification. EAX indicates which features are available
* to the partition based upon the current partition privileges.
@@ -173,9 +169,6 @@
/* MSR used to provide vcpu index */
#define HV_X64_MSR_VP_INDEX 0x40000002
-/* MSR used to read the per-partition time reference counter */
-#define HV_X64_MSR_TIME_REF_COUNT 0x40000020
-
/* Define the virtual APIC registers */
#define HV_X64_MSR_EOI 0x40000070
#define HV_X64_MSR_ICR 0x40000071
diff --git a/linux/include/asm-x86/kvm.h b/linux/include/asm-x86/kvm.h
index 12ddb51..7cf06d2 100644
--- a/linux/include/asm-x86/kvm.h
+++ b/linux/include/asm-x86/kvm.h
@@ -61,9 +61,6 @@
#define __KVM_HAVE_PIT_STATE2
#define __KVM_HAVE_XEN_HVM
#define __KVM_HAVE_VCPU_EVENTS
-#define __KVM_HAVE_DEBUGREGS
-#define __KVM_HAVE_XSAVE
-#define __KVM_HAVE_XCRS
/* Architectural interrupt line count. */
#define KVM_NR_INTERRUPTS 256
@@ -300,11 +297,6 @@ struct kvm_reinject_control {
/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */
#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001
#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002
-#define KVM_VCPUEVENT_VALID_SHADOW 0x00000004
-
-/* Interrupt shadow states */
-#define KVM_X86_SHADOW_INT_MOV_SS 0x01
-#define KVM_X86_SHADOW_INT_STI 0x02
/* for KVM_GET/SET_VCPU_EVENTS */
struct kvm_vcpu_events {
@@ -319,7 +311,7 @@ struct kvm_vcpu_events {
__u8 injected;
__u8 nr;
__u8 soft;
- __u8 shadow;
+ __u8 pad;
} interrupt;
struct {
__u8 injected;
@@ -332,33 +324,4 @@ struct kvm_vcpu_events {
__u32 reserved[10];
};
-/* for KVM_GET/SET_DEBUGREGS */
-struct kvm_debugregs {
- __u64 db[4];
- __u64 dr6;
- __u64 dr7;
- __u64 flags;
- __u64 reserved[9];
-};
-
-/* for KVM_CAP_XSAVE */
-struct kvm_xsave {
- __u32 region[1024];
-};
-
-#define KVM_MAX_XCRS 16
-
-struct kvm_xcr {
- __u32 xcr;
- __u32 reserved;
- __u64 value;
-};
-
-struct kvm_xcrs {
- __u32 nr_xcrs;
- __u32 flags;
- struct kvm_xcr xcrs[KVM_MAX_XCRS];
- __u64 padding[16];
-};
-
#endif /* _ASM_X86_KVM_H */
diff --git a/linux/include/asm-x86/kvm_emulate.h b/linux/include/asm-x86/kvm_emulate.h
index a57ee7e..6f7d0f6 100644
--- a/linux/include/asm-x86/kvm_emulate.h
+++ b/linux/include/asm-x86/kvm_emulate.h
@@ -51,18 +51,8 @@
#ifndef _ASM_X86_KVM_X86_EMULATE_H
#define _ASM_X86_KVM_X86_EMULATE_H
-
-
struct x86_emulate_ctxt;
-struct x86_exception {
- u8 vector;
- bool error_code_valid;
- u16 error_code;
- bool nested_page_fault;
- u64 address; /* cr2 or nested page fault gpa */
-};
-
/*
* x86_emulate_ops:
*
@@ -99,10 +89,8 @@ struct x86_exception {
#define X86EMUL_UNHANDLEABLE 1
/* Terminate emulation but return success to the caller. */
#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
-#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */
-#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */
-#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */
-
+#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */
+#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */
struct x86_emulate_ops {
/*
* read_std: Read bytes of standard (non-emulated/special) memory.
@@ -112,20 +100,9 @@ struct x86_emulate_ops {
* @bytes: [IN ] Number of bytes to read from memory.
*/
int (*read_std)(unsigned long addr, void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu,
- struct x86_exception *fault);
+ unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
/*
- * write_std: Write bytes of standard (non-emulated/special) memory.
- * Used for descriptor writing.
- * @addr: [IN ] Linear address to which to write.
- * @val: [OUT] Value write to memory, zero-extended to 'u_long'.
- * @bytes: [IN ] Number of bytes to write to memory.
- */
- int (*write_std)(unsigned long addr, void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu,
- struct x86_exception *fault);
- /*
* fetch: Read bytes of standard (non-emulated/special) memory.
* Used for instruction fetch.
* @addr: [IN ] Linear address from which to read.
@@ -133,8 +110,7 @@ struct x86_emulate_ops {
* @bytes: [IN ] Number of bytes to read from memory.
*/
int (*fetch)(unsigned long addr, void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu,
- struct x86_exception *fault);
+ unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
/*
* read_emulated: Read bytes from emulated/special memory area.
@@ -145,7 +121,6 @@ struct x86_emulate_ops {
int (*read_emulated)(unsigned long addr,
void *val,
unsigned int bytes,
- struct x86_exception *fault,
struct kvm_vcpu *vcpu);
/*
@@ -158,7 +133,6 @@ struct x86_emulate_ops {
int (*write_emulated)(unsigned long addr,
const void *val,
unsigned int bytes,
- struct x86_exception *fault,
struct kvm_vcpu *vcpu);
/*
@@ -173,53 +147,15 @@ struct x86_emulate_ops {
const void *old,
const void *new,
unsigned int bytes,
- struct x86_exception *fault,
struct kvm_vcpu *vcpu);
- int (*pio_in_emulated)(int size, unsigned short port, void *val,
- unsigned int count, struct kvm_vcpu *vcpu);
-
- int (*pio_out_emulated)(int size, unsigned short port, const void *val,
- unsigned int count, struct kvm_vcpu *vcpu);
-
- bool (*get_cached_descriptor)(struct kvm_desc_struct *desc,
- int seg, struct kvm_vcpu *vcpu);
- void (*set_cached_descriptor)(struct kvm_desc_struct *desc,
- int seg, struct kvm_vcpu *vcpu);
- u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
- void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
- unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu);
- void (*get_gdt)(struct kvm_desc_ptr *dt, struct kvm_vcpu *vcpu);
- void (*get_idt)(struct kvm_desc_ptr *dt, struct kvm_vcpu *vcpu);
- ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
- int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
- int (*cpl)(struct kvm_vcpu *vcpu);
- int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu);
- int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu);
- int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
- int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
};
/* Type, address-of, and value of an instruction's operand. */
struct operand {
enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
unsigned int bytes;
- union {
- unsigned long orig_val;
- u64 orig_val64;
- };
- union {
- unsigned long *reg;
- struct segmented_address {
- ulong ea;
- unsigned seg;
- } mem;
- } addr;
- union {
- unsigned long val;
- u64 val64;
- char valptr[sizeof(unsigned long) + 2];
- };
+ unsigned long val, orig_val, *ptr;
};
struct fetch_cache {
@@ -228,12 +164,6 @@ struct fetch_cache {
unsigned long end;
};
-struct read_cache {
- u8 data[1024];
- unsigned long pos;
- unsigned long end;
-};
-
struct decode_cache {
u8 twobyte;
u8 b;
@@ -248,29 +178,29 @@ struct decode_cache {
bool has_seg_override;
u8 seg_override;
unsigned int d;
- int (*execute)(struct x86_emulate_ctxt *ctxt);
unsigned long regs[NR_VCPU_REGS];
- unsigned long eip;
+ unsigned long eip, eip_orig;
/* modrm */
u8 modrm;
u8 modrm_mod;
u8 modrm_reg;
u8 modrm_rm;
- u8 modrm_seg;
+ u8 use_modrm_ea;
bool rip_relative;
+ unsigned long modrm_ea;
+ void *modrm_ptr;
+ unsigned long modrm_val;
struct fetch_cache fetch;
- struct read_cache io_read;
- struct read_cache mem_read;
};
-struct x86_emulate_ctxt {
- struct x86_emulate_ops *ops;
+#define X86_SHADOW_INT_MOV_SS 1
+#define X86_SHADOW_INT_STI 2
+struct x86_emulate_ctxt {
/* Register state before/after emulation. */
struct kvm_vcpu *vcpu;
unsigned long eflags;
- unsigned long eip; /* eip before instruction emulation */
/* Emulated execution mode, represented by an X86EMUL_MODE value. */
int mode;
u32 cs_base;
@@ -278,11 +208,6 @@ struct x86_emulate_ctxt {
/* interruptibility state, as a result of execution of STI or MOV SS */
int interruptibility;
- bool perm_ok; /* do not check permissions if true */
-
- bool have_exception;
- struct x86_exception exception;
-
/* decode cache */
struct decode_cache decode;
};
@@ -305,14 +230,9 @@ struct x86_emulate_ctxt {
#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
#endif
-int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
-#define EMULATION_FAILED -1
-#define EMULATION_OK 0
-#define EMULATION_RESTART 1
-int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
-int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
- u16 tss_selector, int reason,
- bool has_error_code, u32 error_code);
-int emulate_int_real(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops, int irq);
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops);
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops);
+
#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/linux/include/asm-x86/kvm_host.h b/linux/include/asm-x86/kvm_host.h
index 0c5e1f7..3090d71 100644
--- a/linux/include/asm-x86/kvm_host.h
+++ b/linux/include/asm-x86/kvm_host.h
@@ -55,7 +55,6 @@
#include <linux/mm.h>
#include <linux/mmu_notifier.h>
#include <linux/tracepoint.h>
-#include <linux/cpumask.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
@@ -80,14 +79,11 @@
0xFFFFFF0000000000ULL)
#define INVALID_PAGE (~(hpa_t)0)
-#define VALID_PAGE(x) ((x) != INVALID_PAGE)
-
#define UNMAPPED_GVA (~(gpa_t)0)
/* KVM Hugepage definitions for x86 */
#define KVM_NR_PAGE_SIZES 3
-#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9)
-#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
+#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9))
#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
@@ -113,24 +109,23 @@
#define IOPL_SHIFT 12
+#define KVM_ALIAS_SLOTS 4
+
#define KVM_PERMILLE_MMU_PAGES 20
#define KVM_MIN_ALLOC_MMU_PAGES 64
#define KVM_MMU_HASH_SHIFT 10
#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
#define KVM_MIN_FREE_MMU_PAGES 5
#define KVM_REFILL_PAGES 25
-#define KVM_MAX_CPUID_ENTRIES 80
+#define KVM_MAX_CPUID_ENTRIES 40
#define KVM_NR_FIXED_MTRR_REGION 88
#define KVM_NR_VAR_MTRR 8
-#define ASYNC_PF_PER_VCPU 64
-
extern spinlock_t kvm_lock;
extern struct list_head vm_list;
struct kvm_vcpu;
struct kvm;
-struct kvm_async_pf;
enum kvm_reg {
VCPU_REGS_RAX = 0,
@@ -157,7 +152,6 @@ enum kvm_reg {
enum kvm_reg_ex {
VCPU_EXREG_PDPTR = NR_VCPU_REGS,
- VCPU_EXREG_CR3,
};
enum {
@@ -217,15 +211,15 @@ struct kvm_pte_chain {
union kvm_mmu_page_role {
unsigned word;
struct {
+ unsigned glevels:4;
unsigned level:4;
- unsigned cr4_pae:1;
unsigned quadrant:2;
unsigned pad_for_nice_hex_output:6;
unsigned direct:1;
unsigned access:3;
unsigned invalid:1;
+ unsigned cr4_pge:1;
unsigned nxe:1;
- unsigned cr0_wp:1;
};
};
@@ -233,6 +227,8 @@ struct kvm_mmu_page {
struct list_head link;
struct hlist_node hash_link;
+ struct list_head oos_link;
+
/*
* The following two entries are used to key the shadow page in the
* hash table.
@@ -248,9 +244,9 @@ struct kvm_mmu_page {
* in this shadow page.
*/
DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
- bool multimapped; /* More than one parent_pte? */
- bool unsync;
+ int multimapped; /* More than one parent_pte? */
int root_count; /* Currently serving as active root */
+ bool unsync;
unsigned int unsync_children;
union {
u64 *parent_pte; /* !multimapped */
@@ -268,9 +264,14 @@ struct kvm_pv_mmu_op_buffer {
struct kvm_pio_request {
unsigned long count;
+ int cur_count;
+ gva_t guest_gva;
int in;
int port;
int size;
+ int string;
+ int down;
+ int rep;
};
/*
@@ -280,16 +281,10 @@ struct kvm_pio_request {
*/
struct kvm_mmu {
void (*new_cr3)(struct kvm_vcpu *vcpu);
- void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
- unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
- int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err,
- bool prefault);
- void (*inject_page_fault)(struct kvm_vcpu *vcpu,
- struct x86_exception *fault);
+ int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
void (*free)(struct kvm_vcpu *vcpu);
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
- struct x86_exception *exception);
- gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
+ u32 *error);
void (*prefetch_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page);
int (*sync_page)(struct kvm_vcpu *vcpu,
@@ -299,18 +294,13 @@ struct kvm_mmu {
int root_level;
int shadow_root_level;
union kvm_mmu_page_role base_role;
- bool direct_map;
u64 *pae_root;
- u64 *lm_root;
u64 rsvd_bits_mask[2][4];
-
- bool nx;
-
- u64 pdptrs[4]; /* pae */
};
struct kvm_vcpu_arch {
+ u64 host_tsc;
/*
* rip and regs accesses must go through
* kvm_{register,rip}_{read,write} functions.
@@ -327,6 +317,7 @@ struct kvm_vcpu_arch {
unsigned long cr4_guest_owned_bits;
unsigned long cr8;
u32 hflags;
+ u64 pdptrs[4]; /* pae */
u64 efer;
u64 apic_base;
struct kvm_lapic *apic; /* kernel irqchip context */
@@ -336,31 +327,7 @@ struct kvm_vcpu_arch {
u64 ia32_misc_enable_msr;
bool tpr_access_reporting;
- /*
- * Paging state of the vcpu
- *
- * If the vcpu runs in guest mode with two level paging this still saves
- * the paging mode of the l1 guest. This context is always used to
- * handle faults.
- */
struct kvm_mmu mmu;
-
- /*
- * Paging state of an L2 guest (used for nested npt)
- *
- * This context will save all necessary information to walk page tables
- * of the an L2 guest. This context is only initialized for page table
- * walking and not for faulting since we never handle l2 page faults on
- * the host.
- */
- struct kvm_mmu nested_mmu;
-
- /*
- * Pointer to the mmu context currently used for
- * gva_to_gpa translations.
- */
- struct kvm_mmu *walk_mmu;
-
/* only needed in kvm_pv_mmu_op() path, but it's hot so
* put it here to avoid allocation */
struct kvm_pv_mmu_op_buffer mmu_op_buffer;
@@ -381,8 +348,8 @@ struct kvm_vcpu_arch {
unsigned long mmu_seq;
} update_pte;
- struct kvm_compat_fpu guest_fpu;
- u64 xcr0;
+ struct i387_fxsave_struct host_fx_image;
+ struct i387_fxsave_struct guest_fx_image;
gva_t mmio_fault_cr2;
struct kvm_pio_request pio;
@@ -393,7 +360,6 @@ struct kvm_vcpu_arch {
struct kvm_queued_exception {
bool pending;
bool has_error_code;
- bool reinject;
u8 nr;
u32 error_code;
} exception;
@@ -413,16 +379,10 @@ struct kvm_vcpu_arch {
struct x86_emulate_ctxt emulate_ctxt;
gpa_t time;
- struct kvm_pvclock_vcpu_time_info hv_clock;
- unsigned int hw_tsc_khz;
+ struct pvclock_vcpu_time_info hv_clock;
+ unsigned int hv_clock_tsc_khz;
unsigned int time_offset;
struct page *time_page;
- u64 last_host_tsc;
- u64 last_guest_tsc;
- u64 last_kernel_ns;
- u64 last_tsc_nsec;
- u64 last_tsc_write;
- bool tsc_catchup;
bool nmi_pending;
bool nmi_injected;
@@ -442,28 +402,33 @@ struct kvm_vcpu_arch {
u64 *mce_banks;
/* used for guest single stepping over the given code position */
+ u16 singlestep_cs;
unsigned long singlestep_rip;
-
/* fields used by HYPER-V emulation */
u64 hv_vapic;
+};
- cpumask_var_t wbinvd_dirty_mask;
+struct kvm_mem_alias {
+ gfn_t base_gfn;
+ unsigned long npages;
+ gfn_t target_gfn;
+#define KVM_ALIAS_INVALID 1UL
+ unsigned long flags;
+};
- struct {
- bool halted;
- gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
- struct gfn_to_hva_cache data;
- u64 msr_val;
- u32 id;
- bool send_user_only;
- } apf;
+#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION
+
+struct kvm_mem_aliases {
+ struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
+ int naliases;
};
struct kvm_arch {
- unsigned int n_used_mmu_pages;
+ struct kvm_mem_aliases *aliases;
+
+ unsigned int n_free_mmu_pages;
unsigned int n_requested_mmu_pages;
- unsigned int n_max_mmu_pages;
- atomic_t invlpg_counter;
+ unsigned int n_alloc_mmu_pages;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
* Hash table of struct kvm_mmu_page.
@@ -487,24 +452,14 @@ struct kvm_arch {
gpa_t ept_identity_map_addr;
unsigned long irq_sources_bitmap;
+ u64 vm_init_tsc;
s64 kvmclock_offset;
- spinlock_t tsc_write_lock;
- u64 last_tsc_nsec;
- u64 last_tsc_offset;
- u64 last_tsc_write;
- u32 virtual_tsc_khz;
- u32 virtual_tsc_mult;
- s8 virtual_tsc_shift;
struct kvm_xen_hvm_config xen_hvm_config;
/* fields used by HYPER-V emulation */
u64 hv_guest_os_id;
u64 hv_hypercall;
-
- #ifdef CONFIG_KVM_MMU_AUDIT
- int audit_point;
- #endif
};
struct kvm_vm_stat {
@@ -546,6 +501,11 @@ struct kvm_vcpu_stat {
u32 nmi_injections;
};
+struct descriptor_table {
+ u16 limit;
+ unsigned long base;
+} __attribute__((packed));
+
struct kvm_x86_ops {
int (*cpu_has_kvm_support)(void); /* __init */
int (*disabled_by_bios)(void); /* __init */
@@ -578,17 +538,17 @@ struct kvm_x86_ops {
struct kvm_segment *var, int seg);
void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
- void (*decache_cr3)(struct kvm_vcpu *vcpu);
void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
- void (*get_idt)(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt);
- void (*set_idt)(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt);
- void (*get_gdt)(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt);
- void (*set_gdt)(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt);
- void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
+ void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+ void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+ void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+ void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+ int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest);
+ int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value);
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
@@ -607,9 +567,7 @@ struct kvm_x86_ops {
void (*set_irq)(struct kvm_vcpu *vcpu);
void (*set_nmi)(struct kvm_vcpu *vcpu);
void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
- bool has_error_code, u32 error_code,
- bool reinject);
- void (*cancel_injection)(struct kvm_vcpu *vcpu);
+ bool has_error_code, u32 error_code);
int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
int (*nmi_allowed)(struct kvm_vcpu *vcpu);
bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
@@ -622,27 +580,10 @@ struct kvm_x86_ops {
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
int (*get_lpage_level)(void);
bool (*rdtscp_supported)(void);
- void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment);
-
- void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
-
- void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
- bool (*has_wbinvd_exit)(void);
-
- void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
-
- void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
const struct trace_print_flags *exit_reasons_str;
};
-struct kvm_arch_async_pf {
- u32 token;
- gfn_t gfn;
- unsigned long cr3;
- bool direct_map;
-};
-
extern struct kvm_x86_ops *kvm_x86_ops;
int kvm_mmu_module_init(void);
@@ -652,6 +593,7 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
int kvm_mmu_setup(struct kvm_vcpu *vcpu);
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
+void kvm_mmu_set_base_ptes(u64 base_pte);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask);
@@ -661,7 +603,7 @@ void kvm_mmu_zap_all(struct kvm *kvm);
unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
-int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes);
@@ -680,47 +622,49 @@ enum emulation_result {
#define EMULTYPE_NO_DECODE (1 << 0)
#define EMULTYPE_TRAP_UD (1 << 1)
#define EMULTYPE_SKIP (1 << 2)
-int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
- int emulation_type, void *insn, int insn_len);
-
-static inline int emulate_instruction(struct kvm_vcpu *vcpu,
- int emulation_type)
-{
- return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
-}
-
+int emulate_instruction(struct kvm_vcpu *vcpu,
+ unsigned long cr2, u16 error_code, int emulation_type);
+void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
+void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
+ unsigned long *rflags);
+unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
+void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
+ unsigned long *rflags);
void kvm_enable_efer_bits(u64);
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
struct x86_emulate_ctxt;
-int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
+int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in,
+ int size, unsigned port);
+int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
+ int size, unsigned long count, int down,
+ gva_t address, int rep, unsigned port);
void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
int emulate_clts(struct kvm_vcpu *vcpu);
-int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
+int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
+ unsigned long *dest);
+int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
+ unsigned long value);
void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
- bool has_error_code, u32 error_code);
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason);
-int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
-int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
-int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
-int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
-int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
+void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
+void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
-int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
@@ -730,20 +674,22 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
-void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
-void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
-int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gfn_t gfn, void *data, int offset, int len,
- u32 access);
-void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
+ u32 error_code);
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
int kvm_pic_set_irq(void *opaque, int irq, int level);
void kvm_inject_nmi(struct kvm_vcpu *vcpu);
-int fx_init(struct kvm_vcpu *vcpu);
+void fx_init(struct kvm_vcpu *vcpu);
+
+int emulator_write_emulated(unsigned long addr,
+ const void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu);
+
+unsigned long segment_base(u16 selector);
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -754,29 +700,27 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
-gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
- struct x86_exception *exception);
-gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
- struct x86_exception *exception);
-gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
- struct x86_exception *exception);
-gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
- struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
+gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
- void *insn, int insn_len);
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
void kvm_enable_tdp(void);
void kvm_disable_tdp(void);
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
int complete_pio(struct kvm_vcpu *vcpu);
bool kvm_check_iopl(struct kvm_vcpu *vcpu);
+struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
+
static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
{
struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
@@ -784,6 +728,20 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
return (struct kvm_mmu_page *)page_private(page);
}
+static inline u16 kvm_read_fs(void)
+{
+ u16 seg;
+ asm("mov %%fs, %0" : "=g"(seg));
+ return seg;
+}
+
+static inline u16 kvm_read_gs(void)
+{
+ u16 seg;
+ asm("mov %%gs, %0" : "=g"(seg));
+ return seg;
+}
+
static inline u16 kvm_read_ldt(void)
{
u16 ldt;
@@ -791,11 +749,38 @@ static inline u16 kvm_read_ldt(void)
return ldt;
}
+static inline void kvm_load_fs(u16 sel)
+{
+ asm("mov %0, %%fs" : : "rm"(sel));
+}
+
+static inline void kvm_load_gs(u16 sel)
+{
+ asm("mov %0, %%gs" : : "rm"(sel));
+}
+
static inline void kvm_load_ldt(u16 sel)
{
asm("lldt %0" : : "rm"(sel));
}
+static inline void kvm_get_idt(struct descriptor_table *table)
+{
+ asm("sidt %0" : "=m"(*table));
+}
+
+static inline void kvm_get_gdt(struct descriptor_table *table)
+{
+ asm("sgdt %0" : "=m"(*table));
+}
+
+static inline unsigned long kvm_read_tr_base(void)
+{
+ u16 tr;
+ asm("str %0" : "=g"(tr));
+ return segment_base(tr);
+}
+
#ifdef CONFIG_X86_64
static inline unsigned long read_msr(unsigned long msr)
{
@@ -806,6 +791,21 @@ static inline unsigned long read_msr(unsigned long msr)
}
#endif
+static inline void kvm_fx_save(struct i387_fxsave_struct *image)
+{
+ asm("fxsave (%0)":: "r" (image));
+}
+
+static inline void kvm_fx_restore(struct i387_fxsave_struct *image)
+{
+ asm("fxrstor (%0)":: "r" (image));
+}
+
+static inline void kvm_fx_finit(void)
+{
+ asm("finit");
+}
+
static inline u32 get_rdx_init_val(void)
{
return 0x600; /* P6 family */
@@ -835,25 +835,20 @@ enum {
#define HF_VINTR_MASK (1 << 2)
#define HF_NMI_MASK (1 << 3)
#define HF_IRET_MASK (1 << 4)
-#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
/*
* Hardware virtualization extension instructions may fault if a
* reboot turns off virtualization while processes are running.
* Trap the fault and ignore the instruction if that happens.
*/
-asmlinkage void kvm_spurious_fault(void);
-extern bool kvm_rebooting;
+asmlinkage void kvm_handle_fault_on_reboot(void);
#define __kvm_handle_fault_on_reboot(insn) \
"666: " insn "\n\t" \
- "668: \n\t" \
".pushsection .fixup, \"ax\" \n" \
"667: \n\t" \
- "cmpb $0, kvm_rebooting \n\t" \
- "jne 668b \n\t" \
__ASM_SIZE(push) " $666b \n\t" \
- "call kvm_spurious_fault \n\t" \
+ "jmp kvm_handle_fault_on_reboot \n\t" \
".popsection \n\t" \
".pushsection __ex_table, \"a\" \n\t" \
_ASM_PTR " 666b, 667b \n\t" \
@@ -862,7 +857,6 @@ extern bool kvm_rebooting;
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
int kvm_age_hva(struct kvm *kvm, unsigned long hva);
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
@@ -872,17 +866,4 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
void kvm_define_shared_msr(unsigned index, u32 msr);
void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
-bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
-
-void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
- struct kvm_async_pf *work);
-void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
- struct kvm_async_pf *work);
-void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
- struct kvm_async_pf *work);
-bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
-extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
-
-void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
-
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/linux/include/asm-x86/kvm_para.h b/linux/include/asm-x86/kvm_para.h
index 5f575a4..a1a5c1f 100644
--- a/linux/include/asm-x86/kvm_para.h
+++ b/linux/include/asm-x86/kvm_para.h
@@ -56,30 +56,12 @@
#define KVM_FEATURE_CLOCKSOURCE 0
#define KVM_FEATURE_NOP_IO_DELAY 1
#define KVM_FEATURE_MMU_OP 2
-/* This indicates that the new set of kvmclock msrs
- * are available. The use of 0x11 and 0x12 is deprecated
- */
-#define KVM_FEATURE_CLOCKSOURCE2 3
-#define KVM_FEATURE_ASYNC_PF 4
-
-/* The last 8 bits are used to indicate how to interpret the flags field
- * in pvclock structure. If no bits are set, all flags are ignored.
- */
-#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24
#define MSR_KVM_WALL_CLOCK 0x11
#define MSR_KVM_SYSTEM_TIME 0x12
-/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
-#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
-#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
-#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
-
#define KVM_MAX_MMU_OP_BATCH 32
-#define KVM_ASYNC_PF_ENABLED (1 << 0)
-#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1)
-
/* Operations for KVM_HC_MMU_OP */
#define KVM_MMU_OP_WRITE_PTE 1
#define KVM_MMU_OP_FLUSH_TLB 2
@@ -106,20 +88,10 @@ struct kvm_mmu_op_release_pt {
__u64 pt_phys;
};
-#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
-#define KVM_PV_REASON_PAGE_READY 2
-
-struct kvm_vcpu_pv_apf_data {
- __u32 reason;
- __u8 pad[60];
- __u32 enabled;
-};
-
#ifdef __KERNEL__
#include <asm/processor.h>
extern void kvmclock_init(void);
-extern int kvm_register_clock(char *txt);
/* This instruction is vmcall. On non-VT architectures, it will generate a
@@ -213,21 +185,6 @@ static inline unsigned int kvm_arch_para_features(void)
return cpuid_eax(KVM_CPUID_FEATURES);
}
-#ifdef CONFIG_KVM_GUEST
-void __init kvm_guest_init(void);
-void kvm_async_pf_task_wait(u32 token);
-void kvm_async_pf_task_wake(u32 token);
-u32 kvm_read_and_reset_pf_reason(void);
-#else
-#define kvm_guest_init() do { } while (0)
-#define kvm_async_pf_task_wait(T) do {} while(0)
-#define kvm_async_pf_task_wake(T) do {} while(0)
-static inline u32 kvm_read_and_reset_pf_reason(void)
-{
- return 0;
-}
#endif
-#endif /* __KERNEL__ */
-
#endif /* _ASM_X86_KVM_PARA_H */
diff --git a/linux/include/asm-x86/svm.h b/linux/include/asm-x86/svm.h
index 5529318..b7f0e58 100644
--- a/linux/include/asm-x86/svm.h
+++ b/linux/include/asm-x86/svm.h
@@ -87,13 +87,14 @@ enum {
INTERCEPT_MONITOR,
INTERCEPT_MWAIT,
INTERCEPT_MWAIT_COND,
- INTERCEPT_XSETBV,
};
struct __attribute__ ((__packed__)) vmcb_control_area {
- u32 intercept_cr;
- u32 intercept_dr;
+ u16 intercept_cr_read;
+ u16 intercept_cr_write;
+ u16 intercept_dr_read;
+ u16 intercept_dr_write;
u32 intercept_exceptions;
u64 intercept;
u8 reserved_1[42];
@@ -120,19 +121,12 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u32 event_inj_err;
u64 nested_cr3;
u64 lbr_ctl;
- u32 clean;
- u32 reserved_5;
- u64 next_rip;
- u8 insn_len;
- u8 insn_bytes[15];
- u8 reserved_6[800];
+ u8 reserved_5[832];
};
#define TLB_CONTROL_DO_NOTHING 0
#define TLB_CONTROL_FLUSH_ALL_ASID 1
-#define TLB_CONTROL_FLUSH_ASID 3
-#define TLB_CONTROL_FLUSH_ASID_LOCAL 7
#define V_TPR_MASK 0x0f
@@ -161,10 +155,6 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
-#define SVM_VM_CR_VALID_MASK 0x001fULL
-#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL
-#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL
-
struct __attribute__ ((__packed__)) vmcb_seg {
u16 selector;
u16 attrib;
@@ -248,31 +238,19 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
#define SVM_SELECTOR_CODE_MASK (1 << 3)
-#define INTERCEPT_CR0_READ 0
-#define INTERCEPT_CR3_READ 3
-#define INTERCEPT_CR4_READ 4
-#define INTERCEPT_CR8_READ 8
-#define INTERCEPT_CR0_WRITE (16 + 0)
-#define INTERCEPT_CR3_WRITE (16 + 3)
-#define INTERCEPT_CR4_WRITE (16 + 4)
-#define INTERCEPT_CR8_WRITE (16 + 8)
-
-#define INTERCEPT_DR0_READ 0
-#define INTERCEPT_DR1_READ 1
-#define INTERCEPT_DR2_READ 2
-#define INTERCEPT_DR3_READ 3
-#define INTERCEPT_DR4_READ 4
-#define INTERCEPT_DR5_READ 5
-#define INTERCEPT_DR6_READ 6
-#define INTERCEPT_DR7_READ 7
-#define INTERCEPT_DR0_WRITE (16 + 0)
-#define INTERCEPT_DR1_WRITE (16 + 1)
-#define INTERCEPT_DR2_WRITE (16 + 2)
-#define INTERCEPT_DR3_WRITE (16 + 3)
-#define INTERCEPT_DR4_WRITE (16 + 4)
-#define INTERCEPT_DR5_WRITE (16 + 5)
-#define INTERCEPT_DR6_WRITE (16 + 6)
-#define INTERCEPT_DR7_WRITE (16 + 7)
+#define INTERCEPT_CR0_MASK 1
+#define INTERCEPT_CR3_MASK (1 << 3)
+#define INTERCEPT_CR4_MASK (1 << 4)
+#define INTERCEPT_CR8_MASK (1 << 8)
+
+#define INTERCEPT_DR0_MASK 1
+#define INTERCEPT_DR1_MASK (1 << 1)
+#define INTERCEPT_DR2_MASK (1 << 2)
+#define INTERCEPT_DR3_MASK (1 << 3)
+#define INTERCEPT_DR4_MASK (1 << 4)
+#define INTERCEPT_DR5_MASK (1 << 5)
+#define INTERCEPT_DR6_MASK (1 << 6)
+#define INTERCEPT_DR7_MASK (1 << 7)
#define SVM_EVTINJ_VEC_MASK 0xff
@@ -300,9 +278,6 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36
#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
-#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
-
-#define SVM_EXITINFO_REG_MASK 0x0F
#define SVM_EXIT_READ_CR0 0x000
#define SVM_EXIT_READ_CR3 0x003
@@ -374,7 +349,6 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_EXIT_MONITOR 0x08a
#define SVM_EXIT_MWAIT 0x08b
#define SVM_EXIT_MWAIT_COND 0x08c
-#define SVM_EXIT_XSETBV 0x08d
#define SVM_EXIT_NPF 0x400
#define SVM_EXIT_ERR -1
diff --git a/linux/include/asm-x86/vmx.h b/linux/include/asm-x86/vmx.h
index 0174fa5..cf2af50 100644
--- a/linux/include/asm-x86/vmx.h
+++ b/linux/include/asm-x86/vmx.h
@@ -65,8 +65,6 @@
*
*/
-#include <linux/types.h>
-
/*
* Definitions of Primary Processor-Based VM-Execution Controls.
*/
@@ -106,23 +104,15 @@
#define PIN_BASED_NMI_EXITING 0x00000008
#define PIN_BASED_VIRTUAL_NMIS 0x00000020
-#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002
#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
-#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000
#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
#define VM_EXIT_SAVE_IA32_PAT 0x00040000
#define VM_EXIT_LOAD_IA32_PAT 0x00080000
-#define VM_EXIT_SAVE_IA32_EFER 0x00100000
-#define VM_EXIT_LOAD_IA32_EFER 0x00200000
-#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000
-#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002
#define VM_ENTRY_IA32E_MODE 0x00000200
#define VM_ENTRY_SMM 0x00000400
#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
-#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000
#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
-#define VM_ENTRY_LOAD_IA32_EFER 0x00008000
/* VMCS Encodings */
enum vmcs_field {
@@ -170,8 +160,6 @@ enum vmcs_field {
GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
GUEST_IA32_PAT = 0x00002804,
GUEST_IA32_PAT_HIGH = 0x00002805,
- GUEST_IA32_EFER = 0x00002806,
- GUEST_IA32_EFER_HIGH = 0x00002807,
GUEST_PDPTR0 = 0x0000280a,
GUEST_PDPTR0_HIGH = 0x0000280b,
GUEST_PDPTR1 = 0x0000280c,
@@ -182,8 +170,6 @@ enum vmcs_field {
GUEST_PDPTR3_HIGH = 0x00002811,
HOST_IA32_PAT = 0x00002c00,
HOST_IA32_PAT_HIGH = 0x00002c01,
- HOST_IA32_EFER = 0x00002c02,
- HOST_IA32_EFER_HIGH = 0x00002c03,
PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
EXCEPTION_BITMAP = 0x00004004,
@@ -287,7 +273,6 @@ enum vmcs_field {
#define EXIT_REASON_TASK_SWITCH 9
#define EXIT_REASON_CPUID 10
#define EXIT_REASON_HLT 12
-#define EXIT_REASON_INVD 13
#define EXIT_REASON_INVLPG 14
#define EXIT_REASON_RDPMC 15
#define EXIT_REASON_RDTSC 16
@@ -306,7 +291,6 @@ enum vmcs_field {
#define EXIT_REASON_IO_INSTRUCTION 30
#define EXIT_REASON_MSR_READ 31
#define EXIT_REASON_MSR_WRITE 32
-#define EXIT_REASON_INVALID_STATE 33
#define EXIT_REASON_MWAIT_INSTRUCTION 36
#define EXIT_REASON_MONITOR_INSTRUCTION 39
#define EXIT_REASON_PAUSE_INSTRUCTION 40
@@ -316,7 +300,6 @@ enum vmcs_field {
#define EXIT_REASON_EPT_VIOLATION 48
#define EXIT_REASON_EPT_MISCONFIG 49
#define EXIT_REASON_WBINVD 54
-#define EXIT_REASON_XSETBV 55
/*
* Interruption-information format
@@ -345,12 +328,6 @@ enum vmcs_field {
#define GUEST_INTR_STATE_SMI 0x00000004
#define GUEST_INTR_STATE_NMI 0x00000008
-/* GUEST_ACTIVITY_STATE flags */
-#define GUEST_ACTIVITY_ACTIVE 0
-#define GUEST_ACTIVITY_HLT 1
-#define GUEST_ACTIVITY_SHUTDOWN 2
-#define GUEST_ACTIVITY_WAIT_SIPI 3
-
/*
* Exit Qualifications for MOV for Control Register Access
*/
@@ -432,9 +409,6 @@ enum vmcs_field {
#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
-#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */
-#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */
-
#define VMX_EPT_DEFAULT_GAW 3
#define VMX_EPT_MAX_GAW 0x4
#define VMX_EPT_MT_EPTE_SHIFT 3
@@ -460,10 +434,6 @@ enum vmcs_field {
#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
-struct vmx_msr_entry {
- u32 index;
- u32 reserved;
- u64 value;
-} __aligned(16);
+
#endif
diff --git a/linux/include/linux/kvm.h b/linux/include/linux/kvm.h
index 88fccc0..416465c 100644
--- a/linux/include/linux/kvm.h
+++ b/linux/include/linux/kvm.h
@@ -200,7 +200,6 @@ struct kvm_pit_config {
#define KVM_EXIT_DCR 15
#define KVM_EXIT_NMI 16
#define KVM_EXIT_INTERNAL_ERROR 17
-#define KVM_EXIT_OSI 18
/* For KVM_EXIT_INTERNAL_ERROR */
#define KVM_INTERNAL_ERROR_EMULATION 1
@@ -300,10 +299,6 @@ struct kvm_run {
__u32 ndata;
__u64 data[16];
} internal;
- /* KVM_EXIT_OSI */
- struct {
- __u64 gprs[32];
- } osi;
/* Fix the size of the union. */
char padding[256];
};
@@ -445,23 +440,6 @@ struct kvm_ioeventfd {
__u8 pad[36];
};
-/* for KVM_ENABLE_CAP */
-struct kvm_enable_cap {
- /* in */
- __u32 cap;
- __u32 flags;
- __u64 args[4];
- __u8 pad[64];
-};
-
-/* for KVM_PPC_GET_PVINFO */
-struct kvm_ppc_pvinfo {
- /* out */
- __u32 flags;
- __u32 hcall[4];
- __u8 pad[108];
-};
-
#define KVMIO 0xAE
/*
@@ -563,24 +541,7 @@ struct kvm_ppc_pvinfo {
#define KVM_CAP_HYPERV_VAPIC 45
#define KVM_CAP_HYPERV_SPIN 46
#define KVM_CAP_PCI_SEGMENT 47
-#define KVM_CAP_PPC_PAIRED_SINGLES 48
-#define KVM_CAP_INTR_SHADOW 49
-#ifdef __KVM_HAVE_DEBUGREGS
-#define KVM_CAP_DEBUGREGS 50
-#endif
#define KVM_CAP_X86_ROBUST_SINGLESTEP 51
-#define KVM_CAP_PPC_OSI 52
-#define KVM_CAP_PPC_UNSET_IRQ 53
-#define KVM_CAP_ENABLE_CAP 54
-#ifdef __KVM_HAVE_XSAVE
-#define KVM_CAP_XSAVE 55
-#endif
-#ifdef __KVM_HAVE_XCRS
-#define KVM_CAP_XCRS 56
-#endif
-#define KVM_CAP_PPC_GET_PVINFO 57
-#define KVM_CAP_PPC_IRQ_LEVEL 58
-#define KVM_CAP_ASYNC_PF 59
#ifdef KVM_CAP_IRQ_ROUTING
@@ -670,7 +631,6 @@ struct kvm_clock_data {
*/
#define KVM_CREATE_VCPU _IO(KVMIO, 0x41)
#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log)
-/* KVM_SET_MEMORY_ALIAS is obsolete: */
#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias)
#define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44)
#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45)
@@ -715,8 +675,6 @@ struct kvm_clock_data {
/* Available with KVM_CAP_PIT_STATE2 */
#define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2)
#define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2)
-/* Available with KVM_CAP_PPC_GET_PVINFO */
-#define KVM_PPC_GET_PVINFO _IOW(KVMIO, 0xa1, struct kvm_ppc_pvinfo)
/*
* ioctls for vcpu fds
@@ -770,16 +728,6 @@ struct kvm_clock_data {
/* Available with KVM_CAP_VCPU_EVENTS */
#define KVM_GET_VCPU_EVENTS _IOR(KVMIO, 0x9f, struct kvm_vcpu_events)
#define KVM_SET_VCPU_EVENTS _IOW(KVMIO, 0xa0, struct kvm_vcpu_events)
-/* Available with KVM_CAP_DEBUGREGS */
-#define KVM_GET_DEBUGREGS _IOR(KVMIO, 0xa1, struct kvm_debugregs)
-#define KVM_SET_DEBUGREGS _IOW(KVMIO, 0xa2, struct kvm_debugregs)
-#define KVM_ENABLE_CAP _IOW(KVMIO, 0xa3, struct kvm_enable_cap)
-/* Available with KVM_CAP_XSAVE */
-#define KVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave)
-#define KVM_SET_XSAVE _IOW(KVMIO, 0xa5, struct kvm_xsave)
-/* Available with KVM_CAP_XCRS */
-#define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs)
-#define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs)
#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
diff --git a/linux/include/linux/kvm_host.h b/linux/include/linux/kvm_host.h
index 84546bd..14f3920 100644
--- a/linux/include/linux/kvm_host.h
+++ b/linux/include/linux/kvm_host.h
@@ -56,8 +56,6 @@
#include <linux/mm.h>
#include <linux/preempt.h>
#include <linux/msi.h>
-#include <linux/slab.h>
-#include <linux/rcupdate.h>
#include <asm/signal.h>
#include <linux/kvm.h>
@@ -78,11 +76,9 @@
#define KVM_REQ_PENDING_TIMER 5
#define KVM_REQ_UNHALT 6
#define KVM_REQ_MMU_SYNC 7
-#define KVM_REQ_CLOCK_UPDATE 8
+#define KVM_REQ_KVMCLOCK_UPDATE 8
#define KVM_REQ_KICK 9
#define KVM_REQ_DEACTIVATE_FPU 10
-#define KVM_REQ_EVENT 11
-#define KVM_REQ_APF_HALT 12
#define KVM_USERSPACE_IRQ_SOURCE_ID 0
@@ -117,27 +113,6 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
struct kvm_io_device *dev);
-#ifdef CONFIG_KVM_ASYNC_PF
-struct kvm_async_pf {
- struct work_struct work;
- struct list_head link;
- struct list_head queue;
- struct kvm_vcpu *vcpu;
- struct mm_struct *mm;
- gva_t gva;
- unsigned long addr;
- struct kvm_arch_async_pf arch;
- struct page *page;
- bool done;
-};
-
-void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
-void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
-int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
- struct kvm_arch_async_pf *arch);
-int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
-#endif
-
struct kvm_vcpu {
struct kvm *kvm;
#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -146,14 +121,13 @@ struct kvm_vcpu {
int vcpu_id;
struct mutex mutex;
int cpu;
- atomic_t guest_mode;
struct kvm_run *run;
unsigned long requests;
unsigned long guest_debug;
int srcu_idx;
int fpu_active;
- int guest_fpu_loaded, guest_xcr0_loaded;
+ int guest_fpu_loaded;
wait_queue_head_t wq;
int sigset_active;
sigset_t sigset;
@@ -168,40 +142,21 @@ struct kvm_vcpu {
gpa_t mmio_phys_addr;
#endif
-#ifdef CONFIG_KVM_ASYNC_PF
- struct {
- u32 queued;
- struct list_head queue;
- struct list_head done;
- spinlock_t lock;
- } async_pf;
-#endif
-
struct kvm_vcpu_arch arch;
};
-/*
- * Some of the bitops functions do not support too long bitmaps.
- * This number must be determined not to exceed such limits.
- */
-#define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)
-
-struct kvm_lpage_info {
- unsigned long rmap_pde;
- int write_count;
-};
-
struct kvm_memory_slot {
gfn_t base_gfn;
unsigned long npages;
unsigned long flags;
unsigned long *rmap;
unsigned long *dirty_bitmap;
- unsigned long *dirty_bitmap_head;
- struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
+ struct {
+ unsigned long rmap_pde;
+ int write_count;
+ } *lpage_info[KVM_NR_PAGE_SIZES - 1];
unsigned long userspace_addr;
int user_alloc;
- int id;
};
static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot)
@@ -245,7 +200,6 @@ struct kvm_irq_routing_table {};
struct kvm_memslots {
int nmemslots;
- u64 generation;
struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
KVM_PRIVATE_MEM_SLOTS];
};
@@ -283,11 +237,7 @@ struct kvm {
struct mutex irq_lock;
#ifdef CONFIG_HAVE_KVM_IRQCHIP
- /*
- * Update side is protected by irq_lock and,
- * if configured, irqfds.lock.
- */
- struct kvm_irq_routing_table __rcu *irq_routing;
+ struct kvm_irq_routing_table *irq_routing;
struct hlist_head mask_notifier_list;
struct hlist_head irq_ack_notifier_list;
#endif
@@ -297,7 +247,6 @@ struct kvm {
unsigned long mmu_notifier_seq;
long mmu_notifier_count;
#endif
- long tlbs_dirty;
};
/* The guest did something we don't support. */
@@ -328,31 +277,23 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
void vcpu_load(struct kvm_vcpu *vcpu);
void vcpu_put(struct kvm_vcpu *vcpu);
-int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
+int kvm_init(void *opaque, unsigned int vcpu_size,
struct module *module);
void kvm_exit(void);
void kvm_get_kvm(struct kvm *kvm);
void kvm_put_kvm(struct kvm *kvm);
-static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
-{
- return rcu_dereference_check(kvm->memslots,
- srcu_read_lock_held(&kvm->srcu)
- || lockdep_is_held(&kvm->slots_lock));
-}
-
#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
+struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
extern struct page *bad_page;
extern pfn_t bad_pfn;
int is_error_page(struct page *page);
int is_error_pfn(pfn_t pfn);
-int is_hwpoison_pfn(pfn_t pfn);
-int is_fault_pfn(pfn_t pfn);
int kvm_is_error_hva(unsigned long addr);
int kvm_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
@@ -371,9 +312,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
int user_alloc);
void kvm_disable_largepages(void);
void kvm_arch_flush_shadow(struct kvm *kvm);
-
-int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
- int nr_pages);
+gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
+gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn);
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
@@ -382,13 +322,7 @@ void kvm_release_page_dirty(struct page *page);
void kvm_set_page_dirty(struct page *page);
void kvm_set_page_accessed(struct page *page);
-pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr);
-pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
-pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
- bool write_fault, bool *writable);
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
-pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
- bool *writable);
pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn);
int memslot_id(struct kvm *kvm, gfn_t gfn);
@@ -407,25 +341,18 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
int offset, int len);
int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
unsigned long len);
-int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
- void *data, unsigned long len);
-int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
- gpa_t gpa);
int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn);
void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
-void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
- gfn_t gfn);
void kvm_vcpu_block(struct kvm_vcpu *vcpu);
void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
void kvm_resched(struct kvm_vcpu *vcpu);
void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
-
void kvm_flush_remote_tlbs(struct kvm *kvm);
void kvm_reload_remote_mmus(struct kvm *kvm);
@@ -491,19 +418,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
void kvm_free_physmem(struct kvm *kvm);
-#ifndef __KVM_HAVE_ARCH_VM_ALLOC
-static inline struct kvm *kvm_arch_alloc_vm(void)
-{
- return kzalloc(sizeof(struct kvm), GFP_KERNEL);
-}
-
-static inline void kvm_arch_free_vm(struct kvm *kvm)
-{
- kfree(kvm);
-}
-#endif
-
-int kvm_arch_init_vm(struct kvm *kvm);
+struct kvm *kvm_arch_create_vm(void);
void kvm_arch_destroy_vm(struct kvm *kvm);
void kvm_free_all_assigned_devices(struct kvm *kvm);
void kvm_arch_sync_events(struct kvm *kvm);
@@ -519,8 +434,16 @@ struct kvm_irq_ack_notifier {
void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
};
+#define KVM_ASSIGNED_MSIX_PENDING 0x1
+struct kvm_guest_msix_entry {
+ u32 vector;
+ u16 entry;
+ u16 flags;
+};
+
struct kvm_assigned_dev_kernel {
struct kvm_irq_ack_notifier ack_notifier;
+ struct work_struct interrupt_work;
struct list_head list;
int assigned_dev_id;
int host_segnr;
@@ -531,14 +454,13 @@ struct kvm_assigned_dev_kernel {
bool host_irq_disabled;
struct msix_entry *host_msix_entries;
int guest_irq;
- struct msix_entry *guest_msix_entries;
+ struct kvm_guest_msix_entry *guest_msix_entries;
unsigned long irq_requested_type;
int irq_source_id;
int flags;
struct pci_dev *dev;
struct kvm *kvm;
- spinlock_t intx_lock;
- char irq_name[32];
+ spinlock_t assigned_dev_lock;
};
struct kvm_irq_mask_notifier {
@@ -551,8 +473,7 @@ void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
struct kvm_irq_mask_notifier *kimn);
void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
struct kvm_irq_mask_notifier *kimn);
-void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
- bool mask);
+void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask);
#ifdef __KVM_HAVE_IOAPIC
void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
@@ -560,8 +481,6 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
unsigned long *deliver_bitmask);
#endif
int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
-int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
- int irq_source_id, int level);
void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
void kvm_register_irq_ack_notifier(struct kvm *kvm,
struct kvm_irq_ack_notifier *kian);
@@ -583,7 +502,8 @@ int kvm_deassign_device(struct kvm *kvm,
struct kvm_assigned_dev_kernel *assigned_dev);
#else /* CONFIG_IOMMU_API */
static inline int kvm_iommu_map_pages(struct kvm *kvm,
- struct kvm_memory_slot *slot)
+ gfn_t base_gfn,
+ unsigned long npages)
{
return 0;
}
@@ -623,22 +543,11 @@ static inline void kvm_guest_exit(void)
current->flags &= ~PF_VCPU;
}
-static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
- gfn_t gfn)
-{
- return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
-}
-
static inline gpa_t gfn_to_gpa(gfn_t gfn)
{
return (gpa_t)gfn << PAGE_SHIFT;
}
-static inline gfn_t gpa_to_gfn(gpa_t gpa)
-{
- return (gfn_t)(gpa >> PAGE_SHIFT);
-}
-
static inline hpa_t pfn_to_hpa(pfn_t pfn)
{
return (hpa_t)pfn << PAGE_SHIFT;
@@ -681,6 +590,10 @@ static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_se
}
#endif
+#ifndef KVM_ARCH_HAS_UNALIAS_INSTANTIATION
+#define unalias_gfn_instantiation unalias_gfn
+#endif
+
#ifdef CONFIG_HAVE_KVM_IRQCHIP
#define KVM_MAX_IRQ_ROUTES 1024
@@ -703,28 +616,17 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {}
void kvm_eventfd_init(struct kvm *kvm);
int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags);
void kvm_irqfd_release(struct kvm *kvm);
-void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *);
int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
#else
static inline void kvm_eventfd_init(struct kvm *kvm) {}
-
static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
{
return -EINVAL;
}
static inline void kvm_irqfd_release(struct kvm *kvm) {}
-
-#ifdef CONFIG_HAVE_KVM_IRQCHIP
-static inline void kvm_irq_routing_update(struct kvm *kvm,
- struct kvm_irq_routing_table *irq_rt)
-{
- rcu_assign_pointer(kvm->irq_routing, irq_rt);
-}
-#endif
-
static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
return -ENOSYS;
@@ -754,25 +656,5 @@ static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
#endif
-static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
-{
- set_bit(req, &vcpu->requests);
-}
-
-static inline bool kvm_make_check_request(int req, struct kvm_vcpu *vcpu)
-{
- return test_and_set_bit(req, &vcpu->requests);
-}
-
-static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
-{
- if (test_bit(req, &vcpu->requests)) {
- clear_bit(req, &vcpu->requests);
- return true;
- } else {
- return false;
- }
-}
-
#endif
diff --git a/linux/include/linux/kvm_para.h b/linux/include/linux/kvm_para.h
index e13e53d..921be06 100644
--- a/linux/include/linux/kvm_para.h
+++ b/linux/include/linux/kvm_para.h
@@ -57,8 +57,6 @@
#define KVM_HC_VAPIC_POLL_IRQ 1
#define KVM_HC_MMU_OP 2
-#define KVM_HC_FEATURES 3
-#define KVM_HC_PPC_MAP_MAGIC_PAGE 4
/*
* hypercalls use architecture specific
@@ -66,6 +64,11 @@
#include <asm/kvm_para.h>
#ifdef __KERNEL__
+#ifdef CONFIG_KVM_GUEST
+void __init kvm_guest_init(void);
+#else
+#define kvm_guest_init() do { } while (0)
+#endif
static inline int kvm_para_has_feature(unsigned int feature)
{
diff --git a/linux/include/linux/kvm_types.h b/linux/include/linux/kvm_types.h
index e35297a..c65f89e 100644
--- a/linux/include/linux/kvm_types.h
+++ b/linux/include/linux/kvm_types.h
@@ -72,11 +72,11 @@
typedef unsigned long gva_t;
typedef u64 gpa_t;
-typedef u64 gfn_t;
+typedef unsigned long gfn_t;
typedef unsigned long hva_t;
typedef u64 hpa_t;
-typedef u64 hfn_t;
+typedef unsigned long hfn_t;
typedef hfn_t pfn_t;
@@ -107,11 +107,4 @@ struct kvm_lapic_irq {
u32 dest_id;
};
-struct gfn_to_hva_cache {
- u64 generation;
- gpa_t gpa;
- unsigned long hva;
- struct kvm_memory_slot *memslot;
-};
-
#endif /* __KVM_TYPES_H__ */
diff --git a/linux/include/trace/events/kvm.h b/linux/include/trace/events/kvm.h
index 1c1c1f7..35d92f2 100644
--- a/linux/include/trace/events/kvm.h
+++ b/linux/include/trace/events/kvm.h
@@ -45,36 +45,7 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvm
-
-#define ERSN(x) { KVM_EXIT_##x, "KVM_EXIT_" #x }
-
-#define kvm_trace_exit_reason \
- ERSN(UNKNOWN), ERSN(EXCEPTION), ERSN(IO), ERSN(HYPERCALL), \
- ERSN(DEBUG), ERSN(HLT), ERSN(MMIO), ERSN(IRQ_WINDOW_OPEN), \
- ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR), \
- ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\
- ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI)
-
-TRACE_EVENT(kvm_userspace_exit,
- TP_PROTO(__u32 reason, int errno),
- TP_ARGS(reason, errno),
-
- TP_STRUCT__entry(
- __field( __u32, reason )
- __field( int, errno )
- ),
-
- TP_fast_assign(
- __entry->reason = reason;
- __entry->errno = errno;
- ),
-
- TP_printk("reason %s (%d)",
- __entry->errno < 0 ?
- (__entry->errno == -EINTR ? "restart" : "error") :
- __print_symbolic(__entry->reason, kvm_trace_exit_reason),
- __entry->errno < 0 ? -__entry->errno : __entry->reason)
-);
+#define TRACE_INCLUDE_FILE kvm
#if defined(__KVM_HAVE_IOAPIC)
TRACE_EVENT(kvm_set_irq,
@@ -255,97 +226,6 @@ TRACE_EVENT(kvm_age_page,
__entry->referenced ? "YOUNG" : "OLD")
);
-#ifdef CONFIG_KVM_ASYNC_PF
-DECLARE_EVENT_CLASS(kvm_async_get_page_class,
-
- TP_PROTO(u64 gva, u64 gfn),
-
- TP_ARGS(gva, gfn),
-
- TP_STRUCT__entry(
- __field(__u64, gva)
- __field(u64, gfn)
- ),
-
- TP_fast_assign(
- __entry->gva = gva;
- __entry->gfn = gfn;
- ),
-
- TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn)
-);
-
-DEFINE_EVENT(kvm_async_get_page_class, kvm_try_async_get_page,
-
- TP_PROTO(u64 gva, u64 gfn),
-
- TP_ARGS(gva, gfn)
-);
-
-DEFINE_EVENT(kvm_async_get_page_class, kvm_async_pf_doublefault,
-
- TP_PROTO(u64 gva, u64 gfn),
-
- TP_ARGS(gva, gfn)
-);
-
-DECLARE_EVENT_CLASS(kvm_async_pf_nopresent_ready,
-
- TP_PROTO(u64 token, u64 gva),
-
- TP_ARGS(token, gva),
-
- TP_STRUCT__entry(
- __field(__u64, token)
- __field(__u64, gva)
- ),
-
- TP_fast_assign(
- __entry->token = token;
- __entry->gva = gva;
- ),
-
- TP_printk("token %#llx gva %#llx", __entry->token, __entry->gva)
-
-);
-
-DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_not_present,
-
- TP_PROTO(u64 token, u64 gva),
-
- TP_ARGS(token, gva)
-);
-
-DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_ready,
-
- TP_PROTO(u64 token, u64 gva),
-
- TP_ARGS(token, gva)
-);
-
-TRACE_EVENT(
- kvm_async_pf_completed,
- TP_PROTO(unsigned long address, struct page *page, u64 gva),
- TP_ARGS(address, page, gva),
-
- TP_STRUCT__entry(
- __field(unsigned long, address)
- __field(pfn_t, pfn)
- __field(u64, gva)
- ),
-
- TP_fast_assign(
- __entry->address = address;
- __entry->pfn = page ? page_to_pfn(page) : 0;
- __entry->gva = gva;
- ),
-
- TP_printk("gva %#llx address %#lx pfn %#llx", __entry->gva,
- __entry->address, __entry->pfn)
-);
-
-#endif
-
#endif /* _TRACE_KVM_MAIN_H */
/* This part must be outside protection */
diff --git a/linux/usr/include/asm-x86/hyperv.h b/linux/usr/include/asm-x86/hyperv.h
index 5df477a..e153a2b 100644
--- a/linux/usr/include/asm-x86/hyperv.h
+++ b/linux/usr/include/asm-x86/hyperv.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_X86_HYPERV_H
-#define _ASM_X86_HYPERV_H
+#ifndef _ASM_X86_KVM_HYPERV_H
+#define _ASM_X86_KVM_HYPERV_H
#include <linux/types.h>
@@ -14,10 +14,6 @@
#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
-#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000
-#define HYPERV_CPUID_MIN 0x40000005
-#define HYPERV_CPUID_MAX 0x4000ffff
-
/*
* Feature identification. EAX indicates which features are available
* to the partition based upon the current partition privileges.
@@ -133,9 +129,6 @@
/* MSR used to provide vcpu index */
#define HV_X64_MSR_VP_INDEX 0x40000002
-/* MSR used to read the per-partition time reference counter */
-#define HV_X64_MSR_TIME_REF_COUNT 0x40000020
-
/* Define the virtual APIC registers */
#define HV_X64_MSR_EOI 0x40000070
#define HV_X64_MSR_ICR 0x40000071
diff --git a/linux/usr/include/asm-x86/kvm.h b/linux/usr/include/asm-x86/kvm.h
index 4d8dcbd..f46b79f 100644
--- a/linux/usr/include/asm-x86/kvm.h
+++ b/linux/usr/include/asm-x86/kvm.h
@@ -21,9 +21,6 @@
#define __KVM_HAVE_PIT_STATE2
#define __KVM_HAVE_XEN_HVM
#define __KVM_HAVE_VCPU_EVENTS
-#define __KVM_HAVE_DEBUGREGS
-#define __KVM_HAVE_XSAVE
-#define __KVM_HAVE_XCRS
/* Architectural interrupt line count. */
#define KVM_NR_INTERRUPTS 256
@@ -260,11 +257,6 @@ struct kvm_reinject_control {
/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */
#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001
#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002
-#define KVM_VCPUEVENT_VALID_SHADOW 0x00000004
-
-/* Interrupt shadow states */
-#define KVM_X86_SHADOW_INT_MOV_SS 0x01
-#define KVM_X86_SHADOW_INT_STI 0x02
/* for KVM_GET/SET_VCPU_EVENTS */
struct kvm_vcpu_events {
@@ -279,7 +271,7 @@ struct kvm_vcpu_events {
__u8 injected;
__u8 nr;
__u8 soft;
- __u8 shadow;
+ __u8 pad;
} interrupt;
struct {
__u8 injected;
@@ -292,33 +284,4 @@ struct kvm_vcpu_events {
__u32 reserved[10];
};
-/* for KVM_GET/SET_DEBUGREGS */
-struct kvm_debugregs {
- __u64 db[4];
- __u64 dr6;
- __u64 dr7;
- __u64 flags;
- __u64 reserved[9];
-};
-
-/* for KVM_CAP_XSAVE */
-struct kvm_xsave {
- __u32 region[1024];
-};
-
-#define KVM_MAX_XCRS 16
-
-struct kvm_xcr {
- __u32 xcr;
- __u32 reserved;
- __u64 value;
-};
-
-struct kvm_xcrs {
- __u32 nr_xcrs;
- __u32 flags;
- struct kvm_xcr xcrs[KVM_MAX_XCRS];
- __u64 padding[16];
-};
-
#endif /* _ASM_X86_KVM_H */
diff --git a/linux/usr/include/asm-x86/kvm_para.h b/linux/usr/include/asm-x86/kvm_para.h
index 834d71e..d91d4fa 100644
--- a/linux/usr/include/asm-x86/kvm_para.h
+++ b/linux/usr/include/asm-x86/kvm_para.h
@@ -16,30 +16,12 @@
#define KVM_FEATURE_CLOCKSOURCE 0
#define KVM_FEATURE_NOP_IO_DELAY 1
#define KVM_FEATURE_MMU_OP 2
-/* This indicates that the new set of kvmclock msrs
- * are available. The use of 0x11 and 0x12 is deprecated
- */
-#define KVM_FEATURE_CLOCKSOURCE2 3
-#define KVM_FEATURE_ASYNC_PF 4
-
-/* The last 8 bits are used to indicate how to interpret the flags field
- * in pvclock structure. If no bits are set, all flags are ignored.
- */
-#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24
#define MSR_KVM_WALL_CLOCK 0x11
#define MSR_KVM_SYSTEM_TIME 0x12
-/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
-#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
-#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
-#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
-
#define KVM_MAX_MMU_OP_BATCH 32
-#define KVM_ASYNC_PF_ENABLED (1 << 0)
-#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1)
-
/* Operations for KVM_HC_MMU_OP */
#define KVM_MMU_OP_WRITE_PTE 1
#define KVM_MMU_OP_FLUSH_TLB 2
@@ -66,14 +48,5 @@ struct kvm_mmu_op_release_pt {
__u64 pt_phys;
};
-#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
-#define KVM_PV_REASON_PAGE_READY 2
-
-struct kvm_vcpu_pv_apf_data {
- __u32 reason;
- __u8 pad[60];
- __u32 enabled;
-};
-
#endif /* _ASM_X86_KVM_PARA_H */
diff --git a/linux/usr/include/linux/kvm.h b/linux/usr/include/linux/kvm.h
index 27efcfb..fcebd4c 100644
--- a/linux/usr/include/linux/kvm.h
+++ b/linux/usr/include/linux/kvm.h
@@ -160,7 +160,6 @@ struct kvm_pit_config {
#define KVM_EXIT_DCR 15
#define KVM_EXIT_NMI 16
#define KVM_EXIT_INTERNAL_ERROR 17
-#define KVM_EXIT_OSI 18
/* For KVM_EXIT_INTERNAL_ERROR */
#define KVM_INTERNAL_ERROR_EMULATION 1
@@ -260,10 +259,6 @@ struct kvm_run {
__u32 ndata;
__u64 data[16];
} internal;
- /* KVM_EXIT_OSI */
- struct {
- __u64 gprs[32];
- } osi;
/* Fix the size of the union. */
char padding[256];
};
@@ -405,23 +400,6 @@ struct kvm_ioeventfd {
__u8 pad[36];
};
-/* for KVM_ENABLE_CAP */
-struct kvm_enable_cap {
- /* in */
- __u32 cap;
- __u32 flags;
- __u64 args[4];
- __u8 pad[64];
-};
-
-/* for KVM_PPC_GET_PVINFO */
-struct kvm_ppc_pvinfo {
- /* out */
- __u32 flags;
- __u32 hcall[4];
- __u8 pad[108];
-};
-
#define KVMIO 0xAE
/*
@@ -523,24 +501,7 @@ struct kvm_ppc_pvinfo {
#define KVM_CAP_HYPERV_VAPIC 45
#define KVM_CAP_HYPERV_SPIN 46
#define KVM_CAP_PCI_SEGMENT 47
-#define KVM_CAP_PPC_PAIRED_SINGLES 48
-#define KVM_CAP_INTR_SHADOW 49
-#ifdef __KVM_HAVE_DEBUGREGS
-#define KVM_CAP_DEBUGREGS 50
-#endif
#define KVM_CAP_X86_ROBUST_SINGLESTEP 51
-#define KVM_CAP_PPC_OSI 52
-#define KVM_CAP_PPC_UNSET_IRQ 53
-#define KVM_CAP_ENABLE_CAP 54
-#ifdef __KVM_HAVE_XSAVE
-#define KVM_CAP_XSAVE 55
-#endif
-#ifdef __KVM_HAVE_XCRS
-#define KVM_CAP_XCRS 56
-#endif
-#define KVM_CAP_PPC_GET_PVINFO 57
-#define KVM_CAP_PPC_IRQ_LEVEL 58
-#define KVM_CAP_ASYNC_PF 59
#ifdef KVM_CAP_IRQ_ROUTING
@@ -630,7 +591,6 @@ struct kvm_clock_data {
*/
#define KVM_CREATE_VCPU _IO(KVMIO, 0x41)
#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log)
-/* KVM_SET_MEMORY_ALIAS is obsolete: */
#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias)
#define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44)
#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45)
@@ -675,8 +635,6 @@ struct kvm_clock_data {
/* Available with KVM_CAP_PIT_STATE2 */
#define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2)
#define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2)
-/* Available with KVM_CAP_PPC_GET_PVINFO */
-#define KVM_PPC_GET_PVINFO _IOW(KVMIO, 0xa1, struct kvm_ppc_pvinfo)
/*
* ioctls for vcpu fds
@@ -730,16 +688,6 @@ struct kvm_clock_data {
/* Available with KVM_CAP_VCPU_EVENTS */
#define KVM_GET_VCPU_EVENTS _IOR(KVMIO, 0x9f, struct kvm_vcpu_events)
#define KVM_SET_VCPU_EVENTS _IOW(KVMIO, 0xa0, struct kvm_vcpu_events)
-/* Available with KVM_CAP_DEBUGREGS */
-#define KVM_GET_DEBUGREGS _IOR(KVMIO, 0xa1, struct kvm_debugregs)
-#define KVM_SET_DEBUGREGS _IOW(KVMIO, 0xa2, struct kvm_debugregs)
-#define KVM_ENABLE_CAP _IOW(KVMIO, 0xa3, struct kvm_enable_cap)
-/* Available with KVM_CAP_XSAVE */
-#define KVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave)
-#define KVM_SET_XSAVE _IOW(KVMIO, 0xa5, struct kvm_xsave)
-/* Available with KVM_CAP_XCRS */
-#define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs)
-#define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs)
#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
diff --git a/linux/usr/include/linux/kvm_para.h b/linux/usr/include/linux/kvm_para.h
index b315e27..eca8259 100644
--- a/linux/usr/include/linux/kvm_para.h
+++ b/linux/usr/include/linux/kvm_para.h
@@ -17,8 +17,6 @@
#define KVM_HC_VAPIC_POLL_IRQ 1
#define KVM_HC_MMU_OP 2
-#define KVM_HC_FEATURES 3
-#define KVM_HC_PPC_MAP_MAGIC_PAGE 4
/*
* hypercalls use architecture specific
diff --git a/linux/x86/assigned-dev.c b/linux/x86/assigned-dev.c
index 86dac05..b383411 100644
--- a/linux/x86/assigned-dev.c
+++ b/linux/x86/assigned-dev.c
@@ -41,7 +41,7 @@
/*
* Kernel-based Virtual Machine - device assignment support
*
- * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates.
+ * Copyright (C) 2006-9 Red Hat, Inc
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
@@ -95,31 +95,60 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
return index;
}
-static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
+static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
{
- struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
- u32 vector;
- int index;
+ struct kvm_assigned_dev_kernel *assigned_dev;
+ struct kvm *kvm;
+ int i;
- if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
- spin_lock(&assigned_dev->intx_lock);
- disable_irq_nosync(irq);
- assigned_dev->host_irq_disabled = true;
- spin_unlock(&assigned_dev->intx_lock);
- }
+ assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
+ interrupt_work);
+ kvm = assigned_dev->kvm;
+ spin_lock_irq(&assigned_dev->assigned_dev_lock);
if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
- index = find_index_from_host_irq(assigned_dev, irq);
- if (index >= 0) {
- vector = assigned_dev->
- guest_msix_entries[index].vector;
+ struct kvm_guest_msix_entry *guest_entries =
+ assigned_dev->guest_msix_entries;
+ for (i = 0; i < assigned_dev->entries_nr; i++) {
+ if (!(guest_entries[i].flags &
+ KVM_ASSIGNED_MSIX_PENDING))
+ continue;
+ guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING;
kvm_set_irq(assigned_dev->kvm,
- assigned_dev->irq_source_id, vector, 1);
+ assigned_dev->irq_source_id,
+ guest_entries[i].vector, 1);
}
} else
kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
assigned_dev->guest_irq, 1);
+ spin_unlock_irq(&assigned_dev->assigned_dev_lock);
+}
+
+static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
+{
+ unsigned long flags;
+ struct kvm_assigned_dev_kernel *assigned_dev =
+ (struct kvm_assigned_dev_kernel *) dev_id;
+
+ spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags);
+ if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
+ int index = find_index_from_host_irq(assigned_dev, irq);
+ if (index < 0)
+ goto out;
+ assigned_dev->guest_msix_entries[index].flags |=
+ KVM_ASSIGNED_MSIX_PENDING;
+ }
+
+ schedule_work(&assigned_dev->interrupt_work);
+
+ if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
+ disable_irq_nosync(irq);
+ assigned_dev->host_irq_disabled = true;
+ }
+
+out:
+ spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
return IRQ_HANDLED;
}
@@ -127,6 +156,7 @@ static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
{
struct kvm_assigned_dev_kernel *dev;
+ unsigned long flags;
if (kian->gsi == -1)
return;
@@ -139,12 +169,12 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
/* The guest irq may be shared so this ack may be
* from another device.
*/
- spin_lock(&dev->intx_lock);
+ spin_lock_irqsave(&dev->assigned_dev_lock, flags);
if (dev->host_irq_disabled) {
enable_irq(dev->host_irq);
dev->host_irq_disabled = false;
}
- spin_unlock(&dev->intx_lock);
+ spin_unlock_irqrestore(&dev->assigned_dev_lock, flags);
}
static void deassign_guest_irq(struct kvm *kvm,
@@ -153,9 +183,6 @@ static void deassign_guest_irq(struct kvm *kvm,
kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
assigned_dev->ack_notifier.gsi = -1;
- kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
- assigned_dev->guest_irq, 0);
-
if (assigned_dev->irq_source_id != -1)
kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
assigned_dev->irq_source_id = -1;
@@ -167,19 +194,28 @@ static void deassign_host_irq(struct kvm *kvm,
struct kvm_assigned_dev_kernel *assigned_dev)
{
/*
- * We disable irq here to prevent further events.
+ * In kvm_free_device_irq, cancel_work_sync return true if:
+ * 1. work is scheduled, and then cancelled.
+ * 2. work callback is executed.
+ *
+ * The first one ensured that the irq is disabled and no more events
+ * would happen. But for the second one, the irq may be enabled (e.g.
+ * for MSI). So we disable irq here to prevent further events.
*
* Notice this maybe result in nested disable if the interrupt type is
* INTx, but it's OK for we are going to free it.
*
* If this function is a part of VM destroy, please ensure that till
* now, the kvm state is still legal for probably we also have to wait
- * on a currently running IRQ handler.
+ * interrupt_work done.
*/
if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
int i;
for (i = 0; i < assigned_dev->entries_nr; i++)
- disable_irq(assigned_dev->host_msix_entries[i].vector);
+ disable_irq_nosync(assigned_dev->
+ host_msix_entries[i].vector);
+
+ cancel_work_sync(&assigned_dev->interrupt_work);
for (i = 0; i < assigned_dev->entries_nr; i++)
free_irq(assigned_dev->host_msix_entries[i].vector,
@@ -191,7 +227,8 @@ static void deassign_host_irq(struct kvm *kvm,
pci_disable_msix(assigned_dev->dev);
} else {
/* Deal with MSI and INTx */
- disable_irq(assigned_dev->host_irq);
+ disable_irq_nosync(assigned_dev->host_irq);
+ cancel_work_sync(&assigned_dev->interrupt_work);
free_irq(assigned_dev->host_irq, (void *)assigned_dev);
@@ -237,8 +274,7 @@ static void kvm_free_assigned_device(struct kvm *kvm,
{
kvm_free_assigned_irq(kvm, assigned_dev);
- __pci_reset_function(assigned_dev->dev);
- pci_restore_state(assigned_dev->dev);
+ pci_reset_function(assigned_dev->dev);
pci_release_regions(assigned_dev->dev);
pci_disable_device(assigned_dev->dev);
@@ -271,8 +307,8 @@ static int assigned_device_enable_host_intx(struct kvm *kvm,
* on the same interrupt line is not a happy situation: there
* are going to be long delays in accepting, acking, etc.
*/
- if (kvm_request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
- IRQF_ONESHOT, dev->irq_name, (void *)dev))
+ if (request_irq(dev->host_irq, kvm_assigned_dev_intr,
+ 0, "kvm_assigned_intx_device", (void *)dev))
return -EIO;
return 0;
}
@@ -290,8 +326,8 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
}
dev->host_irq = dev->dev->irq;
- if (kvm_request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
- 0, dev->irq_name, (void *)dev)) {
+ if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0,
+ "kvm_assigned_msi_device", (void *)dev)) {
pci_disable_msi(dev->dev);
return -EIO;
}
@@ -316,19 +352,16 @@ static int assigned_device_enable_host_msix(struct kvm *kvm,
return r;
for (i = 0; i < dev->entries_nr; i++) {
- r = kvm_request_threaded_irq(dev->host_msix_entries[i].vector,
- NULL, kvm_assigned_dev_thread,
- 0, dev->irq_name, (void *)dev);
+ r = request_irq(dev->host_msix_entries[i].vector,
+ kvm_assigned_dev_intr, 0,
+ "kvm_assigned_msix_device",
+ (void *)dev);
+ /* FIXME: free requested_irq's on failure */
if (r)
- goto err;
+ return r;
}
return 0;
-err:
- for (i -= 1; i >= 0; i--)
- free_irq(dev->host_msix_entries[i].vector, (void *)dev);
- pci_disable_msix(dev->dev);
- return r;
}
#endif
@@ -375,9 +408,6 @@ static int assign_host_irq(struct kvm *kvm,
if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
return r;
- snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s",
- pci_name(dev->dev));
-
switch (host_irq_type) {
case KVM_DEV_IRQ_HOST_INTX:
r = assigned_device_enable_host_intx(kvm, dev);
@@ -454,6 +484,9 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
struct kvm_assigned_dev_kernel *match;
unsigned long host_irq_type, guest_irq_type;
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+
if (!irqchip_in_kernel(kvm))
return r;
@@ -555,7 +588,6 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
}
pci_reset_function(dev);
- pci_save_state(dev);
match->assigned_dev_id = assigned_dev->assigned_dev_id;
match->host_segnr = assigned_dev->segnr;
@@ -563,10 +595,12 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
match->host_devfn = assigned_dev->devfn;
match->flags = assigned_dev->flags;
match->dev = dev;
- spin_lock_init(&match->intx_lock);
+ spin_lock_init(&match->assigned_dev_lock);
match->irq_source_id = -1;
match->kvm = kvm;
match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
+ INIT_WORK(&match->interrupt_work,
+ kvm_assigned_dev_interrupt_work_handler);
list_add(&match->list, &kvm->arch.assigned_dev_head);
@@ -586,7 +620,6 @@ out:
mutex_unlock(&kvm->lock);
return r;
out_list_del:
- pci_restore_state(dev);
list_del(&match->list);
pci_release_regions(dev);
out_disable:
@@ -659,9 +692,9 @@ static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
r = -ENOMEM;
goto msix_nr_out;
}
- adev->guest_msix_entries =
- kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr,
- GFP_KERNEL);
+ adev->guest_msix_entries = kzalloc(
+ sizeof(struct kvm_guest_msix_entry) *
+ entry_nr->entry_nr, GFP_KERNEL);
if (!adev->guest_msix_entries) {
kfree(adev->host_msix_entries);
r = -ENOMEM;
@@ -713,8 +746,8 @@ msix_entry_out:
long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
unsigned long arg)
{
- void *argp = (void *)arg;
- int r;
+ void __user *argp = (void __user *)arg;
+ int r = -ENOTTY;
switch (ioctl) {
case KVM_ASSIGN_PCI_DEVICE: {
@@ -732,6 +765,7 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
r = -EOPNOTSUPP;
break;
}
+#ifdef KVM_CAP_ASSIGN_DEV_IRQ
case KVM_ASSIGN_DEV_IRQ: {
struct kvm_assigned_irq assigned_irq;
@@ -754,6 +788,8 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
goto out;
break;
}
+#endif
+#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
case KVM_DEASSIGN_PCI_DEVICE: {
struct kvm_assigned_pci_dev assigned_dev;
@@ -765,10 +801,11 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
goto out;
break;
}
+#endif
#ifdef KVM_CAP_IRQ_ROUTING
case KVM_SET_GSI_ROUTING: {
struct kvm_irq_routing routing;
- struct kvm_irq_routing *urouting;
+ struct kvm_irq_routing __user *urouting;
struct kvm_irq_routing_entry *entries;
r = -EFAULT;
@@ -817,9 +854,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
break;
}
#endif
- default:
- r = -ENOTTY;
- break;
}
out:
return r;
diff --git a/linux/x86/coalesced_mmio.c b/linux/x86/coalesced_mmio.c
index 850773e..d84dadf 100644
--- a/linux/x86/coalesced_mmio.c
+++ b/linux/x86/coalesced_mmio.c
@@ -42,7 +42,6 @@
* KVM coalesced MMIO
*
* Copyright (c) 2008 Bull S.A.S.
- * Copyright 2009 Red Hat, Inc. and/or its affiliates.
*
* Author: Laurent Vivier <Laurent.Vivier@bull.net>
*
@@ -161,10 +160,8 @@ int kvm_coalesced_mmio_init(struct kvm *kvm)
return ret;
out_free_dev:
- kvm->coalesced_mmio_dev = NULL;
kfree(dev);
out_free_page:
- kvm->coalesced_mmio_ring = NULL;
__free_page(page);
out_err:
return ret;
@@ -182,7 +179,7 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
if (dev == NULL)
- return -ENXIO;
+ return -EINVAL;
mutex_lock(&kvm->slots_lock);
if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) {
@@ -205,7 +202,7 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
struct kvm_coalesced_mmio_zone *z;
if (dev == NULL)
- return -ENXIO;
+ return -EINVAL;
mutex_lock(&kvm->slots_lock);
diff --git a/linux/x86/emulate.c b/linux/x86/emulate.c
index cffdf3c..48e50c7 100644
--- a/linux/x86/emulate.c
+++ b/linux/x86/emulate.c
@@ -49,7 +49,6 @@
* privileged instructions:
*
* Copyright (C) 2006 Qumranet
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
@@ -60,13 +59,20 @@
* From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
*/
+#ifndef __KERNEL__
+#include <stdio.h>
+#include <stdint.h>
+#include <public/xen.h>
+#define DPRINTF(_f, _a ...) printf(_f , ## _a)
+#else
#include <linux/kvm_host.h>
#include "kvm_cache_regs.h"
+#define DPRINTF(x...) do {} while (0)
+#endif
#include <linux/module.h>
#include <asm/kvm_emulate.h>
#include "x86.h"
-#include "tss.h"
/*
* Opcode effective-address decode tables.
@@ -83,13 +89,11 @@
#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
#define DstReg (2<<1) /* Register operand. */
#define DstMem (3<<1) /* Memory operand. */
-#define DstAcc (4<<1) /* Destination Accumulator */
-#define DstDI (5<<1) /* Destination is in ES:(E)DI */
-#define DstMem64 (6<<1) /* 64bit memory operand */
-#define DstImmUByte (7<<1) /* 8-bit unsigned immediate operand */
+#define DstAcc (4<<1) /* Destination Accumulator */
#define DstMask (7<<1)
/* Source operand type. */
#define SrcNone (0<<4) /* No source operand. */
+#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */
#define SrcReg (1<<4) /* Register operand. */
#define SrcMem (2<<4) /* Memory operand. */
#define SrcMem16 (3<<4) /* Memory operand (16-bit). */
@@ -99,11 +103,6 @@
#define SrcOne (7<<4) /* Implied '1' */
#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
#define SrcImmU (9<<4) /* Immediate operand, unsigned */
-#define SrcSI (0xa<<4) /* Source is in the DS:RSI */
-#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */
-#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */
-#define SrcAcc (0xd<<4) /* Source Accumulator */
-#define SrcImmU16 (0xe<<4) /* Immediate operand, unsigned, 16 bits */
#define SrcMask (0xf<<4)
/* Generic ModRM decode. */
#define ModRM (1<<8)
@@ -115,10 +114,8 @@
#define Stack (1<<13) /* Stack instruction (push/pop) */
#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
+#define GroupMask 0xff /* Group number stored in bits 0:7 */
/* Misc flags */
-#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
-#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
-#define Undefined (1<<25) /* No Such Instruction */
#define Lock (1<<26) /* lock prefix is allowed for the instruction */
#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
#define No64 (1<<28)
@@ -127,30 +124,285 @@
#define Src2CL (1<<29)
#define Src2ImmByte (2<<29)
#define Src2One (3<<29)
-#define Src2Imm (4<<29)
+#define Src2Imm16 (4<<29)
#define Src2Mask (7<<29)
-#define X2(x...) x, x
-#define X3(x...) X2(x), x
-#define X4(x...) X2(x), X2(x)
-#define X5(x...) X4(x), x
-#define X6(x...) X4(x), X2(x)
-#define X7(x...) X4(x), X3(x)
-#define X8(x...) X4(x), X4(x)
-#define X16(x...) X8(x), X8(x)
-
-struct opcode {
- u32 flags;
- union {
- int (*execute)(struct x86_emulate_ctxt *ctxt);
- struct opcode *group;
- struct group_dual *gdual;
- } u;
+enum {
+ Group1_80, Group1_81, Group1_82, Group1_83,
+ Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
+ Group8, Group9,
};
-struct group_dual {
- struct opcode mod012[8];
- struct opcode mod3[8];
+static u32 opcode_table[256] = {
+ /* 0x00 - 0x07 */
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
+ ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
+ /* 0x08 - 0x0F */
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
+ ImplicitOps | Stack | No64, 0,
+ /* 0x10 - 0x17 */
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
+ ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
+ /* 0x18 - 0x1F */
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
+ ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
+ /* 0x20 - 0x27 */
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
+ /* 0x28 - 0x2F */
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ 0, 0, 0, 0,
+ /* 0x30 - 0x37 */
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ 0, 0, 0, 0,
+ /* 0x38 - 0x3F */
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
+ 0, 0,
+ /* 0x40 - 0x47 */
+ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
+ /* 0x48 - 0x4F */
+ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
+ /* 0x50 - 0x57 */
+ SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
+ SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
+ /* 0x58 - 0x5F */
+ DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
+ DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
+ /* 0x60 - 0x67 */
+ ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
+ 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
+ 0, 0, 0, 0,
+ /* 0x68 - 0x6F */
+ SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
+ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
+ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
+ /* 0x70 - 0x77 */
+ SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
+ SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
+ /* 0x78 - 0x7F */
+ SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
+ SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
+ /* 0x80 - 0x87 */
+ Group | Group1_80, Group | Group1_81,
+ Group | Group1_82, Group | Group1_83,
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
+ /* 0x88 - 0x8F */
+ ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
+ ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstMem | SrcReg | ModRM | Mov, ModRM | DstReg,
+ DstReg | SrcMem | ModRM | Mov, Group | Group1A,
+ /* 0x90 - 0x97 */
+ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
+ /* 0x98 - 0x9F */
+ 0, 0, SrcImm | Src2Imm16 | No64, 0,
+ ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
+ /* 0xA0 - 0xA7 */
+ ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
+ ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
+ ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
+ ByteOp | ImplicitOps | String, ImplicitOps | String,
+ /* 0xA8 - 0xAF */
+ 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
+ ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
+ ByteOp | ImplicitOps | String, ImplicitOps | String,
+ /* 0xB0 - 0xB7 */
+ ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+ ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+ ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+ ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+ /* 0xB8 - 0xBF */
+ DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+ DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+ DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+ DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+ /* 0xC0 - 0xC7 */
+ ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
+ 0, ImplicitOps | Stack, 0, 0,
+ ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
+ /* 0xC8 - 0xCF */
+ 0, 0, 0, ImplicitOps | Stack,
+ ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps,
+ /* 0xD0 - 0xD7 */
+ ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
+ ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
+ 0, 0, 0, 0,
+ /* 0xD8 - 0xDF */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xE0 - 0xE7 */
+ 0, 0, 0, 0,
+ ByteOp | SrcImmUByte, SrcImmUByte,
+ ByteOp | SrcImmUByte, SrcImmUByte,
+ /* 0xE8 - 0xEF */
+ SrcImm | Stack, SrcImm | ImplicitOps,
+ SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
+ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+ /* 0xF0 - 0xF7 */
+ 0, 0, 0, 0,
+ ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
+ /* 0xF8 - 0xFF */
+ ImplicitOps, 0, ImplicitOps, ImplicitOps,
+ ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
+};
+
+static u32 twobyte_table[256] = {
+ /* 0x00 - 0x0F */
+ 0, Group | GroupDual | Group7, 0, 0,
+ 0, ImplicitOps, ImplicitOps | Priv, 0,
+ ImplicitOps | Priv, ImplicitOps | Priv, 0, 0,
+ 0, ImplicitOps | ModRM, 0, 0,
+ /* 0x10 - 0x1F */
+ 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x20 - 0x2F */
+ ModRM | ImplicitOps | Priv, ModRM | Priv,
+ ModRM | ImplicitOps | Priv, ModRM | Priv,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x30 - 0x3F */
+ ImplicitOps | Priv, 0, ImplicitOps | Priv, 0,
+ ImplicitOps, ImplicitOps | Priv, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x40 - 0x47 */
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ /* 0x48 - 0x4F */
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ /* 0x50 - 0x5F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x60 - 0x6F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x70 - 0x7F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x80 - 0x8F */
+ SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
+ SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
+ /* 0x90 - 0x9F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xA0 - 0xA7 */
+ ImplicitOps | Stack, ImplicitOps | Stack,
+ 0, DstMem | SrcReg | ModRM | BitOp,
+ DstMem | SrcReg | Src2ImmByte | ModRM,
+ DstMem | SrcReg | Src2CL | ModRM, 0, 0,
+ /* 0xA8 - 0xAF */
+ ImplicitOps | Stack, ImplicitOps | Stack,
+ 0, DstMem | SrcReg | ModRM | BitOp | Lock,
+ DstMem | SrcReg | Src2ImmByte | ModRM,
+ DstMem | SrcReg | Src2CL | ModRM,
+ ModRM, 0,
+ /* 0xB0 - 0xB7 */
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
+ 0, DstMem | SrcReg | ModRM | BitOp | Lock,
+ 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem16 | ModRM | Mov,
+ /* 0xB8 - 0xBF */
+ 0, 0,
+ Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock,
+ 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem16 | ModRM | Mov,
+ /* 0xC0 - 0xCF */
+ 0, 0, 0, DstMem | SrcReg | ModRM | Mov,
+ 0, 0, 0, Group | GroupDual | Group9,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xD0 - 0xDF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xE0 - 0xEF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xF0 - 0xFF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+static u32 group_table[] = {
+ [Group1_80*8] =
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM,
+ [Group1_81*8] =
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM,
+ [Group1_82*8] =
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64,
+ [Group1_83*8] =
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM,
+ [Group1A*8] =
+ DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
+ [Group3_Byte*8] =
+ ByteOp | SrcImm | DstMem | ModRM, 0,
+ ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+ 0, 0, 0, 0,
+ [Group3*8] =
+ DstMem | SrcImm | ModRM, 0,
+ DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+ 0, 0, 0, 0,
+ [Group4*8] =
+ ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+ 0, 0, 0, 0, 0, 0,
+ [Group5*8] =
+ DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+ SrcMem | ModRM | Stack, 0,
+ SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
+ [Group7*8] =
+ 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
+ SrcNone | ModRM | DstMem | Mov, 0,
+ SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv,
+ [Group8*8] =
+ 0, 0, 0, 0,
+ DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
+ [Group9*8] =
+ 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0,
+};
+
+static u32 group2_table[] = {
+ [Group7*8] =
+ SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM,
+ SrcNone | ModRM | DstMem | Mov, 0,
+ SrcMem16 | ModRM | Mov, 0,
+ [Group9*8] =
+ 0, 0, 0, 0, 0, 0, 0, 0,
};
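
The three tables above drive the decoder: opcode_table and twobyte_table map an opcode byte to a bitmask of decode flags, and a Group entry defers to group_table (or group2_table when the instruction is GroupDual and ModRM.mod == 3), indexed by GroupN*8 plus the ModRM reg field. A standalone C sketch of just that two-level lookup follows; the flag values, group ids and table contents are invented for illustration and are not the emulator's actual bit assignments.

/*
 * Standalone sketch of the two-level opcode -> decode-flag lookup driven by
 * the tables above.  Flag values, group ids and table contents are invented;
 * only the indexing scheme (GroupN*8 plus the ModRM reg field) mirrors the
 * emulator.
 */
#include <stdint.h>
#include <stdio.h>

#define GRP_SHIFT	3		/* eight entries per group          */
#define FLAG_GROUP	(1u << 30)	/* "defer to group_tbl" marker      */
#define FLAG_MODRM	(1u << 0)	/* instruction carries a ModRM byte */

enum { GROUP1_80 = 0, GROUP1_81 = 1 };	/* hypothetical group ids */

static const uint32_t opcode_tbl[256] = {
	[0x80] = FLAG_GROUP | GROUP1_80,
	[0x81] = FLAG_GROUP | GROUP1_81,
};

static const uint32_t group_tbl[2 << GRP_SHIFT] = {
	[GROUP1_80 << GRP_SHIFT] = FLAG_MODRM,	/* 0x80 /0 entry */
};

static uint32_t decode_flags(uint8_t opcode, uint8_t modrm)
{
	uint32_t d = opcode_tbl[opcode];

	if (d & FLAG_GROUP) {
		uint32_t group = d & ~FLAG_GROUP;
		/* ModRM bits 5:3 select one of the eight group entries */
		d = group_tbl[(group << GRP_SHIFT) + ((modrm >> 3) & 7)];
	}
	return d;
}

int main(void)
{
	printf("flags for opcode 0x80 /0: %#x\n",
	       (unsigned)decode_flags(0x80, 0x00));
	return 0;
}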
/* EFLAGS bit definitions. */
@@ -172,9 +424,6 @@ struct group_dual {
#define EFLG_PF (1<<2)
#define EFLG_CF (1<<0)
-#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
-#define EFLG_RESERVED_ONE_MASK 2
-
/*
* Instruction emulation:
* Most instructions are emulated directly via a fragment of inline assembly
@@ -227,13 +476,13 @@ struct group_dual {
#define ON64(x)
#endif
-#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \
+#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \
do { \
__asm__ __volatile__ ( \
_PRE_EFLAGS("0", "4", "2") \
_op _suffix " %"_x"3,%1; " \
_POST_EFLAGS("0", "4", "2") \
- : "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\
+ : "=m" (_eflags), "=m" ((_dst).val), \
"=&r" (_tmp) \
: _y ((_src).val), "i" (EFLAGS_MASK)); \
} while (0)
@@ -246,13 +495,13 @@ struct group_dual {
\
switch ((_dst).bytes) { \
case 2: \
- ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\
+ ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \
break; \
case 4: \
- ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\
+ ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \
break; \
case 8: \
- ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \
+ ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \
break; \
} \
} while (0)
@@ -262,7 +511,7 @@ struct group_dual {
unsigned long _tmp; \
switch ((_dst).bytes) { \
case 1: \
- ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \
+ ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \
break; \
default: \
__emulate_2op_nobyte(_op, _src, _dst, _eflags, \
@@ -349,91 +598,16 @@ struct group_dual {
} \
} while (0)
-#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix) \
- do { \
- unsigned long _tmp; \
- \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "4", "1") \
- _op _suffix " %5; " \
- _POST_EFLAGS("0", "4", "1") \
- : "=m" (_eflags), "=&r" (_tmp), \
- "+a" (_rax), "+d" (_rdx) \
- : "i" (EFLAGS_MASK), "m" ((_src).val), \
- "a" (_rax), "d" (_rdx)); \
- } while (0)
-
-#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \
- do { \
- unsigned long _tmp; \
- \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "5", "1") \
- "1: \n\t" \
- _op _suffix " %6; " \
- "2: \n\t" \
- _POST_EFLAGS("0", "5", "1") \
- ".pushsection .fixup,\"ax\" \n\t" \
- "3: movb $1, %4 \n\t" \
- "jmp 2b \n\t" \
- ".popsection \n\t" \
- _ASM_EXTABLE(1b, 3b) \
- : "=m" (_eflags), "=&r" (_tmp), \
- "+a" (_rax), "+d" (_rdx), "+qm"(_ex) \
- : "i" (EFLAGS_MASK), "m" ((_src).val), \
- "a" (_rax), "d" (_rdx)); \
- } while (0)
-
-/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
-#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \
- do { \
- switch((_src).bytes) { \
- case 1: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "b"); break; \
- case 2: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "w"); break; \
- case 4: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "l"); break; \
- case 8: ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "q")); break; \
- } \
- } while (0)
-
-#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex) \
- do { \
- switch((_src).bytes) { \
- case 1: \
- __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
- _eflags, "b", _ex); \
- break; \
- case 2: \
- __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
- _eflags, "w", _ex); \
- break; \
- case 4: \
- __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
- _eflags, "l", _ex); \
- break; \
- case 8: ON64( \
- __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
- _eflags, "q", _ex)); \
- break; \
- } \
- } while (0)
-
/* Fetch next part of the instruction being emulated. */
#define insn_fetch(_type, _size, _eip) \
({ unsigned long _x; \
rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
- if (rc != X86EMUL_CONTINUE) \
+ if (rc != 0) \
goto done; \
(_eip) += (_size); \
(_type)_x; \
})
-#define insn_fetch_arr(_arr, _size, _eip) \
-({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \
- if (rc != X86EMUL_CONTINUE) \
- goto done; \
- (_eip) += (_size); \
-})
-
static inline unsigned long ad_mask(struct decode_cache *c)
{
return (1UL << (c->ad_bytes << 3)) - 1;
@@ -450,9 +624,9 @@ address_mask(struct decode_cache *c, unsigned long reg)
}
static inline unsigned long
-register_address(struct decode_cache *c, unsigned long reg)
+register_address(struct decode_cache *c, unsigned long base, unsigned long reg)
{
- return address_mask(c, reg);
+ return base + address_mask(c, reg);
}
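
As the hunk above shows, register_address() in this tree adds a segment base to the masked register value: address_mask() truncates the register to the current address size (ad_bytes), so 16-bit addressing uses only the low 16 bits. A minimal illustration with arbitrary example values:

/*
 * Minimal illustration of ad_mask()/address_mask()/register_address() as
 * changed above: the register is truncated to the current address size and
 * the segment base is added on top.  Values are arbitrary examples.
 */
#include <stdio.h>

static unsigned long ad_mask_bytes(int ad_bytes)
{
	/* 0xffff for 2-byte addressing, 0xffffffff for 4-byte addressing */
	return (1UL << (ad_bytes << 3)) - 1;
}

static unsigned long reg_address(unsigned long seg_base, unsigned long reg,
				 int ad_bytes)
{
	if (ad_bytes == (int)sizeof(unsigned long))	/* full width */
		return seg_base + reg;
	return seg_base + (reg & ad_mask_bytes(ad_bytes));
}

int main(void)
{
	/* 16-bit addressing: only the low 16 bits of the register count */
	printf("%#lx\n", reg_address(0x20000, 0x12345, 2));	/* 0x22345 */
	return 0;
}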
static inline void
@@ -475,102 +649,69 @@ static void set_seg_override(struct decode_cache *c, int seg)
c->seg_override = seg;
}
-static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops, int seg)
+static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
{
if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
return 0;
- return ops->get_cached_segment_base(seg, ctxt->vcpu);
+ return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg);
}
-static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- struct decode_cache *c)
+static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
+ struct decode_cache *c)
{
if (!c->has_seg_override)
return 0;
- return c->seg_override;
-}
-
-static ulong linear(struct x86_emulate_ctxt *ctxt,
- struct segmented_address addr)
-{
- struct decode_cache *c = &ctxt->decode;
- ulong la;
-
- la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
- if (c->ad_bytes != 8)
- la &= (u32)-1;
- return la;
-}
-
-static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
- u32 error, bool valid)
-{
- ctxt->exception.vector = vec;
- ctxt->exception.error_code = error;
- ctxt->exception.error_code_valid = valid;
- return X86EMUL_PROPAGATE_FAULT;
-}
-
-static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
-{
- return emulate_exception(ctxt, GP_VECTOR, err, true);
-}
-
-static int emulate_ud(struct x86_emulate_ctxt *ctxt)
-{
- return emulate_exception(ctxt, UD_VECTOR, 0, false);
+ return seg_base(ctxt, c->seg_override);
}
-static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
+static unsigned long es_base(struct x86_emulate_ctxt *ctxt)
{
- return emulate_exception(ctxt, TS_VECTOR, err, true);
+ return seg_base(ctxt, VCPU_SREG_ES);
}
-static int emulate_de(struct x86_emulate_ctxt *ctxt)
+static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
{
- return emulate_exception(ctxt, DE_VECTOR, 0, false);
+ return seg_base(ctxt, VCPU_SREG_SS);
}
static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
- unsigned long eip, u8 *dest)
+ unsigned long linear, u8 *dest)
{
struct fetch_cache *fc = &ctxt->decode.fetch;
int rc;
- int size, cur_size;
+ int size;
- if (eip == fc->end) {
- cur_size = fc->end - fc->start;
- size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
- rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
- size, ctxt->vcpu, &ctxt->exception);
- if (rc != X86EMUL_CONTINUE)
+ if (linear < fc->start || linear >= fc->end) {
+ size = min(15UL, PAGE_SIZE - offset_in_page(linear));
+ rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL);
+ if (rc)
return rc;
- fc->end += size;
+ fc->start = linear;
+ fc->end = linear + size;
}
- *dest = fc->data[eip - fc->start];
- return X86EMUL_CONTINUE;
+ *dest = fc->data[linear - fc->start];
+ return 0;
}
static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
unsigned long eip, void *dest, unsigned size)
{
- int rc;
+ int rc = 0;
/* x86 instructions are limited to 15 bytes. */
- if (eip + size - ctxt->eip > 15)
+ if (eip + size - ctxt->decode.eip_orig > 15)
return X86EMUL_UNHANDLEABLE;
+ eip += ctxt->cs_base;
while (size--) {
rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
- if (rc != X86EMUL_CONTINUE)
+ if (rc)
return rc;
}
- return X86EMUL_CONTINUE;
+ return 0;
}
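
do_insn_fetch() enforces the architectural 15-byte instruction-length limit before pulling bytes through the per-instruction fetch cache: however the fetch is split up, the distance from the instruction's starting eip may never exceed 15 bytes. A small sketch of just that guard; the helper name is made up.

/*
 * Sketch of the 15-byte instruction-length guard enforced in do_insn_fetch().
 */
#include <stdbool.h>
#include <stdio.h>

static bool fetch_allowed(unsigned long eip_start, unsigned long eip,
			  unsigned int size)
{
	/* mirrors: eip + size - eip_orig must not exceed 15 */
	return eip + size - eip_start <= 15;
}

int main(void)
{
	printf("%d\n", fetch_allowed(0x1000, 0x100e, 1));	/* 15th byte: 1 */
	printf("%d\n", fetch_allowed(0x1000, 0x100f, 1));	/* 16th byte: 0 */
	return 0;
}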
/*
@@ -591,7 +732,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
static int read_descriptor(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
- struct segmented_address addr,
+ void *ptr,
u16 *size, unsigned long *address, int op_bytes)
{
int rc;
@@ -599,13 +740,12 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
if (op_bytes == 2)
op_bytes = 3;
*address = 0;
- rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2,
- ctxt->vcpu, &ctxt->exception);
- if (rc != X86EMUL_CONTINUE)
+ rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
+ ctxt->vcpu, NULL);
+ if (rc)
return rc;
- addr.ea += 2;
- rc = ops->read_std(linear(ctxt, addr), address, op_bytes,
- ctxt->vcpu, &ctxt->exception);
+ rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
+ ctxt->vcpu, NULL);
return rc;
}
@@ -644,24 +784,6 @@ static int test_cc(unsigned int condition, unsigned int flags)
return (!!rc ^ (condition & 1));
}
-static void fetch_register_operand(struct operand *op)
-{
- switch (op->bytes) {
- case 1:
- op->val = *(u8 *)op->addr.reg;
- break;
- case 2:
- op->val = *(u16 *)op->addr.reg;
- break;
- case 4:
- op->val = *(u32 *)op->addr.reg;
- break;
- case 8:
- op->val = *(u64 *)op->addr.reg;
- break;
- }
-}
-
static void decode_register_operand(struct operand *op,
struct decode_cache *c,
int inhibit_bytereg)
@@ -673,25 +795,34 @@ static void decode_register_operand(struct operand *op,
reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
op->type = OP_REG;
if ((c->d & ByteOp) && !inhibit_bytereg) {
- op->addr.reg = decode_register(reg, c->regs, highbyte_regs);
+ op->ptr = decode_register(reg, c->regs, highbyte_regs);
+ op->val = *(u8 *)op->ptr;
op->bytes = 1;
} else {
- op->addr.reg = decode_register(reg, c->regs, 0);
+ op->ptr = decode_register(reg, c->regs, 0);
op->bytes = c->op_bytes;
+ switch (op->bytes) {
+ case 2:
+ op->val = *(u16 *)op->ptr;
+ break;
+ case 4:
+ op->val = *(u32 *)op->ptr;
+ break;
+ case 8:
+ op->val = *(u64 *) op->ptr;
+ break;
+ }
}
- fetch_register_operand(op);
op->orig_val = op->val;
}
static int decode_modrm(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- struct operand *op)
+ struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
u8 sib;
int index_reg = 0, base_reg = 0, scale;
- int rc = X86EMUL_CONTINUE;
- ulong modrm_ea = 0;
+ int rc = 0;
if (c->rex_prefix) {
c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
@@ -703,19 +834,16 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
c->modrm_mod |= (c->modrm & 0xc0) >> 6;
c->modrm_reg |= (c->modrm & 0x38) >> 3;
c->modrm_rm |= (c->modrm & 0x07);
- c->modrm_seg = VCPU_SREG_DS;
+ c->modrm_ea = 0;
+ c->use_modrm_ea = 1;
if (c->modrm_mod == 3) {
- op->type = OP_REG;
- op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- op->addr.reg = decode_register(c->modrm_rm,
+ c->modrm_ptr = decode_register(c->modrm_rm,
c->regs, c->d & ByteOp);
- fetch_register_operand(op);
+ c->modrm_val = *(unsigned long *)c->modrm_ptr;
return rc;
}
- op->type = OP_MEM;
-
if (c->ad_bytes == 2) {
unsigned bx = c->regs[VCPU_REGS_RBX];
unsigned bp = c->regs[VCPU_REGS_RBP];
@@ -726,46 +854,47 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
switch (c->modrm_mod) {
case 0:
if (c->modrm_rm == 6)
- modrm_ea += insn_fetch(u16, 2, c->eip);
+ c->modrm_ea += insn_fetch(u16, 2, c->eip);
break;
case 1:
- modrm_ea += insn_fetch(s8, 1, c->eip);
+ c->modrm_ea += insn_fetch(s8, 1, c->eip);
break;
case 2:
- modrm_ea += insn_fetch(u16, 2, c->eip);
+ c->modrm_ea += insn_fetch(u16, 2, c->eip);
break;
}
switch (c->modrm_rm) {
case 0:
- modrm_ea += bx + si;
+ c->modrm_ea += bx + si;
break;
case 1:
- modrm_ea += bx + di;
+ c->modrm_ea += bx + di;
break;
case 2:
- modrm_ea += bp + si;
+ c->modrm_ea += bp + si;
break;
case 3:
- modrm_ea += bp + di;
+ c->modrm_ea += bp + di;
break;
case 4:
- modrm_ea += si;
+ c->modrm_ea += si;
break;
case 5:
- modrm_ea += di;
+ c->modrm_ea += di;
break;
case 6:
if (c->modrm_mod != 0)
- modrm_ea += bp;
+ c->modrm_ea += bp;
break;
case 7:
- modrm_ea += bx;
+ c->modrm_ea += bx;
break;
}
if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
(c->modrm_rm == 6 && c->modrm_mod != 0))
- c->modrm_seg = VCPU_SREG_SS;
- modrm_ea = (u16)modrm_ea;
+ if (!c->has_seg_override)
+ set_seg_override(c, VCPU_SREG_SS);
+ c->modrm_ea = (u16)c->modrm_ea;
} else {
/* 32/64-bit ModR/M decode. */
if ((c->modrm_rm & 7) == 4) {
@@ -775,377 +904,358 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
scale = sib >> 6;
if ((base_reg & 7) == 5 && c->modrm_mod == 0)
- modrm_ea += insn_fetch(s32, 4, c->eip);
+ c->modrm_ea += insn_fetch(s32, 4, c->eip);
else
- modrm_ea += c->regs[base_reg];
+ c->modrm_ea += c->regs[base_reg];
if (index_reg != 4)
- modrm_ea += c->regs[index_reg] << scale;
+ c->modrm_ea += c->regs[index_reg] << scale;
} else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
if (ctxt->mode == X86EMUL_MODE_PROT64)
c->rip_relative = 1;
} else
- modrm_ea += c->regs[c->modrm_rm];
+ c->modrm_ea += c->regs[c->modrm_rm];
switch (c->modrm_mod) {
case 0:
if (c->modrm_rm == 5)
- modrm_ea += insn_fetch(s32, 4, c->eip);
+ c->modrm_ea += insn_fetch(s32, 4, c->eip);
break;
case 1:
- modrm_ea += insn_fetch(s8, 1, c->eip);
+ c->modrm_ea += insn_fetch(s8, 1, c->eip);
break;
case 2:
- modrm_ea += insn_fetch(s32, 4, c->eip);
+ c->modrm_ea += insn_fetch(s32, 4, c->eip);
break;
}
}
- op->addr.mem.ea = modrm_ea;
done:
return rc;
}
static int decode_abs(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- struct operand *op)
+ struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- int rc = X86EMUL_CONTINUE;
+ int rc = 0;
- op->type = OP_MEM;
switch (c->ad_bytes) {
case 2:
- op->addr.mem.ea = insn_fetch(u16, 2, c->eip);
+ c->modrm_ea = insn_fetch(u16, 2, c->eip);
break;
case 4:
- op->addr.mem.ea = insn_fetch(u32, 4, c->eip);
+ c->modrm_ea = insn_fetch(u32, 4, c->eip);
break;
case 8:
- op->addr.mem.ea = insn_fetch(u64, 8, c->eip);
+ c->modrm_ea = insn_fetch(u64, 8, c->eip);
break;
}
done:
return rc;
}
-static void fetch_bit_operand(struct decode_cache *c)
+int
+x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
{
- long sv = 0, mask;
+ struct decode_cache *c = &ctxt->decode;
+ int rc = 0;
+ int mode = ctxt->mode;
+ int def_op_bytes, def_ad_bytes, group;
- if (c->dst.type == OP_MEM && c->src.type == OP_REG) {
- mask = ~(c->dst.bytes * 8 - 1);
+ /* Shadow copy of register state. Committed on successful emulation. */
- if (c->src.bytes == 2)
- sv = (s16)c->src.val & (s16)mask;
- else if (c->src.bytes == 4)
- sv = (s32)c->src.val & (s32)mask;
+ memset(c, 0, sizeof(struct decode_cache));
+ c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu);
+ ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
+ memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
- c->dst.addr.mem.ea += (sv >> 3);
+ switch (mode) {
+ case X86EMUL_MODE_REAL:
+ case X86EMUL_MODE_VM86:
+ case X86EMUL_MODE_PROT16:
+ def_op_bytes = def_ad_bytes = 2;
+ break;
+ case X86EMUL_MODE_PROT32:
+ def_op_bytes = def_ad_bytes = 4;
+ break;
+#ifdef CONFIG_X86_64
+ case X86EMUL_MODE_PROT64:
+ def_op_bytes = 4;
+ def_ad_bytes = 8;
+ break;
+#endif
+ default:
+ return -1;
}
- /* only subword offset */
- c->src.val &= (c->dst.bytes << 3) - 1;
-}
-
-static int read_emulated(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- unsigned long addr, void *dest, unsigned size)
-{
- int rc;
- struct read_cache *mc = &ctxt->decode.mem_read;
-
- while (size) {
- int n = min(size, 8u);
- size -= n;
- if (mc->pos < mc->end)
- goto read_cached;
+ c->op_bytes = def_op_bytes;
+ c->ad_bytes = def_ad_bytes;
- rc = ops->read_emulated(addr, mc->data + mc->end, n,
- &ctxt->exception, ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- mc->end += n;
+ /* Legacy prefixes. */
+ for (;;) {
+ switch (c->b = insn_fetch(u8, 1, c->eip)) {
+ case 0x66: /* operand-size override */
+ /* switch between 2/4 bytes */
+ c->op_bytes = def_op_bytes ^ 6;
+ break;
+ case 0x67: /* address-size override */
+ if (mode == X86EMUL_MODE_PROT64)
+ /* switch between 4/8 bytes */
+ c->ad_bytes = def_ad_bytes ^ 12;
+ else
+ /* switch between 2/4 bytes */
+ c->ad_bytes = def_ad_bytes ^ 6;
+ break;
+ case 0x26: /* ES override */
+ case 0x2e: /* CS override */
+ case 0x36: /* SS override */
+ case 0x3e: /* DS override */
+ set_seg_override(c, (c->b >> 3) & 3);
+ break;
+ case 0x64: /* FS override */
+ case 0x65: /* GS override */
+ set_seg_override(c, c->b & 7);
+ break;
+ case 0x40 ... 0x4f: /* REX */
+ if (mode != X86EMUL_MODE_PROT64)
+ goto done_prefixes;
+ c->rex_prefix = c->b;
+ continue;
+ case 0xf0: /* LOCK */
+ c->lock_prefix = 1;
+ break;
+ case 0xf2: /* REPNE/REPNZ */
+ c->rep_prefix = REPNE_PREFIX;
+ break;
+ case 0xf3: /* REP/REPE/REPZ */
+ c->rep_prefix = REPE_PREFIX;
+ break;
+ default:
+ goto done_prefixes;
+ }
- read_cached:
- memcpy(dest, mc->data + mc->pos, n);
- mc->pos += n;
- dest += n;
- addr += n;
- }
- return X86EMUL_CONTINUE;
-}
+ /* Any legacy prefix after a REX prefix nullifies its effect. */
-static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- unsigned int size, unsigned short port,
- void *dest)
-{
- struct read_cache *rc = &ctxt->decode.io_read;
-
- if (rc->pos == rc->end) { /* refill pio read ahead */
- struct decode_cache *c = &ctxt->decode;
- unsigned int in_page, n;
- unsigned int count = c->rep_prefix ?
- address_mask(c, c->regs[VCPU_REGS_RCX]) : 1;
- in_page = (ctxt->eflags & EFLG_DF) ?
- offset_in_page(c->regs[VCPU_REGS_RDI]) :
- PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]);
- n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
- count);
- if (n == 0)
- n = 1;
- rc->pos = rc->end = 0;
- if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu))
- return 0;
- rc->end = n * size;
+ c->rex_prefix = 0;
}
- memcpy(dest, rc->data + rc->pos, size);
- rc->pos += size;
- return 1;
-}
-
-static u32 desc_limit_scaled(struct kvm_desc_struct *desc)
-{
- u32 limit = kvm_get_desc_limit(desc);
-
- return desc->g ? (limit << 12) | 0xfff : limit;
-}
-
-static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- u16 selector, struct kvm_desc_ptr *dt)
-{
- if (selector & 1 << 2) {
- struct kvm_desc_struct desc;
- memset (dt, 0, sizeof *dt);
- if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu))
- return;
-
- dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
- dt->address = kvm_get_desc_base(&desc);
- } else
- ops->get_gdt(dt, ctxt->vcpu);
-}
-
-/* allowed just for 8 bytes segments */
-static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- u16 selector, struct kvm_desc_struct *desc)
-{
- struct kvm_desc_ptr dt;
- u16 index = selector >> 3;
- int ret;
- ulong addr;
-
- get_descriptor_table_ptr(ctxt, ops, selector, &dt);
-
- if (dt.size < index * 8 + 7)
- return emulate_gp(ctxt, selector & 0xfffc);
- addr = dt.address + index * 8;
- ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,
- &ctxt->exception);
-
- return ret;
-}
-
-/* allowed just for 8 bytes segments */
-static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- u16 selector, struct kvm_desc_struct *desc)
-{
- struct kvm_desc_ptr dt;
- u16 index = selector >> 3;
- ulong addr;
- int ret;
-
- get_descriptor_table_ptr(ctxt, ops, selector, &dt);
-
- if (dt.size < index * 8 + 7)
- return emulate_gp(ctxt, selector & 0xfffc);
-
- addr = dt.address + index * 8;
- ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu,
- &ctxt->exception);
+done_prefixes:
- return ret;
-}
+ /* REX prefix. */
+ if (c->rex_prefix)
+ if (c->rex_prefix & 8)
+ c->op_bytes = 8; /* REX.W */
-static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- u16 selector, int seg)
-{
- struct kvm_desc_struct seg_desc;
- u8 dpl, rpl, cpl;
- unsigned err_vec = GP_VECTOR;
- u32 err_code = 0;
- bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
- int ret;
-
- memset(&seg_desc, 0, sizeof seg_desc);
-
- if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86)
- || ctxt->mode == X86EMUL_MODE_REAL) {
- /* set real mode segment descriptor */
- kvm_set_desc_base(&seg_desc, selector << 4);
- kvm_set_desc_limit(&seg_desc, 0xffff);
- seg_desc.type = 3;
- seg_desc.p = 1;
- seg_desc.s = 1;
- goto load;
+ /* Opcode byte(s). */
+ c->d = opcode_table[c->b];
+ if (c->d == 0) {
+ /* Two-byte opcode? */
+ if (c->b == 0x0f) {
+ c->twobyte = 1;
+ c->b = insn_fetch(u8, 1, c->eip);
+ c->d = twobyte_table[c->b];
+ }
}
- /* NULL selector is not valid for TR, CS and SS */
- if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
- && null_selector)
- goto exception;
+ if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
+ kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");
+ return -1;
+ }
- /* TR should be in GDT only */
- if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
- goto exception;
+ if (c->d & Group) {
+ group = c->d & GroupMask;
+ c->modrm = insn_fetch(u8, 1, c->eip);
+ --c->eip;
- if (null_selector) /* for NULL selector skip all following checks */
- goto load;
+ group = (group << 3) + ((c->modrm >> 3) & 7);
+ if ((c->d & GroupDual) && (c->modrm >> 6) == 3)
+ c->d = group2_table[group];
+ else
+ c->d = group_table[group];
+ }
- ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc);
- if (ret != X86EMUL_CONTINUE)
- return ret;
+ /* Unrecognised? */
+ if (c->d == 0) {
+ DPRINTF("Cannot emulate %02x\n", c->b);
+ return -1;
+ }
- err_code = selector & 0xfffc;
- err_vec = GP_VECTOR;
+ if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
+ c->op_bytes = 8;
-	/* can't load system descriptor into segment selector */
- if (seg <= VCPU_SREG_GS && !seg_desc.s)
- goto exception;
+ /* ModRM and SIB bytes. */
+ if (c->d & ModRM)
+ rc = decode_modrm(ctxt, ops);
+ else if (c->d & MemAbs)
+ rc = decode_abs(ctxt, ops);
+ if (rc)
+ goto done;
- if (!seg_desc.p) {
- err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
- goto exception;
- }
+ if (!c->has_seg_override)
+ set_seg_override(c, VCPU_SREG_DS);
- rpl = selector & 3;
- dpl = seg_desc.dpl;
- cpl = ops->cpl(ctxt->vcpu);
+ if (!(!c->twobyte && c->b == 0x8d))
+ c->modrm_ea += seg_override_base(ctxt, c);
- switch (seg) {
- case VCPU_SREG_SS:
+ if (c->ad_bytes != 8)
+ c->modrm_ea = (u32)c->modrm_ea;
+ /*
+ * Decode and fetch the source operand: register, memory
+ * or immediate.
+ */
+ switch (c->d & SrcMask) {
+ case SrcNone:
+ break;
+ case SrcReg:
+ decode_register_operand(&c->src, c, 0);
+ break;
+ case SrcMem16:
+ c->src.bytes = 2;
+ goto srcmem_common;
+ case SrcMem32:
+ c->src.bytes = 4;
+ goto srcmem_common;
+ case SrcMem:
+ c->src.bytes = (c->d & ByteOp) ? 1 :
+ c->op_bytes;
+ /* Don't fetch the address for invlpg: it could be unmapped. */
+ if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
+ break;
+ srcmem_common:
/*
- * segment is not a writable data segment or segment
- * selector's RPL != CPL or segment selector's RPL != CPL
+ * For instructions with a ModR/M byte, switch to register
+ * access if Mod = 3.
*/
- if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl)
- goto exception;
- break;
- case VCPU_SREG_CS:
- if (!(seg_desc.type & 8))
- goto exception;
-
- if (seg_desc.type & 4) {
- /* conforming */
- if (dpl > cpl)
- goto exception;
- } else {
- /* nonconforming */
- if (rpl > cpl || dpl != cpl)
- goto exception;
+ if ((c->d & ModRM) && c->modrm_mod == 3) {
+ c->src.type = OP_REG;
+ c->src.val = c->modrm_val;
+ c->src.ptr = c->modrm_ptr;
+ break;
}
- /* CS(RPL) <- CPL */
- selector = (selector & 0xfffc) | cpl;
+ c->src.type = OP_MEM;
break;
- case VCPU_SREG_TR:
- if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
- goto exception;
+ case SrcImm:
+ case SrcImmU:
+ c->src.type = OP_IMM;
+ c->src.ptr = (unsigned long *)c->eip;
+ c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ if (c->src.bytes == 8)
+ c->src.bytes = 4;
+ /* NB. Immediates are sign-extended as necessary. */
+ switch (c->src.bytes) {
+ case 1:
+ c->src.val = insn_fetch(s8, 1, c->eip);
+ break;
+ case 2:
+ c->src.val = insn_fetch(s16, 2, c->eip);
+ break;
+ case 4:
+ c->src.val = insn_fetch(s32, 4, c->eip);
+ break;
+ }
+ if ((c->d & SrcMask) == SrcImmU) {
+ switch (c->src.bytes) {
+ case 1:
+ c->src.val &= 0xff;
+ break;
+ case 2:
+ c->src.val &= 0xffff;
+ break;
+ case 4:
+ c->src.val &= 0xffffffff;
+ break;
+ }
+ }
break;
- case VCPU_SREG_LDTR:
- if (seg_desc.s || seg_desc.type != 2)
- goto exception;
+ case SrcImmByte:
+ case SrcImmUByte:
+ c->src.type = OP_IMM;
+ c->src.ptr = (unsigned long *)c->eip;
+ c->src.bytes = 1;
+ if ((c->d & SrcMask) == SrcImmByte)
+ c->src.val = insn_fetch(s8, 1, c->eip);
+ else
+ c->src.val = insn_fetch(u8, 1, c->eip);
break;
- default: /* DS, ES, FS, or GS */
- /*
- * segment is not a data or readable code segment or
- * ((segment is a data or nonconforming code segment)
- * and (both RPL and CPL > DPL))
- */
- if ((seg_desc.type & 0xa) == 0x8 ||
- (((seg_desc.type & 0xc) != 0xc) &&
- (rpl > dpl && cpl > dpl)))
- goto exception;
+ case SrcOne:
+ c->src.bytes = 1;
+ c->src.val = 1;
break;
}
- if (seg_desc.s) {
- /* mark segment as accessed */
- seg_desc.type |= 1;
- ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- }
-load:
- ops->set_segment_selector(selector, seg, ctxt->vcpu);
- ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
- return X86EMUL_CONTINUE;
-exception:
- emulate_exception(ctxt, err_vec, err_code, true);
- return X86EMUL_PROPAGATE_FAULT;
-}
-
-static void write_register_operand(struct operand *op)
-{
- /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
- switch (op->bytes) {
- case 1:
- *(u8 *)op->addr.reg = (u8)op->val;
+ /*
+ * Decode and fetch the second source operand: register, memory
+ * or immediate.
+ */
+ switch (c->d & Src2Mask) {
+ case Src2None:
break;
- case 2:
- *(u16 *)op->addr.reg = (u16)op->val;
+ case Src2CL:
+ c->src2.bytes = 1;
+ c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
break;
- case 4:
- *op->addr.reg = (u32)op->val;
- break; /* 64b: zero-extend */
- case 8:
- *op->addr.reg = op->val;
+ case Src2ImmByte:
+ c->src2.type = OP_IMM;
+ c->src2.ptr = (unsigned long *)c->eip;
+ c->src2.bytes = 1;
+ c->src2.val = insn_fetch(u8, 1, c->eip);
+ break;
+ case Src2Imm16:
+ c->src2.type = OP_IMM;
+ c->src2.ptr = (unsigned long *)c->eip;
+ c->src2.bytes = 2;
+ c->src2.val = insn_fetch(u16, 2, c->eip);
+ break;
+ case Src2One:
+ c->src2.bytes = 1;
+ c->src2.val = 1;
break;
}
-}
-
-static inline int writeback(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
-{
- int rc;
- struct decode_cache *c = &ctxt->decode;
- switch (c->dst.type) {
- case OP_REG:
- write_register_operand(&c->dst);
- break;
- case OP_MEM:
- if (c->lock_prefix)
- rc = ops->cmpxchg_emulated(
- linear(ctxt, c->dst.addr.mem),
- &c->dst.orig_val,
- &c->dst.val,
- c->dst.bytes,
- &ctxt->exception,
- ctxt->vcpu);
- else
- rc = ops->write_emulated(
- linear(ctxt, c->dst.addr.mem),
- &c->dst.val,
- c->dst.bytes,
- &ctxt->exception,
- ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- return rc;
+ /* Decode and fetch the destination operand: register or memory. */
+ switch (c->d & DstMask) {
+ case ImplicitOps:
+ /* Special instructions do their own operand decoding. */
+ return 0;
+ case DstReg:
+ decode_register_operand(&c->dst, c,
+ c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
break;
- case OP_NONE:
- /* no writeback */
+ case DstMem:
+ if ((c->d & ModRM) && c->modrm_mod == 3) {
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.type = OP_REG;
+ c->dst.val = c->dst.orig_val = c->modrm_val;
+ c->dst.ptr = c->modrm_ptr;
+ break;
+ }
+ c->dst.type = OP_MEM;
break;
- default:
+ case DstAcc:
+ c->dst.type = OP_REG;
+ c->dst.bytes = c->op_bytes;
+ c->dst.ptr = &c->regs[VCPU_REGS_RAX];
+ switch (c->op_bytes) {
+ case 1:
+ c->dst.val = *(u8 *)c->dst.ptr;
+ break;
+ case 2:
+ c->dst.val = *(u16 *)c->dst.ptr;
+ break;
+ case 4:
+ c->dst.val = *(u32 *)c->dst.ptr;
+ break;
+ }
+ c->dst.orig_val = c->dst.val;
break;
}
- return X86EMUL_CONTINUE;
+
+ if (c->rip_relative)
+ c->modrm_ea += c->eip;
+
+done:
+ return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
}
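
In the prefix loop above, 0x66 and 0x67 flip the operand and address size against the mode defaults simply by XOR-ing with 6 or 12 (2 xor 6 = 4, 4 xor 6 = 2, 8 xor 12 = 4, 4 xor 12 = 8). A tiny demonstration of the trick for the 64-bit-mode defaults:

/*
 * How the 0x66/0x67 prefixes toggle operand/address size by XOR, as in the
 * legacy-prefix loop of x86_decode_insn() above.
 */
#include <stdio.h>

int main(void)
{
	int def_op_bytes = 4, def_ad_bytes = 8;	/* 64-bit mode defaults */

	printf("0x66: op_bytes %d -> %d\n", def_op_bytes, def_op_bytes ^ 6);
	printf("0x67: ad_bytes %d -> %d\n", def_ad_bytes, def_ad_bytes ^ 12);
	return 0;
}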
-static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
{
struct decode_cache *c = &ctxt->decode;
@@ -1153,8 +1263,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
c->dst.bytes = c->op_bytes;
c->dst.val = c->src.val;
register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
- c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
- c->dst.addr.mem.seg = VCPU_SREG_SS;
+ c->dst.ptr = (void *) register_address(c, ss_base(ctxt),
+ c->regs[VCPU_REGS_RSP]);
}
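
emulate_push() models the x86 push: RSP is pre-decremented by the operand size and the value is written at SS:RSP (the actual store happens later through c->dst writeback). A toy host-memory model of that sequence, not the emulator's guest-memory path:

/*
 * Toy host-memory model of the push performed by emulate_push(): RSP is
 * pre-decremented by the operand size and the value lands at SS:RSP.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void push(uint8_t *stack_seg, unsigned long *rsp,
		 const void *val, int op_bytes)
{
	*rsp -= op_bytes;			 /* pre-decrement RSP */
	memcpy(stack_seg + *rsp, val, op_bytes); /* store at SS:RSP   */
}

int main(void)
{
	uint8_t stack[64] = { 0 };
	unsigned long rsp = sizeof(stack);
	uint32_t val = 0xdeadbeef, top;

	push(stack, &rsp, &val, sizeof(val));
	memcpy(&top, &stack[rsp], sizeof(top));
	printf("rsp=%lu top=%#x\n", rsp, (unsigned)top);
	return 0;
}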
static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1163,11 +1273,10 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
{
struct decode_cache *c = &ctxt->decode;
int rc;
- struct segmented_address addr;
- addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
- addr.seg = VCPU_SREG_SS;
- rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len);
+ rc = ops->read_emulated(register_address(c, ss_base(ctxt),
+ c->regs[VCPU_REGS_RSP]),
+ dest, len, ctxt->vcpu);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -1182,7 +1291,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
int rc;
unsigned long val, change_mask;
int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
- int cpl = ops->cpl(ctxt->vcpu);
+ int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu);
rc = emulate_pop(ctxt, ops, &val, len);
if (rc != X86EMUL_CONTINUE)
@@ -1201,8 +1310,10 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
change_mask |= EFLG_IF;
break;
case X86EMUL_MODE_VM86:
- if (iopl < 3)
- return emulate_gp(ctxt, 0);
+ if (iopl < 3) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
change_mask |= EFLG_IF;
break;
default: /* real mode */
@@ -1216,14 +1327,15 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
return rc;
}
-static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops, int seg)
+static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
{
struct decode_cache *c = &ctxt->decode;
+ struct kvm_segment segment;
- c->src.val = ops->get_segment_selector(seg, ctxt->vcpu);
+ kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg);
- emulate_push(ctxt, ops);
+ c->src.val = segment.selector;
+ emulate_push(ctxt);
}
static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
@@ -1234,45 +1346,33 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
int rc;
rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
- if (rc != X86EMUL_CONTINUE)
+ if (rc != 0)
return rc;
- rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg);
+ rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg);
return rc;
}
-static int emulate_pusha(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static void emulate_pusha(struct x86_emulate_ctxt *ctxt)
{
struct decode_cache *c = &ctxt->decode;
unsigned long old_esp = c->regs[VCPU_REGS_RSP];
- int rc = X86EMUL_CONTINUE;
int reg = VCPU_REGS_RAX;
while (reg <= VCPU_REGS_RDI) {
(reg == VCPU_REGS_RSP) ?
(c->src.val = old_esp) : (c->src.val = c->regs[reg]);
- emulate_push(ctxt, ops);
-
- rc = writeback(ctxt, ops);
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
+ emulate_push(ctxt);
++reg;
}
-
- /* Disable writeback. */
- c->dst.type = OP_NONE;
-
- return rc;
}
static int emulate_popa(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- int rc = X86EMUL_CONTINUE;
+ int rc = 0;
int reg = VCPU_REGS_RDI;
while (reg >= VCPU_REGS_RAX) {
@@ -1283,160 +1383,23 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
}
rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
- if (rc != X86EMUL_CONTINUE)
+ if (rc != 0)
break;
--reg;
}
return rc;
}
-int emulate_int_real(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops, int irq)
-{
- struct decode_cache *c = &ctxt->decode;
- int rc;
- struct kvm_desc_ptr dt;
- gva_t cs_addr;
- gva_t eip_addr;
- u16 cs, eip;
-
- /* TODO: Add limit checks */
- c->src.val = ctxt->eflags;
- emulate_push(ctxt, ops);
- rc = writeback(ctxt, ops);
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
-
- c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
- emulate_push(ctxt, ops);
- rc = writeback(ctxt, ops);
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- c->src.val = c->eip;
- emulate_push(ctxt, ops);
- rc = writeback(ctxt, ops);
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- c->dst.type = OP_NONE;
-
- ops->get_idt(&dt, ctxt->vcpu);
-
- eip_addr = dt.address + (irq << 2);
- cs_addr = dt.address + (irq << 2) + 2;
-
- rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception);
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception);
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS);
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- c->eip = eip;
-
- return rc;
-}
-
-static int emulate_int(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops, int irq)
-{
- switch(ctxt->mode) {
- case X86EMUL_MODE_REAL:
- return emulate_int_real(ctxt, ops, irq);
- case X86EMUL_MODE_VM86:
- case X86EMUL_MODE_PROT16:
- case X86EMUL_MODE_PROT32:
- case X86EMUL_MODE_PROT64:
- default:
- /* Protected mode interrupts unimplemented yet */
- return X86EMUL_UNHANDLEABLE;
- }
-}
-
-static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
-{
- struct decode_cache *c = &ctxt->decode;
- int rc = X86EMUL_CONTINUE;
- unsigned long temp_eip = 0;
- unsigned long temp_eflags = 0;
- unsigned long cs = 0;
- unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF |
- EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF |
- EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */
- unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP;
-
- /* TODO: Add stack limit check */
-
- rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes);
-
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- if (temp_eip & ~0xffff)
- return emulate_gp(ctxt, 0);
-
- rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
-
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes);
-
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
-
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- c->eip = temp_eip;
-
-
- if (c->op_bytes == 4)
- ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
- else if (c->op_bytes == 2) {
- ctxt->eflags &= ~0xffff;
- ctxt->eflags |= temp_eflags;
- }
-
- ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
- ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
-
- return rc;
-}
-
-static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops* ops)
-{
- switch(ctxt->mode) {
- case X86EMUL_MODE_REAL:
- return emulate_iret_real(ctxt, ops);
- case X86EMUL_MODE_VM86:
- case X86EMUL_MODE_PROT16:
- case X86EMUL_MODE_PROT32:
- case X86EMUL_MODE_PROT64:
- default:
- /* iret from protected mode unimplemented yet */
- return X86EMUL_UNHANDLEABLE;
- }
-}
-
static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
+ int rc;
- return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
+ rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
+ if (rc != 0)
+ return rc;
+ return 0;
}
static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
@@ -1472,9 +1435,7 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- unsigned long *rax = &c->regs[VCPU_REGS_RAX];
- unsigned long *rdx = &c->regs[VCPU_REGS_RDX];
- u8 de = 0;
+ int rc = 0;
switch (c->modrm_reg) {
case 0 ... 1: /* test */
@@ -1486,26 +1447,12 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
case 3: /* neg */
emulate_1op("neg", c->dst, ctxt->eflags);
break;
- case 4: /* mul */
- emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags);
- break;
- case 5: /* imul */
- emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags);
- break;
- case 6: /* div */
- emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx,
- ctxt->eflags, de);
- break;
- case 7: /* idiv */
- emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx,
- ctxt->eflags, de);
- break;
default:
- return X86EMUL_UNHANDLEABLE;
+ DPRINTF("Cannot emulate %02x\n", c->b);
+ rc = X86EMUL_UNHANDLEABLE;
+ break;
}
- if (de)
- return emulate_de(ctxt);
- return X86EMUL_CONTINUE;
+ return rc;
}
static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
@@ -1525,37 +1472,48 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
old_eip = c->eip;
c->eip = c->src.val;
c->src.val = old_eip;
- emulate_push(ctxt, ops);
+ emulate_push(ctxt);
break;
}
case 4: /* jmp abs */
c->eip = c->src.val;
break;
case 6: /* push */
- emulate_push(ctxt, ops);
+ emulate_push(ctxt);
break;
}
- return X86EMUL_CONTINUE;
+ return 0;
}
static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+ struct x86_emulate_ops *ops,
+ unsigned long memop)
{
struct decode_cache *c = &ctxt->decode;
- u64 old = c->dst.orig_val64;
+ u64 old, new;
+ int rc;
+
+ rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
+
c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
ctxt->eflags &= ~EFLG_ZF;
+
} else {
- c->dst.val64 = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
- (u32) c->regs[VCPU_REGS_RBX];
+ new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
+ (u32) c->regs[VCPU_REGS_RBX];
+ rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
ctxt->eflags |= EFLG_ZF;
}
- return X86EMUL_CONTINUE;
+ return 0;
}
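
emulate_grp9() implements the Group 9 /1 encoding, cmpxchg8b: EDX:EAX is compared with the 64-bit memory operand; on a match ECX:EBX is stored and ZF is set, otherwise EDX:EAX is reloaded from memory and ZF is cleared. A behavioural model of that semantic (not the emulator's code):

/*
 * Behavioural model of the cmpxchg8b semantics handled by emulate_grp9().
 */
#include <stdint.h>
#include <stdio.h>

static int cmpxchg8b(uint64_t *mem, uint32_t *eax, uint32_t *edx,
		     uint32_t ebx, uint32_t ecx)
{
	uint64_t old = *mem;

	if ((uint32_t)old == *eax && (uint32_t)(old >> 32) == *edx) {
		*mem = ((uint64_t)ecx << 32) | ebx;
		return 1;			/* ZF set */
	}
	*eax = (uint32_t)old;
	*edx = (uint32_t)(old >> 32);
	return 0;				/* ZF clear */
}

int main(void)
{
	uint64_t m = 0x1111111122222222ULL;
	uint32_t eax = 0x22222222, edx = 0x11111111;

	printf("zf=%d mem=%#llx\n",
	       cmpxchg8b(&m, &eax, &edx, 0x44444444, 0x33333333),
	       (unsigned long long)m);
	return 0;
}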
static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
@@ -1566,108 +1524,153 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
unsigned long cs;
rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes);
- if (rc != X86EMUL_CONTINUE)
+ if (rc)
return rc;
if (c->op_bytes == 4)
c->eip = (u32)c->eip;
rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
- if (rc != X86EMUL_CONTINUE)
+ if (rc)
return rc;
- rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
+ rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS);
return rc;
}
-static int emulate_load_segment(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops, int seg)
+static inline int writeback(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
{
- struct decode_cache *c = &ctxt->decode;
- unsigned short sel;
int rc;
+ struct decode_cache *c = &ctxt->decode;
- memcpy(&sel, c->src.valptr + c->op_bytes, 2);
-
- rc = load_segment_descriptor(ctxt, ops, sel, seg);
- if (rc != X86EMUL_CONTINUE)
- return rc;
+ switch (c->dst.type) {
+ case OP_REG:
+ /* The 4-byte case *is* correct:
+ * in 64-bit mode we zero-extend.
+ */
+ switch (c->dst.bytes) {
+ case 1:
+ *(u8 *)c->dst.ptr = (u8)c->dst.val;
+ break;
+ case 2:
+ *(u16 *)c->dst.ptr = (u16)c->dst.val;
+ break;
+ case 4:
+ *c->dst.ptr = (u32)c->dst.val;
+ break; /* 64b: zero-ext */
+ case 8:
+ *c->dst.ptr = c->dst.val;
+ break;
+ }
+ break;
+ case OP_MEM:
+ if (c->lock_prefix)
+ rc = ops->cmpxchg_emulated(
+ (unsigned long)c->dst.ptr,
+ &c->dst.orig_val,
+ &c->dst.val,
+ c->dst.bytes,
+ ctxt->vcpu);
+ else
+ rc = ops->write_emulated(
+ (unsigned long)c->dst.ptr,
+ &c->dst.val,
+ c->dst.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ break;
+ case OP_NONE:
+ /* no writeback */
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
- c->dst.val = c->src.val;
- return rc;
+static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
+{
+ u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
+ /*
+	 * an "sti; sti" sequence only disables interrupts for the first
+	 * instruction. So, if the last instruction, be it emulated or
+	 * not, left the system with the INT_STI flag enabled, it
+	 * means that the last instruction is an sti. We should not
+	 * leave the flag on in this case. The same goes for mov ss.
+ */
+ if (!(int_shadow & mask))
+ ctxt->interruptibility = mask;
}
static inline void
setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops, struct kvm_desc_struct *cs,
- struct kvm_desc_struct *ss)
+ struct kvm_segment *cs, struct kvm_segment *ss)
{
- memset(cs, 0, sizeof(struct kvm_desc_struct));
- ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu);
- memset(ss, 0, sizeof(struct kvm_desc_struct));
+ memset(cs, 0, sizeof(struct kvm_segment));
+ kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS);
+ memset(ss, 0, sizeof(struct kvm_segment));
cs->l = 0; /* will be adjusted later */
- kvm_set_desc_base(cs, 0); /* flat segment */
+ cs->base = 0; /* flat segment */
cs->g = 1; /* 4kb granularity */
- kvm_set_desc_limit(cs, 0xfffff); /* 4GB limit */
+ cs->limit = 0xffffffff; /* 4GB limit */
cs->type = 0x0b; /* Read, Execute, Accessed */
cs->s = 1;
cs->dpl = 0; /* will be adjusted later */
- cs->p = 1;
- cs->d = 1;
+ cs->present = 1;
+ cs->db = 1;
- kvm_set_desc_base(ss, 0); /* flat segment */
- kvm_set_desc_limit(ss, 0xfffff); /* 4GB limit */
+ ss->unusable = 0;
+ ss->base = 0; /* flat segment */
+ ss->limit = 0xffffffff; /* 4GB limit */
ss->g = 1; /* 4kb granularity */
ss->s = 1;
ss->type = 0x03; /* Read/Write, Accessed */
- ss->d = 1; /* 32bit stack segment */
+ ss->db = 1; /* 32bit stack segment */
ss->dpl = 0;
- ss->p = 1;
+ ss->present = 1;
}
static int
-emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+emulate_syscall(struct x86_emulate_ctxt *ctxt)
{
struct decode_cache *c = &ctxt->decode;
- struct kvm_desc_struct cs, ss;
+ struct kvm_segment cs, ss;
u64 msr_data;
- u16 cs_sel, ss_sel;
/* syscall is not available in real mode */
- if (ctxt->mode == X86EMUL_MODE_REAL ||
- ctxt->mode == X86EMUL_MODE_VM86)
- return emulate_ud(ctxt);
+ if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86)
+ return X86EMUL_UNHANDLEABLE;
- setup_syscalls_segments(ctxt, ops, &cs, &ss);
+ setup_syscalls_segments(ctxt, &cs, &ss);
- ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
msr_data >>= 32;
- cs_sel = (u16)(msr_data & 0xfffc);
- ss_sel = (u16)(msr_data + 8);
+ cs.selector = (u16)(msr_data & 0xfffc);
+ ss.selector = (u16)(msr_data + 8);
if (is_long_mode(ctxt->vcpu)) {
- cs.d = 0;
+ cs.db = 0;
cs.l = 1;
}
- ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
- ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
+ kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
+ kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
c->regs[VCPU_REGS_RCX] = c->eip;
if (is_long_mode(ctxt->vcpu)) {
#ifdef CONFIG_X86_64
c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
- ops->get_msr(ctxt->vcpu,
- ctxt->mode == X86EMUL_MODE_PROT64 ?
- MSR_LSTAR : MSR_CSTAR, &msr_data);
+ kvm_x86_ops->get_msr(ctxt->vcpu,
+ ctxt->mode == X86EMUL_MODE_PROT64 ?
+ MSR_LSTAR : MSR_CSTAR, &msr_data);
c->eip = msr_data;
- ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
ctxt->eflags &= ~(msr_data | EFLG_RF);
#endif
} else {
/* legacy mode */
- ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
c->eip = (u32)msr_data;
ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
@@ -1677,77 +1680,81 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
}
static int
-emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+emulate_sysenter(struct x86_emulate_ctxt *ctxt)
{
struct decode_cache *c = &ctxt->decode;
- struct kvm_desc_struct cs, ss;
+ struct kvm_segment cs, ss;
u64 msr_data;
- u16 cs_sel, ss_sel;
/* inject #GP if in real mode */
- if (ctxt->mode == X86EMUL_MODE_REAL)
- return emulate_gp(ctxt, 0);
+ if (ctxt->mode == X86EMUL_MODE_REAL) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return X86EMUL_UNHANDLEABLE;
+ }
/* XXX sysenter/sysexit have not been tested in 64bit mode.
* Therefore, we inject an #UD.
*/
if (ctxt->mode == X86EMUL_MODE_PROT64)
- return emulate_ud(ctxt);
+ return X86EMUL_UNHANDLEABLE;
- setup_syscalls_segments(ctxt, ops, &cs, &ss);
+ setup_syscalls_segments(ctxt, &cs, &ss);
- ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
switch (ctxt->mode) {
case X86EMUL_MODE_PROT32:
- if ((msr_data & 0xfffc) == 0x0)
- return emulate_gp(ctxt, 0);
+ if ((msr_data & 0xfffc) == 0x0) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
break;
case X86EMUL_MODE_PROT64:
- if (msr_data == 0x0)
- return emulate_gp(ctxt, 0);
+ if (msr_data == 0x0) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
break;
}
ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
- cs_sel = (u16)msr_data;
- cs_sel &= ~SELECTOR_RPL_MASK;
- ss_sel = cs_sel + 8;
- ss_sel &= ~SELECTOR_RPL_MASK;
+ cs.selector = (u16)msr_data;
+ cs.selector &= ~SELECTOR_RPL_MASK;
+ ss.selector = cs.selector + 8;
+ ss.selector &= ~SELECTOR_RPL_MASK;
if (ctxt->mode == X86EMUL_MODE_PROT64
|| is_long_mode(ctxt->vcpu)) {
- cs.d = 0;
+ cs.db = 0;
cs.l = 1;
}
- ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
- ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
+ kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
+ kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
- ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
c->eip = msr_data;
- ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
c->regs[VCPU_REGS_RSP] = msr_data;
return X86EMUL_CONTINUE;
}
static int
-emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+emulate_sysexit(struct x86_emulate_ctxt *ctxt)
{
struct decode_cache *c = &ctxt->decode;
- struct kvm_desc_struct cs, ss;
+ struct kvm_segment cs, ss;
u64 msr_data;
int usermode;
- u16 cs_sel, ss_sel;
/* inject #GP if in real mode or Virtual 8086 mode */
if (ctxt->mode == X86EMUL_MODE_REAL ||
- ctxt->mode == X86EMUL_MODE_VM86)
- return emulate_gp(ctxt, 0);
+ ctxt->mode == X86EMUL_MODE_VM86) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return X86EMUL_UNHANDLEABLE;
+ }
- setup_syscalls_segments(ctxt, ops, &cs, &ss);
+ setup_syscalls_segments(ctxt, &cs, &ss);
if ((c->rex_prefix & 0x8) != 0x0)
usermode = X86EMUL_MODE_PROT64;
@@ -1756,39 +1763,40 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
cs.dpl = 3;
ss.dpl = 3;
- ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
switch (usermode) {
case X86EMUL_MODE_PROT32:
- cs_sel = (u16)(msr_data + 16);
- if ((msr_data & 0xfffc) == 0x0)
- return emulate_gp(ctxt, 0);
- ss_sel = (u16)(msr_data + 24);
+ cs.selector = (u16)(msr_data + 16);
+ if ((msr_data & 0xfffc) == 0x0) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ ss.selector = (u16)(msr_data + 24);
break;
case X86EMUL_MODE_PROT64:
- cs_sel = (u16)(msr_data + 32);
- if (msr_data == 0x0)
- return emulate_gp(ctxt, 0);
- ss_sel = cs_sel + 8;
- cs.d = 0;
+ cs.selector = (u16)(msr_data + 32);
+ if (msr_data == 0x0) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ ss.selector = cs.selector + 8;
+ cs.db = 0;
cs.l = 1;
break;
}
- cs_sel |= SELECTOR_RPL_MASK;
- ss_sel |= SELECTOR_RPL_MASK;
+ cs.selector |= SELECTOR_RPL_MASK;
+ ss.selector |= SELECTOR_RPL_MASK;
- ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
- ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
+ kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
+ kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
- c->eip = c->regs[VCPU_REGS_RDX];
- c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX];
+ c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
+ c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
return X86EMUL_CONTINUE;
}
-static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
{
int iopl;
if (ctxt->mode == X86EMUL_MODE_REAL)
@@ -1796,32 +1804,32 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
if (ctxt->mode == X86EMUL_MODE_VM86)
return true;
iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
- return ops->cpl(ctxt->vcpu) > iopl;
+ return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl;
}
static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
u16 port, u16 len)
{
- struct kvm_desc_struct tr_seg;
+ struct kvm_segment tr_seg;
int r;
u16 io_bitmap_ptr;
u8 perm, bit_idx = port & 0x7;
unsigned mask = (1 << len) - 1;
- ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu);
- if (!tr_seg.p)
+ kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR);
+ if (tr_seg.unusable)
return false;
- if (desc_limit_scaled(&tr_seg) < 103)
+ if (tr_seg.limit < 103)
return false;
- r = ops->read_std(kvm_get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2,
- ctxt->vcpu, NULL);
+ r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu,
+ NULL);
if (r != X86EMUL_CONTINUE)
return false;
- if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
+ if (io_bitmap_ptr + port/8 > tr_seg.limit)
return false;
- r = ops->read_std(kvm_get_desc_base(&tr_seg) + io_bitmap_ptr + port/8,
- &perm, 1, ctxt->vcpu, NULL);
+ r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1,
+ ctxt->vcpu, NULL);
if (r != X86EMUL_CONTINUE)
return false;
if ((perm >> bit_idx) & mask)
@@ -1833,1231 +1841,118 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
u16 port, u16 len)
{
- if (ctxt->perm_ok)
- return true;
-
- if (emulator_bad_iopl(ctxt, ops))
+ if (emulator_bad_iopl(ctxt))
if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
return false;
-
- ctxt->perm_ok = true;
-
return true;
}
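
emulator_io_port_access_allowed above walks the TSS I/O permission bitmap: byte 102 of the TSS gives the bitmap offset, port N is governed by bit (N & 7) of bitmap byte N/8, and an access of len bytes is allowed only if all len bits starting there are clear. A sketch of just the bit arithmetic, assuming the relevant bitmap byte has already been fetched (names are illustrative):

#include <stdbool.h>
#include <stdint.h>

/* Given the bitmap byte covering 'port', decide whether a 'len'-byte
 * access is permitted: every bit from (port & 7) upward for 'len' bits
 * must be clear, matching the (perm >> bit_idx) & mask test above. */
bool io_bits_clear(uint8_t perm, uint16_t port, uint16_t len)
{
	uint8_t bit_idx = port & 0x7;
	unsigned mask = (1u << len) - 1;

	return ((perm >> bit_idx) & mask) == 0;
}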
-static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- struct tss_segment_16 *tss)
-{
- struct decode_cache *c = &ctxt->decode;
-
- tss->ip = c->eip;
- tss->flag = ctxt->eflags;
- tss->ax = c->regs[VCPU_REGS_RAX];
- tss->cx = c->regs[VCPU_REGS_RCX];
- tss->dx = c->regs[VCPU_REGS_RDX];
- tss->bx = c->regs[VCPU_REGS_RBX];
- tss->sp = c->regs[VCPU_REGS_RSP];
- tss->bp = c->regs[VCPU_REGS_RBP];
- tss->si = c->regs[VCPU_REGS_RSI];
- tss->di = c->regs[VCPU_REGS_RDI];
-
- tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
- tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
- tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
- tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
- tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
-}
-
-static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- struct tss_segment_16 *tss)
-{
- struct decode_cache *c = &ctxt->decode;
- int ret;
-
- c->eip = tss->ip;
- ctxt->eflags = tss->flag | 2;
- c->regs[VCPU_REGS_RAX] = tss->ax;
- c->regs[VCPU_REGS_RCX] = tss->cx;
- c->regs[VCPU_REGS_RDX] = tss->dx;
- c->regs[VCPU_REGS_RBX] = tss->bx;
- c->regs[VCPU_REGS_RSP] = tss->sp;
- c->regs[VCPU_REGS_RBP] = tss->bp;
- c->regs[VCPU_REGS_RSI] = tss->si;
- c->regs[VCPU_REGS_RDI] = tss->di;
-
- /*
- * SDM says that segment selectors are loaded before segment
- * descriptors
- */
- ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu);
- ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
- ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
- ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
-
- /*
-	 * Now load the segment descriptors. If a fault happens at this stage
-	 * it is handled in the context of the new task
- */
- ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
- if (ret != X86EMUL_CONTINUE)
- return ret;
-
- return X86EMUL_CONTINUE;
-}
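
The 16-bit save/load pair above serializes the fields of the legacy TSS image: instruction pointer and flags, the eight general registers, then the segment and LDT selectors, with selectors written back to the VCPU before their descriptors are reloaded, as the SDM comment notes. Purely for orientation, a cut-down sketch of the image those helpers touch; the authoritative layout is struct tss_segment_16 in the emulator headers, which also carries the privileged stack fields omitted here:

#include <stdint.h>

/* Illustrative subset of the 16-bit TSS image; field names mirror the
 * accesses in save_state_to_tss16()/load_state_from_tss16() above. */
struct tss16_sketch {
	uint16_t prev_task_link;
	uint16_t ip, flag;
	uint16_t ax, cx, dx, bx, sp, bp, si, di;
	uint16_t es, cs, ss, ds, ldt;
};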
-
-static int task_switch_16(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- u16 tss_selector, u16 old_tss_sel,
- ulong old_tss_base, struct kvm_desc_struct *new_desc)
-{
- struct tss_segment_16 tss_seg;
- int ret;
- u32 new_tss_base = kvm_get_desc_base(new_desc);
-
- ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &ctxt->exception);
- if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
- return ret;
-
- save_state_to_tss16(ctxt, ops, &tss_seg);
-
- ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &ctxt->exception);
- if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
- return ret;
-
- ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &ctxt->exception);
- if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
- return ret;
-
- if (old_tss_sel != 0xffff) {
- tss_seg.prev_task_link = old_tss_sel;
-
- ret = ops->write_std(new_tss_base,
- &tss_seg.prev_task_link,
- sizeof tss_seg.prev_task_link,
- ctxt->vcpu, &ctxt->exception);
- if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
- return ret;
- }
-
- return load_state_from_tss16(ctxt, ops, &tss_seg);
-}
-
-static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- struct tss_segment_32 *tss)
-{
- struct decode_cache *c = &ctxt->decode;
-
- tss->cr3 = ops->get_cr(3, ctxt->vcpu);
- tss->eip = c->eip;
- tss->eflags = ctxt->eflags;
- tss->eax = c->regs[VCPU_REGS_RAX];
- tss->ecx = c->regs[VCPU_REGS_RCX];
- tss->edx = c->regs[VCPU_REGS_RDX];
- tss->ebx = c->regs[VCPU_REGS_RBX];
- tss->esp = c->regs[VCPU_REGS_RSP];
- tss->ebp = c->regs[VCPU_REGS_RBP];
- tss->esi = c->regs[VCPU_REGS_RSI];
- tss->edi = c->regs[VCPU_REGS_RDI];
-
- tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
- tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
- tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
- tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
- tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu);
- tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu);
- tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
-}
-
-static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- struct tss_segment_32 *tss)
-{
- struct decode_cache *c = &ctxt->decode;
- int ret;
-
- if (ops->set_cr(3, tss->cr3, ctxt->vcpu))
- return emulate_gp(ctxt, 0);
- c->eip = tss->eip;
- ctxt->eflags = tss->eflags | 2;
- c->regs[VCPU_REGS_RAX] = tss->eax;
- c->regs[VCPU_REGS_RCX] = tss->ecx;
- c->regs[VCPU_REGS_RDX] = tss->edx;
- c->regs[VCPU_REGS_RBX] = tss->ebx;
- c->regs[VCPU_REGS_RSP] = tss->esp;
- c->regs[VCPU_REGS_RBP] = tss->ebp;
- c->regs[VCPU_REGS_RSI] = tss->esi;
- c->regs[VCPU_REGS_RDI] = tss->edi;
-
- /*
- * SDM says that segment selectors are loaded before segment
- * descriptors
- */
- ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu);
- ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
- ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
- ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
- ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu);
- ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu);
-
- /*
-	 * Now load the segment descriptors. If a fault happens at this stage
-	 * it is handled in the context of the new task
- */
- ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS);
- if (ret != X86EMUL_CONTINUE)
- return ret;
-
- return X86EMUL_CONTINUE;
-}
-
-static int task_switch_32(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- u16 tss_selector, u16 old_tss_sel,
- ulong old_tss_base, struct kvm_desc_struct *new_desc)
-{
- struct tss_segment_32 tss_seg;
- int ret;
- u32 new_tss_base = kvm_get_desc_base(new_desc);
-
- ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &ctxt->exception);
- if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
- return ret;
-
- save_state_to_tss32(ctxt, ops, &tss_seg);
-
- ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &ctxt->exception);
- if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
- return ret;
-
- ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &ctxt->exception);
- if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
- return ret;
-
- if (old_tss_sel != 0xffff) {
- tss_seg.prev_task_link = old_tss_sel;
-
- ret = ops->write_std(new_tss_base,
- &tss_seg.prev_task_link,
- sizeof tss_seg.prev_task_link,
- ctxt->vcpu, &ctxt->exception);
- if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
- return ret;
- }
-
- return load_state_from_tss32(ctxt, ops, &tss_seg);
-}
-
-static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- u16 tss_selector, int reason,
- bool has_error_code, u32 error_code)
-{
- struct kvm_desc_struct curr_tss_desc, next_tss_desc;
- int ret;
- u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
- ulong old_tss_base =
- ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu);
- u32 desc_limit;
-
- /* FIXME: old_tss_base == ~0 ? */
-
- ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc);
- if (ret != X86EMUL_CONTINUE)
- return ret;
-
- /* FIXME: check that next_tss_desc is tss */
-
- if (reason != TASK_SWITCH_IRET) {
- if ((tss_selector & 3) > next_tss_desc.dpl ||
- ops->cpl(ctxt->vcpu) > next_tss_desc.dpl)
- return emulate_gp(ctxt, 0);
- }
-
- desc_limit = desc_limit_scaled(&next_tss_desc);
- if (!next_tss_desc.p ||
- ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
- desc_limit < 0x2b)) {
- emulate_ts(ctxt, tss_selector & 0xfffc);
- return X86EMUL_PROPAGATE_FAULT;
- }
-
- if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
- curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
- write_segment_descriptor(ctxt, ops, old_tss_sel,
- &curr_tss_desc);
- }
-
- if (reason == TASK_SWITCH_IRET)
- ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
-
- /* set back link to prev task only if NT bit is set in eflags
-	   note that old_tss_sel is not used after this point */
- if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
- old_tss_sel = 0xffff;
-
- if (next_tss_desc.type & 8)
- ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel,
- old_tss_base, &next_tss_desc);
- else
- ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel,
- old_tss_base, &next_tss_desc);
- if (ret != X86EMUL_CONTINUE)
- return ret;
-
- if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE)
- ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT;
-
- if (reason != TASK_SWITCH_IRET) {
- next_tss_desc.type |= (1 << 1); /* set busy flag */
- write_segment_descriptor(ctxt, ops, tss_selector,
- &next_tss_desc);
- }
-
- ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu);
- ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu);
- ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
-
- if (has_error_code) {
- struct decode_cache *c = &ctxt->decode;
-
- c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
- c->lock_prefix = 0;
- c->src.val = (unsigned long) error_code;
- emulate_push(ctxt, ops);
- }
-
- return ret;
-}
-
-int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
- u16 tss_selector, int reason,
- bool has_error_code, u32 error_code)
-{
- struct x86_emulate_ops *ops = ctxt->ops;
- struct decode_cache *c = &ctxt->decode;
- int rc;
-
- c->eip = ctxt->eip;
- c->dst.type = OP_NONE;
-
- rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
- has_error_code, error_code);
-
- if (rc == X86EMUL_CONTINUE) {
- rc = writeback(ctxt, ops);
- if (rc == X86EMUL_CONTINUE)
- ctxt->eip = c->eip;
- }
-
- return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
-}
-
-static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
- int reg, struct operand *op)
-{
- struct decode_cache *c = &ctxt->decode;
- int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
-
- register_address_increment(c, &c->regs[reg], df * op->bytes);
- op->addr.mem.ea = register_address(c, c->regs[reg]);
- op->addr.mem.seg = seg;
-}
-
-static int em_push(struct x86_emulate_ctxt *ctxt)
-{
- emulate_push(ctxt, ctxt->ops);
- return X86EMUL_CONTINUE;
-}
-
-static int em_das(struct x86_emulate_ctxt *ctxt)
-{
- struct decode_cache *c = &ctxt->decode;
- u8 al, old_al;
- bool af, cf, old_cf;
-
- cf = ctxt->eflags & X86_EFLAGS_CF;
- al = c->dst.val;
-
- old_al = al;
- old_cf = cf;
- cf = false;
- af = ctxt->eflags & X86_EFLAGS_AF;
- if ((al & 0x0f) > 9 || af) {
- al -= 6;
- cf = old_cf | (al >= 250);
- af = true;
- } else {
- af = false;
- }
- if (old_al > 0x99 || old_cf) {
- al -= 0x60;
- cf = true;
- }
-
- c->dst.val = al;
- /* Set PF, ZF, SF */
- c->src.type = OP_IMM;
- c->src.val = 0;
- c->src.bytes = 1;
- emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
- ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
- if (cf)
- ctxt->eflags |= X86_EFLAGS_CF;
- if (af)
- ctxt->eflags |= X86_EFLAGS_AF;
- return X86EMUL_CONTINUE;
-}
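
em_das applies the DAS correction after a packed-BCD subtraction: subtract 6 from AL when the low nibble exceeds 9 (or AF was set), then subtract 0x60 when the original AL exceeded 0x99 (or CF was set), updating AF/CF along the way. A compilable restatement of just the AL arithmetic with a worked case; flag bookkeeping is deliberately omitted:

#include <stdint.h>

/* DAS adjustment of AL after a packed-BCD subtraction (flags omitted). */
uint8_t das_adjust(uint8_t al, int af, int cf)
{
	uint8_t old_al = al;

	if ((al & 0x0f) > 9 || af)
		al -= 6;
	if (old_al > 0x99 || cf)
		al -= 0x60;
	return al;
}

/* Example: BCD 31 - BCD 04 leaves AL = 0x2d; das_adjust(0x2d, 0, 0)
 * returns 0x27, the packed-BCD encoding of 27. */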
-
-static int em_call_far(struct x86_emulate_ctxt *ctxt)
-{
- struct decode_cache *c = &ctxt->decode;
- u16 sel, old_cs;
- ulong old_eip;
- int rc;
-
- old_cs = ctxt->ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
- old_eip = c->eip;
-
- memcpy(&sel, c->src.valptr + c->op_bytes, 2);
- if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS))
- return X86EMUL_CONTINUE;
-
- c->eip = 0;
- memcpy(&c->eip, c->src.valptr, c->op_bytes);
-
- c->src.val = old_cs;
- emulate_push(ctxt, ctxt->ops);
- rc = writeback(ctxt, ctxt->ops);
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- c->src.val = old_eip;
- emulate_push(ctxt, ctxt->ops);
- rc = writeback(ctxt, ctxt->ops);
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- c->dst.type = OP_NONE;
-
- return X86EMUL_CONTINUE;
-}
-
-static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
-{
- struct decode_cache *c = &ctxt->decode;
- int rc;
-
- c->dst.type = OP_REG;
- c->dst.addr.reg = &c->eip;
- c->dst.bytes = c->op_bytes;
- rc = emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val);
- return X86EMUL_CONTINUE;
-}
-
-static int em_imul(struct x86_emulate_ctxt *ctxt)
-{
- struct decode_cache *c = &ctxt->decode;
-
- emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags);
- return X86EMUL_CONTINUE;
-}
-
-static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
-{
- struct decode_cache *c = &ctxt->decode;
-
- c->dst.val = c->src2.val;
- return em_imul(ctxt);
-}
-
-static int em_cwd(struct x86_emulate_ctxt *ctxt)
-{
- struct decode_cache *c = &ctxt->decode;
-
- c->dst.type = OP_REG;
- c->dst.bytes = c->src.bytes;
- c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
- c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1);
-
- return X86EMUL_CONTINUE;
-}
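
em_cwd fills the D register with copies of the accumulator's sign bit: ~((src >> (bytes*8 - 1)) - 1) evaluates to all-ones when the top bit of the operand is set and to zero otherwise, so 16-bit AX = 0x8000 yields DX = 0xffff while AX = 0x1234 yields DX = 0. The trick in isolation (the function name is illustrative):

#include <stdint.h>

/* Replicate the sign bit of a 'bytes'-wide value across a full word:
 * all-ones when the top bit is set, zero otherwise. The emulator then
 * truncates the result to the destination width on writeback. */
unsigned long sign_fill(unsigned long src, unsigned int bytes)
{
	return ~((src >> (bytes * 8 - 1)) - 1);
}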
-
-static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
-{
- unsigned cpl = ctxt->ops->cpl(ctxt->vcpu);
- struct decode_cache *c = &ctxt->decode;
- u64 tsc = 0;
-
- if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD))
- return emulate_gp(ctxt, 0);
- ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc);
- c->regs[VCPU_REGS_RAX] = (u32)tsc;
- c->regs[VCPU_REGS_RDX] = tsc >> 32;
- return X86EMUL_CONTINUE;
-}
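
em_rdtsc first enforces the CR4.TSD restriction (non-zero CPL with TSD set raises #GP), then splits the 64-bit counter across the two result registers: RAX gets the low 32 bits and RDX the high 32 bits. The split on its own, with illustrative names:

#include <stdint.h>

/* RDTSC result convention: low half of the counter in EAX, high half in EDX. */
void split_tsc(uint64_t tsc, uint32_t *eax, uint32_t *edx)
{
	*eax = (uint32_t)tsc;
	*edx = (uint32_t)(tsc >> 32);
}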
-
-static int em_mov(struct x86_emulate_ctxt *ctxt)
-{
- struct decode_cache *c = &ctxt->decode;
- c->dst.val = c->src.val;
- return X86EMUL_CONTINUE;
-}
-
-#define D(_y) { .flags = (_y) }
-#define N D(0)
-#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
-#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) }
-#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
-
-#define D2bv(_f) D((_f) | ByteOp), D(_f)
-#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
-
-#define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM), \
- D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \
- D2bv(((_f) & ~Lock) | DstAcc | SrcImm)
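
The macros above keep the opcode tables compact: D() records decode flags only, I() adds an ->execute callback, G()/GD() refer to a group (or a ModRM.mod-dependent pair), and D2bv/I2bv/D6ALU stamp out the byte and word/dword variants of an ALU row. As a rough hand-expansion (not actual preprocessor output, and relying on the flag definitions elsewhere in this file), D6ALU(Lock), used for the add opcodes 0x00-0x05, unfolds to six entries:

/* Hand-expanded D6ALU(Lock): r/m,reg and reg,r/m and acc,imm forms,
 * each in byte and word/dword flavours; the "& ~Lock" terms in D6ALU
 * strip Lock from every form except the memory-destination pair. */
static struct opcode add_row_expanded[] = {
	D(ByteOp | Lock | DstMem | SrcReg | ModRM), D(Lock | DstMem | SrcReg | ModRM),
	D(ByteOp | DstReg | SrcMem | ModRM),        D(DstReg | SrcMem | ModRM),
	D(ByteOp | DstAcc | SrcImm),                D(DstAcc | SrcImm),
};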
-
-
-static struct opcode group1[] = {
- X7(D(Lock)), N
-};
-
-static struct opcode group1A[] = {
- D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N,
-};
-
-static struct opcode group3[] = {
- D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM),
- D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
- X4(D(SrcMem | ModRM)),
-};
-
-static struct opcode group4[] = {
- D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock),
- N, N, N, N, N, N,
-};
-
-static struct opcode group5[] = {
- D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
- D(SrcMem | ModRM | Stack),
- I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far),
- D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps),
- D(SrcMem | ModRM | Stack), N,
-};
-
-static struct group_dual group7 = { {
- N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv),
- D(SrcNone | ModRM | DstMem | Mov), N,
- D(SrcMem16 | ModRM | Mov | Priv),
- D(SrcMem | ModRM | ByteOp | Priv | NoAccess),
-}, {
- D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv),
- D(SrcNone | ModRM | DstMem | Mov), N,
- D(SrcMem16 | ModRM | Mov | Priv), N,
-} };
-
-static struct opcode group8[] = {
- N, N, N, N,
- D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock),
- D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock),
-};
-
-static struct group_dual group9 = { {
- N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N,
-}, {
- N, N, N, N, N, N, N, N,
-} };
-
-static struct opcode group11[] = {
- I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)),
-};
-
-static struct opcode opcode_table[256] = {
- /* 0x00 - 0x07 */
- D6ALU(Lock),
- D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
- /* 0x08 - 0x0F */
- D6ALU(Lock),
- D(ImplicitOps | Stack | No64), N,
- /* 0x10 - 0x17 */
- D6ALU(Lock),
- D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
- /* 0x18 - 0x1F */
- D6ALU(Lock),
- D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
- /* 0x20 - 0x27 */
- D6ALU(Lock), N, N,
- /* 0x28 - 0x2F */
- D6ALU(Lock), N, I(ByteOp | DstAcc | No64, em_das),
- /* 0x30 - 0x37 */
- D6ALU(Lock), N, N,
- /* 0x38 - 0x3F */
- D6ALU(0), N, N,
- /* 0x40 - 0x4F */
- X16(D(DstReg)),
- /* 0x50 - 0x57 */
- X8(I(SrcReg | Stack, em_push)),
- /* 0x58 - 0x5F */
- X8(D(DstReg | Stack)),
- /* 0x60 - 0x67 */
- D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
- N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ ,
- N, N, N, N,
- /* 0x68 - 0x6F */
- I(SrcImm | Mov | Stack, em_push),
- I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
- I(SrcImmByte | Mov | Stack, em_push),
- I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
- D2bv(DstDI | Mov | String), /* insb, insw/insd */
- D2bv(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */
- /* 0x70 - 0x7F */
- X16(D(SrcImmByte)),
- /* 0x80 - 0x87 */
- G(ByteOp | DstMem | SrcImm | ModRM | Group, group1),
- G(DstMem | SrcImm | ModRM | Group, group1),
- G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
- G(DstMem | SrcImmByte | ModRM | Group, group1),
- D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock),
- /* 0x88 - 0x8F */
- I2bv(DstMem | SrcReg | ModRM | Mov, em_mov),
- I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
- D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg),
- D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
- /* 0x90 - 0x97 */
- X8(D(SrcAcc | DstReg)),
- /* 0x98 - 0x9F */
- D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
- I(SrcImmFAddr | No64, em_call_far), N,
- D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N,
- /* 0xA0 - 0xA7 */
- I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
- I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov),
- I2bv(SrcSI | DstDI | Mov | String, em_mov),
- D2bv(SrcSI | DstDI | String),
- /* 0xA8 - 0xAF */
- D2bv(DstAcc | SrcImm),
- I2bv(SrcAcc | DstDI | Mov | String, em_mov),
- I2bv(SrcSI | DstAcc | Mov | String, em_mov),
- D2bv(SrcAcc | DstDI | String),
- /* 0xB0 - 0xB7 */
- X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
- /* 0xB8 - 0xBF */
- X8(I(DstReg | SrcImm | Mov, em_mov)),
- /* 0xC0 - 0xC7 */
- D2bv(DstMem | SrcImmByte | ModRM),
- I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
- D(ImplicitOps | Stack),
- D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64),
- G(ByteOp, group11), G(0, group11),
- /* 0xC8 - 0xCF */
- N, N, N, D(ImplicitOps | Stack),
- D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps),
- /* 0xD0 - 0xD7 */
- D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
- N, N, N, N,
- /* 0xD8 - 0xDF */
- N, N, N, N, N, N, N, N,
- /* 0xE0 - 0xE7 */
- X4(D(SrcImmByte)),
- D2bv(SrcImmUByte | DstAcc), D2bv(SrcAcc | DstImmUByte),
- /* 0xE8 - 0xEF */
- D(SrcImm | Stack), D(SrcImm | ImplicitOps),
- D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps),
- D2bv(SrcNone | DstAcc), D2bv(SrcAcc | ImplicitOps),
- /* 0xF0 - 0xF7 */
- N, N, N, N,
- D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3),
- /* 0xF8 - 0xFF */
- D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps),
- D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
-};
-
-static struct opcode twobyte_table[256] = {
- /* 0x00 - 0x0F */
- N, GD(0, &group7), N, N,
- N, D(ImplicitOps), D(ImplicitOps | Priv), N,
- D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N,
- N, D(ImplicitOps | ModRM), N, N,
- /* 0x10 - 0x1F */
- N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N,
- /* 0x20 - 0x2F */
- D(ModRM | DstMem | Priv | Op3264), D(ModRM | DstMem | Priv | Op3264),
- D(ModRM | SrcMem | Priv | Op3264), D(ModRM | SrcMem | Priv | Op3264),
- N, N, N, N,
- N, N, N, N, N, N, N, N,
- /* 0x30 - 0x3F */
- D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc),
- D(ImplicitOps | Priv), N,
- D(ImplicitOps), D(ImplicitOps | Priv), N, N,
- N, N, N, N, N, N, N, N,
- /* 0x40 - 0x4F */
- X16(D(DstReg | SrcMem | ModRM | Mov)),
- /* 0x50 - 0x5F */
- N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
- /* 0x60 - 0x6F */
- N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
- /* 0x70 - 0x7F */
- N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
- /* 0x80 - 0x8F */
- X16(D(SrcImm)),
- /* 0x90 - 0x9F */
- X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
- /* 0xA0 - 0xA7 */
- D(ImplicitOps | Stack), D(ImplicitOps | Stack),
- N, D(DstMem | SrcReg | ModRM | BitOp),
- D(DstMem | SrcReg | Src2ImmByte | ModRM),
- D(DstMem | SrcReg | Src2CL | ModRM), N, N,
- /* 0xA8 - 0xAF */
- D(ImplicitOps | Stack), D(ImplicitOps | Stack),
- N, D(DstMem | SrcReg | ModRM | BitOp | Lock),
- D(DstMem | SrcReg | Src2ImmByte | ModRM),
- D(DstMem | SrcReg | Src2CL | ModRM),
- D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
- /* 0xB0 - 0xB7 */
- D2bv(DstMem | SrcReg | ModRM | Lock),
- D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock),
- D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM),
- D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
- /* 0xB8 - 0xBF */
- N, N,
- G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock),
- D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
- D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
- /* 0xC0 - 0xCF */
- D2bv(DstMem | SrcReg | ModRM | Lock),
- N, D(DstMem | SrcReg | ModRM | Mov),
- N, N, N, GD(0, &group9),
- N, N, N, N, N, N, N, N,
- /* 0xD0 - 0xDF */
- N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
- /* 0xE0 - 0xEF */
- N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
- /* 0xF0 - 0xFF */
- N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
-};
-
-#undef D
-#undef N
-#undef G
-#undef GD
-#undef I
-
-#undef D2bv
-#undef I2bv
-#undef D6ALU
-
-static unsigned imm_size(struct decode_cache *c)
-{
- unsigned size;
-
- size = (c->d & ByteOp) ? 1 : c->op_bytes;
- if (size == 8)
- size = 4;
- return size;
-}
-
-static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
- unsigned size, bool sign_extension)
-{
- struct decode_cache *c = &ctxt->decode;
- struct x86_emulate_ops *ops = ctxt->ops;
- int rc = X86EMUL_CONTINUE;
-
- op->type = OP_IMM;
- op->bytes = size;
- op->addr.mem.ea = c->eip;
- /* NB. Immediates are sign-extended as necessary. */
- switch (op->bytes) {
- case 1:
- op->val = insn_fetch(s8, 1, c->eip);
- break;
- case 2:
- op->val = insn_fetch(s16, 2, c->eip);
- break;
- case 4:
- op->val = insn_fetch(s32, 4, c->eip);
- break;
- }
- if (!sign_extension) {
- switch (op->bytes) {
- case 1:
- op->val &= 0xff;
- break;
- case 2:
- op->val &= 0xffff;
- break;
- case 4:
- op->val &= 0xffffffff;
- break;
- }
- }
-done:
- return rc;
-}
-
int
-x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
+x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
{
- struct x86_emulate_ops *ops = ctxt->ops;
+ unsigned long memop = 0;
+ u64 msr_data;
+ unsigned long saved_eip = 0;
struct decode_cache *c = &ctxt->decode;
- int rc = X86EMUL_CONTINUE;
- int mode = ctxt->mode;
- int def_op_bytes, def_ad_bytes, dual, goffset;
- struct opcode opcode, *g_mod012, *g_mod3;
- struct operand memop = { .type = OP_NONE };
-
- c->eip = ctxt->eip;
- c->fetch.start = c->eip;
- c->fetch.end = c->fetch.start + insn_len;
- if (insn_len > 0)
- memcpy(c->fetch.data, insn, insn_len);
- ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
-
- switch (mode) {
- case X86EMUL_MODE_REAL:
- case X86EMUL_MODE_VM86:
- case X86EMUL_MODE_PROT16:
- def_op_bytes = def_ad_bytes = 2;
- break;
- case X86EMUL_MODE_PROT32:
- def_op_bytes = def_ad_bytes = 4;
- break;
-#ifdef CONFIG_X86_64
- case X86EMUL_MODE_PROT64:
- def_op_bytes = 4;
- def_ad_bytes = 8;
- break;
-#endif
- default:
- return -1;
- }
-
- c->op_bytes = def_op_bytes;
- c->ad_bytes = def_ad_bytes;
-
- /* Legacy prefixes. */
- for (;;) {
- switch (c->b = insn_fetch(u8, 1, c->eip)) {
- case 0x66: /* operand-size override */
- /* switch between 2/4 bytes */
- c->op_bytes = def_op_bytes ^ 6;
- break;
- case 0x67: /* address-size override */
- if (mode == X86EMUL_MODE_PROT64)
- /* switch between 4/8 bytes */
- c->ad_bytes = def_ad_bytes ^ 12;
- else
- /* switch between 2/4 bytes */
- c->ad_bytes = def_ad_bytes ^ 6;
- break;
- case 0x26: /* ES override */
- case 0x2e: /* CS override */
- case 0x36: /* SS override */
- case 0x3e: /* DS override */
- set_seg_override(c, (c->b >> 3) & 3);
- break;
- case 0x64: /* FS override */
- case 0x65: /* GS override */
- set_seg_override(c, c->b & 7);
- break;
- case 0x40 ... 0x4f: /* REX */
- if (mode != X86EMUL_MODE_PROT64)
- goto done_prefixes;
- c->rex_prefix = c->b;
- continue;
- case 0xf0: /* LOCK */
- c->lock_prefix = 1;
- break;
- case 0xf2: /* REPNE/REPNZ */
- c->rep_prefix = REPNE_PREFIX;
- break;
- case 0xf3: /* REP/REPE/REPZ */
- c->rep_prefix = REPE_PREFIX;
- break;
- default:
- goto done_prefixes;
- }
-
- /* Any legacy prefix after a REX prefix nullifies its effect. */
-
- c->rex_prefix = 0;
- }
-
-done_prefixes:
-
- /* REX prefix. */
- if (c->rex_prefix & 8)
- c->op_bytes = 8; /* REX.W */
-
- /* Opcode byte(s). */
- opcode = opcode_table[c->b];
- /* Two-byte opcode? */
- if (c->b == 0x0f) {
- c->twobyte = 1;
- c->b = insn_fetch(u8, 1, c->eip);
- opcode = twobyte_table[c->b];
- }
- c->d = opcode.flags;
-
- if (c->d & Group) {
- dual = c->d & GroupDual;
- c->modrm = insn_fetch(u8, 1, c->eip);
- --c->eip;
-
- if (c->d & GroupDual) {
- g_mod012 = opcode.u.gdual->mod012;
- g_mod3 = opcode.u.gdual->mod3;
- } else
- g_mod012 = g_mod3 = opcode.u.group;
-
- c->d &= ~(Group | GroupDual);
-
- goffset = (c->modrm >> 3) & 7;
-
- if ((c->modrm >> 6) == 3)
- opcode = g_mod3[goffset];
- else
- opcode = g_mod012[goffset];
- c->d |= opcode.flags;
- }
-
- c->execute = opcode.u.execute;
-
- /* Unrecognised? */
- if (c->d == 0 || (c->d & Undefined))
- return -1;
-
- if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
- c->op_bytes = 8;
-
- if (c->d & Op3264) {
- if (mode == X86EMUL_MODE_PROT64)
- c->op_bytes = 8;
- else
- c->op_bytes = 4;
- }
-
- /* ModRM and SIB bytes. */
- if (c->d & ModRM) {
- rc = decode_modrm(ctxt, ops, &memop);
- if (!c->has_seg_override)
- set_seg_override(c, c->modrm_seg);
- } else if (c->d & MemAbs)
- rc = decode_abs(ctxt, ops, &memop);
- if (rc != X86EMUL_CONTINUE)
- goto done;
-
- if (!c->has_seg_override)
- set_seg_override(c, VCPU_SREG_DS);
-
- memop.addr.mem.seg = seg_override(ctxt, ops, c);
-
- if (memop.type == OP_MEM && c->ad_bytes != 8)
- memop.addr.mem.ea = (u32)memop.addr.mem.ea;
-
- if (memop.type == OP_MEM && c->rip_relative)
- memop.addr.mem.ea += c->eip;
-
- /*
- * Decode and fetch the source operand: register, memory
- * or immediate.
- */
- switch (c->d & SrcMask) {
- case SrcNone:
- break;
- case SrcReg:
- decode_register_operand(&c->src, c, 0);
- break;
- case SrcMem16:
- memop.bytes = 2;
- goto srcmem_common;
- case SrcMem32:
- memop.bytes = 4;
- goto srcmem_common;
- case SrcMem:
- memop.bytes = (c->d & ByteOp) ? 1 :
- c->op_bytes;
- srcmem_common:
- c->src = memop;
- break;
- case SrcImmU16:
- rc = decode_imm(ctxt, &c->src, 2, false);
- break;
- case SrcImm:
- rc = decode_imm(ctxt, &c->src, imm_size(c), true);
- break;
- case SrcImmU:
- rc = decode_imm(ctxt, &c->src, imm_size(c), false);
- break;
- case SrcImmByte:
- rc = decode_imm(ctxt, &c->src, 1, true);
- break;
- case SrcImmUByte:
- rc = decode_imm(ctxt, &c->src, 1, false);
- break;
- case SrcAcc:
- c->src.type = OP_REG;
- c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->src.addr.reg = &c->regs[VCPU_REGS_RAX];
- fetch_register_operand(&c->src);
- break;
- case SrcOne:
- c->src.bytes = 1;
- c->src.val = 1;
- break;
- case SrcSI:
- c->src.type = OP_MEM;
- c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->src.addr.mem.ea =
- register_address(c, c->regs[VCPU_REGS_RSI]);
- c->src.addr.mem.seg = seg_override(ctxt, ops, c),
- c->src.val = 0;
- break;
- case SrcImmFAddr:
- c->src.type = OP_IMM;
- c->src.addr.mem.ea = c->eip;
- c->src.bytes = c->op_bytes + 2;
- insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
- break;
- case SrcMemFAddr:
- memop.bytes = c->op_bytes + 2;
- goto srcmem_common;
- break;
- }
+ unsigned int port;
+ int io_dir_in;
+ int rc = 0;
- if (rc != X86EMUL_CONTINUE)
- goto done;
+ ctxt->interruptibility = 0;
- /*
- * Decode and fetch the second source operand: register, memory
- * or immediate.
+ /* Shadow copy of register state. Committed on successful emulation.
+ * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
+ * modify them.
*/
- switch (c->d & Src2Mask) {
- case Src2None:
- break;
- case Src2CL:
- c->src2.bytes = 1;
- c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
- break;
- case Src2ImmByte:
- rc = decode_imm(ctxt, &c->src2, 1, true);
- break;
- case Src2One:
- c->src2.bytes = 1;
- c->src2.val = 1;
- break;
- case Src2Imm:
- rc = decode_imm(ctxt, &c->src2, imm_size(c), true);
- break;
- }
- if (rc != X86EMUL_CONTINUE)
- goto done;
-
- /* Decode and fetch the destination operand: register or memory. */
- switch (c->d & DstMask) {
- case DstReg:
- decode_register_operand(&c->dst, c,
- c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
- break;
- case DstImmUByte:
- c->dst.type = OP_IMM;
- c->dst.addr.mem.ea = c->eip;
- c->dst.bytes = 1;
- c->dst.val = insn_fetch(u8, 1, c->eip);
- break;
- case DstMem:
- case DstMem64:
- c->dst = memop;
- if ((c->d & DstMask) == DstMem64)
- c->dst.bytes = 8;
- else
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- if (c->d & BitOp)
- fetch_bit_operand(c);
- c->dst.orig_val = c->dst.val;
- break;
- case DstAcc:
- c->dst.type = OP_REG;
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.addr.reg = &c->regs[VCPU_REGS_RAX];
- fetch_register_operand(&c->dst);
- c->dst.orig_val = c->dst.val;
- break;
- case DstDI:
- c->dst.type = OP_MEM;
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.addr.mem.ea =
- register_address(c, c->regs[VCPU_REGS_RDI]);
- c->dst.addr.mem.seg = VCPU_SREG_ES;
- c->dst.val = 0;
- break;
- case ImplicitOps:
- /* Special instructions do their own operand decoding. */
- default:
- c->dst.type = OP_NONE; /* Disable writeback. */
- return 0;
- }
-
-done:
- return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
-}
-
-static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
-{
- struct decode_cache *c = &ctxt->decode;
-
- /* The second termination condition only applies for REPE
-	 * and REPNE. Test whether the repeat string operation prefix is
-	 * REPE/REPZ or REPNE/REPNZ and, if so, check the corresponding
-	 * termination condition:
- * - if REPE/REPZ and ZF = 0 then done
- * - if REPNE/REPNZ and ZF = 1 then done
- */
- if (((c->b == 0xa6) || (c->b == 0xa7) ||
- (c->b == 0xae) || (c->b == 0xaf))
- && (((c->rep_prefix == REPE_PREFIX) &&
- ((ctxt->eflags & EFLG_ZF) == 0))
- || ((c->rep_prefix == REPNE_PREFIX) &&
- ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))))
- return true;
-
- return false;
-}
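
string_insn_completed implements the second REP termination rule: only CMPS (0xa6/0xa7) and SCAS (0xae/0xaf) consult ZF, with REPE/REPZ stopping when ZF is clear and REPNE/REPNZ stopping when ZF is set. A standalone restatement of the predicate; the prefix constants and ZF bit position are stand-ins for the emulator's own definitions:

#include <stdbool.h>
#include <stdint.h>

enum { REP_E = 1, REP_NE = 2 };	/* stand-ins for REPE_PREFIX/REPNE_PREFIX */
#define ZF_BIT (1u << 6)	/* EFLAGS.ZF */

/* True when a REPE/REPNE CMPS or SCAS iteration has hit the ZF-based
 * termination condition described in the comment above. */
bool rep_zf_terminated(uint8_t opcode, int rep_prefix, unsigned long eflags)
{
	bool zf = (eflags & ZF_BIT) != 0;

	if (opcode != 0xa6 && opcode != 0xa7 &&
	    opcode != 0xae && opcode != 0xaf)
		return false;
	return (rep_prefix == REP_E && !zf) || (rep_prefix == REP_NE && zf);
}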
-
-int
-x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
-{
- struct x86_emulate_ops *ops = ctxt->ops;
- u64 msr_data;
- struct decode_cache *c = &ctxt->decode;
- int rc = X86EMUL_CONTINUE;
- int saved_dst_type = c->dst.type;
- int irq; /* Used for int 3, int, and into */
-
- ctxt->decode.mem_read.pos = 0;
-
- if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
- rc = emulate_ud(ctxt);
- goto done;
- }
+ memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+ saved_eip = c->eip;
/* LOCK prefix is allowed only with some instructions */
- if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
- rc = emulate_ud(ctxt);
- goto done;
- }
-
- if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
- rc = emulate_ud(ctxt);
+ if (c->lock_prefix && !(c->d & Lock)) {
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
goto done;
}
/* Privileged instruction can be executed only in CPL=0 */
- if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
- rc = emulate_gp(ctxt, 0);
+ if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
goto done;
}
+ if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
+ memop = c->modrm_ea;
+
if (c->rep_prefix && (c->d & String)) {
/* All REP prefixes have the same first termination condition */
- if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
- ctxt->eip = c->eip;
+ if (c->regs[VCPU_REGS_RCX] == 0) {
+ kvm_rip_write(ctxt->vcpu, c->eip);
goto done;
}
+ /* The second termination condition only applies for REPE
+		 * and REPNE. Test whether the repeat string operation prefix is
+		 * REPE/REPZ or REPNE/REPNZ and, if so, check the corresponding
+		 * termination condition:
+ * - if REPE/REPZ and ZF = 0 then done
+ * - if REPNE/REPNZ and ZF = 1 then done
+ */
+ if ((c->b == 0xa6) || (c->b == 0xa7) ||
+ (c->b == 0xae) || (c->b == 0xaf)) {
+ if ((c->rep_prefix == REPE_PREFIX) &&
+ ((ctxt->eflags & EFLG_ZF) == 0)) {
+ kvm_rip_write(ctxt->vcpu, c->eip);
+ goto done;
+ }
+ if ((c->rep_prefix == REPNE_PREFIX) &&
+ ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
+ kvm_rip_write(ctxt->vcpu, c->eip);
+ goto done;
+ }
+ }
+ c->regs[VCPU_REGS_RCX]--;
+ c->eip = kvm_rip_read(ctxt->vcpu);
}
- if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
- rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem),
- c->src.valptr, c->src.bytes);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- c->src.orig_val64 = c->src.val64;
- }
-
- if (c->src2.type == OP_MEM) {
- rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem),
- &c->src2.val, c->src2.bytes);
+ if (c->src.type == OP_MEM) {
+ c->src.ptr = (unsigned long *)memop;
+ c->src.val = 0;
+ rc = ops->read_emulated((unsigned long)c->src.ptr,
+ &c->src.val,
+ c->src.bytes,
+ ctxt->vcpu);
if (rc != X86EMUL_CONTINUE)
goto done;
+ c->src.orig_val = c->src.val;
}
if ((c->d & DstMask) == ImplicitOps)
goto special_insn;
- if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
- /* optimisation - avoid slow emulated read if Mov */
- rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem),
- &c->dst.val, c->dst.bytes);
- if (rc != X86EMUL_CONTINUE)
- goto done;
+ if (c->dst.type == OP_MEM) {
+ c->dst.ptr = (unsigned long *)memop;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.val = 0;
+ if (c->d & BitOp) {
+ unsigned long mask = ~(c->dst.bytes * 8 - 1);
+
+ c->dst.ptr = (void *)c->dst.ptr +
+ (c->src.val & mask) / 8;
+ }
+ if (!(c->d & Mov)) {
+ /* optimisation - avoid slow emulated read */
+ rc = ops->read_emulated((unsigned long)c->dst.ptr,
+ &c->dst.val,
+ c->dst.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
}
c->dst.orig_val = c->dst.val;
special_insn:
- if (c->execute) {
- rc = c->execute(ctxt);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- goto writeback;
- }
-
if (c->twobyte)
goto twobyte_insn;
@@ -3067,37 +1962,43 @@ special_insn:
emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
break;
case 0x06: /* push es */
- emulate_push_sreg(ctxt, ops, VCPU_SREG_ES);
+ emulate_push_sreg(ctxt, VCPU_SREG_ES);
break;
case 0x07: /* pop es */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
+ if (rc != 0)
+ goto done;
break;
case 0x08 ... 0x0d:
or: /* or */
emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
break;
case 0x0e: /* push cs */
- emulate_push_sreg(ctxt, ops, VCPU_SREG_CS);
+ emulate_push_sreg(ctxt, VCPU_SREG_CS);
break;
case 0x10 ... 0x15:
adc: /* adc */
emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
break;
case 0x16: /* push ss */
- emulate_push_sreg(ctxt, ops, VCPU_SREG_SS);
+ emulate_push_sreg(ctxt, VCPU_SREG_SS);
break;
case 0x17: /* pop ss */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
+ if (rc != 0)
+ goto done;
break;
case 0x18 ... 0x1d:
sbb: /* sbb */
emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
break;
case 0x1e: /* push ds */
- emulate_push_sreg(ctxt, ops, VCPU_SREG_DS);
+ emulate_push_sreg(ctxt, VCPU_SREG_DS);
break;
case 0x1f: /* pop ds */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
+ if (rc != 0)
+ goto done;
break;
case 0x20 ... 0x25:
and: /* and */
@@ -3121,30 +2022,75 @@ special_insn:
case 0x48 ... 0x4f: /* dec r16/r32 */
emulate_1op("dec", c->dst, ctxt->eflags);
break;
+ case 0x50 ... 0x57: /* push reg */
+ emulate_push(ctxt);
+ break;
case 0x58 ... 0x5f: /* pop reg */
pop_instruction:
rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
+ if (rc != 0)
+ goto done;
break;
case 0x60: /* pusha */
- rc = emulate_pusha(ctxt, ops);
+ emulate_pusha(ctxt);
break;
case 0x61: /* popa */
rc = emulate_popa(ctxt, ops);
+ if (rc != 0)
+ goto done;
break;
case 0x63: /* movsxd */
if (ctxt->mode != X86EMUL_MODE_PROT64)
goto cannot_emulate;
c->dst.val = (s32) c->src.val;
break;
+ case 0x68: /* push imm */
+ case 0x6a: /* push imm8 */
+ emulate_push(ctxt);
+ break;
case 0x6c: /* insb */
case 0x6d: /* insw/insd */
- c->src.val = c->regs[VCPU_REGS_RDX];
- goto do_io_in;
+ if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
+ (c->d & ByteOp) ? 1 : c->op_bytes)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ goto done;
+ }
+ if (kvm_emulate_pio_string(ctxt->vcpu,
+ 1,
+ (c->d & ByteOp) ? 1 : c->op_bytes,
+ c->rep_prefix ?
+ address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
+ (ctxt->eflags & EFLG_DF),
+ register_address(c, es_base(ctxt),
+ c->regs[VCPU_REGS_RDI]),
+ c->rep_prefix,
+ c->regs[VCPU_REGS_RDX]) == 0) {
+ c->eip = saved_eip;
+ return -1;
+ }
+ return 0;
case 0x6e: /* outsb */
case 0x6f: /* outsw/outsd */
- c->dst.val = c->regs[VCPU_REGS_RDX];
- goto do_io_out;
- break;
+ if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
+ (c->d & ByteOp) ? 1 : c->op_bytes)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ goto done;
+ }
+ if (kvm_emulate_pio_string(ctxt->vcpu,
+ 0,
+ (c->d & ByteOp) ? 1 : c->op_bytes,
+ c->rep_prefix ?
+ address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
+ (ctxt->eflags & EFLG_DF),
+ register_address(c,
+ seg_override_base(ctxt, c),
+ c->regs[VCPU_REGS_RSI]),
+ c->rep_prefix,
+ c->regs[VCPU_REGS_RDX]) == 0) {
+ c->eip = saved_eip;
+ return -1;
+ }
+ return 0;
case 0x70 ... 0x7f: /* jcc (short) */
if (test_cc(c->b, ctxt->eflags))
jmp_rel(c, c->src.val);
@@ -3170,30 +2116,49 @@ special_insn:
}
break;
case 0x84 ... 0x85:
- test:
emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
break;
case 0x86 ... 0x87: /* xchg */
xchg:
/* Write back the register source. */
- c->src.val = c->dst.val;
- write_register_operand(&c->src);
+ switch (c->dst.bytes) {
+ case 1:
+ *(u8 *) c->src.ptr = (u8) c->dst.val;
+ break;
+ case 2:
+ *(u16 *) c->src.ptr = (u16) c->dst.val;
+ break;
+ case 4:
+ *c->src.ptr = (u32) c->dst.val;
+ break; /* 64b reg: zero-extend */
+ case 8:
+ *c->src.ptr = c->dst.val;
+ break;
+ }
/*
* Write back the memory destination with implicit LOCK
* prefix.
*/
- c->dst.val = c->src.orig_val;
+ c->dst.val = c->src.val;
c->lock_prefix = 1;
break;
- case 0x8c: /* mov r/m, sreg */
- if (c->modrm_reg > VCPU_SREG_GS) {
- rc = emulate_ud(ctxt);
- goto done;
+ case 0x88 ... 0x8b: /* mov */
+ goto mov;
+ case 0x8c: { /* mov r/m, sreg */
+ struct kvm_segment segreg;
+
+ if (c->modrm_reg <= 5)
+ kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
+ else {
+ printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n",
+ c->modrm);
+ goto cannot_emulate;
}
- c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
+ c->dst.val = segreg.selector;
break;
+ }
case 0x8d: /* lea r16/r32, m */
- c->dst.val = c->src.addr.mem.ea;
+ c->dst.val = c->modrm_ea;
break;
case 0x8e: { /* mov seg, r/m16 */
uint16_t sel;
@@ -3202,127 +2167,193 @@ special_insn:
if (c->modrm_reg == VCPU_SREG_CS ||
c->modrm_reg > VCPU_SREG_GS) {
- rc = emulate_ud(ctxt);
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
goto done;
}
if (c->modrm_reg == VCPU_SREG_SS)
- ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+ toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
- rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
+ rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg);
c->dst.type = OP_NONE; /* Disable writeback. */
break;
}
case 0x8f: /* pop (sole member of Grp1a) */
rc = emulate_grp1a(ctxt, ops);
+ if (rc != 0)
+ goto done;
break;
- case 0x90 ... 0x97: /* nop / xchg reg, rax */
- if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX])
+ case 0x90: /* nop / xchg r8,rax */
+ if (!(c->rex_prefix & 1)) { /* nop */
+ c->dst.type = OP_NONE;
break;
- goto xchg;
- case 0x98: /* cbw/cwde/cdqe */
- switch (c->op_bytes) {
- case 2: c->dst.val = (s8)c->dst.val; break;
- case 4: c->dst.val = (s16)c->dst.val; break;
- case 8: c->dst.val = (s32)c->dst.val; break;
}
- break;
+ case 0x91 ... 0x97: /* xchg reg,rax */
+ c->src.type = c->dst.type = OP_REG;
+ c->src.bytes = c->dst.bytes = c->op_bytes;
+ c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
+ c->src.val = *(c->src.ptr);
+ goto xchg;
case 0x9c: /* pushf */
c->src.val = (unsigned long) ctxt->eflags;
- emulate_push(ctxt, ops);
+ emulate_push(ctxt);
break;
case 0x9d: /* popf */
c->dst.type = OP_REG;
- c->dst.addr.reg = &ctxt->eflags;
+ c->dst.ptr = (unsigned long *) &ctxt->eflags;
c->dst.bytes = c->op_bytes;
rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ break;
+ case 0xa0 ... 0xa1: /* mov */
+ c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+ c->dst.val = c->src.val;
+ break;
+ case 0xa2 ... 0xa3: /* mov */
+ c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
+ break;
+ case 0xa4 ... 0xa5: /* movs */
+ c->dst.type = OP_MEM;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.ptr = (unsigned long *)register_address(c,
+ es_base(ctxt),
+ c->regs[VCPU_REGS_RDI]);
+ rc = ops->read_emulated(register_address(c,
+ seg_override_base(ctxt, c),
+ c->regs[VCPU_REGS_RSI]),
+ &c->dst.val,
+ c->dst.bytes, ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ register_address_increment(c, &c->regs[VCPU_REGS_RSI],
+ (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+ : c->dst.bytes);
+ register_address_increment(c, &c->regs[VCPU_REGS_RDI],
+ (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+ : c->dst.bytes);
break;
case 0xa6 ... 0xa7: /* cmps */
+ c->src.type = OP_NONE; /* Disable writeback. */
+ c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->src.ptr = (unsigned long *)register_address(c,
+ seg_override_base(ctxt, c),
+ c->regs[VCPU_REGS_RSI]);
+ rc = ops->read_emulated((unsigned long)c->src.ptr,
+ &c->src.val,
+ c->src.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
c->dst.type = OP_NONE; /* Disable writeback. */
- goto cmp;
- case 0xa8 ... 0xa9: /* test ax, imm */
- goto test;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.ptr = (unsigned long *)register_address(c,
+ es_base(ctxt),
+ c->regs[VCPU_REGS_RDI]);
+ rc = ops->read_emulated((unsigned long)c->dst.ptr,
+ &c->dst.val,
+ c->dst.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
+
+ emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+
+ register_address_increment(c, &c->regs[VCPU_REGS_RSI],
+ (ctxt->eflags & EFLG_DF) ? -c->src.bytes
+ : c->src.bytes);
+ register_address_increment(c, &c->regs[VCPU_REGS_RDI],
+ (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+ : c->dst.bytes);
+
+ break;
+ case 0xaa ... 0xab: /* stos */
+ c->dst.type = OP_MEM;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.ptr = (unsigned long *)register_address(c,
+ es_base(ctxt),
+ c->regs[VCPU_REGS_RDI]);
+ c->dst.val = c->regs[VCPU_REGS_RAX];
+ register_address_increment(c, &c->regs[VCPU_REGS_RDI],
+ (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+ : c->dst.bytes);
+ break;
+ case 0xac ... 0xad: /* lods */
+ c->dst.type = OP_REG;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+ rc = ops->read_emulated(register_address(c,
+ seg_override_base(ctxt, c),
+ c->regs[VCPU_REGS_RSI]),
+ &c->dst.val,
+ c->dst.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ register_address_increment(c, &c->regs[VCPU_REGS_RSI],
+ (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+ : c->dst.bytes);
+ break;
case 0xae ... 0xaf: /* scas */
- goto cmp;
+ DPRINTF("Urk! I don't handle SCAS.\n");
+ goto cannot_emulate;
+ case 0xb0 ... 0xbf: /* mov r, imm */
+ goto mov;
case 0xc0 ... 0xc1:
emulate_grp2(ctxt);
break;
case 0xc3: /* ret */
c->dst.type = OP_REG;
- c->dst.addr.reg = &c->eip;
+ c->dst.ptr = &c->eip;
c->dst.bytes = c->op_bytes;
goto pop_instruction;
- case 0xc4: /* les */
- rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES);
- break;
- case 0xc5: /* lds */
- rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS);
+ case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
+ mov:
+ c->dst.val = c->src.val;
break;
case 0xcb: /* ret far */
rc = emulate_ret_far(ctxt, ops);
- break;
- case 0xcc: /* int3 */
- irq = 3;
- goto do_interrupt;
- case 0xcd: /* int n */
- irq = c->src.val;
- do_interrupt:
- rc = emulate_int(ctxt, ops, irq);
- break;
- case 0xce: /* into */
- if (ctxt->eflags & EFLG_OF) {
- irq = 4;
- goto do_interrupt;
- }
- break;
- case 0xcf: /* iret */
- rc = emulate_iret(ctxt, ops);
+ if (rc)
+ goto done;
break;
case 0xd0 ... 0xd1: /* Grp2 */
+ c->src.val = 1;
emulate_grp2(ctxt);
break;
case 0xd2 ... 0xd3: /* Grp2 */
c->src.val = c->regs[VCPU_REGS_RCX];
emulate_grp2(ctxt);
break;
- case 0xe0 ... 0xe2: /* loop/loopz/loopnz */
- register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
- if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 &&
- (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags)))
- jmp_rel(c, c->src.val);
- break;
- case 0xe3: /* jcxz/jecxz/jrcxz */
- if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0)
- jmp_rel(c, c->src.val);
- break;
case 0xe4: /* inb */
case 0xe5: /* in */
- goto do_io_in;
+ port = c->src.val;
+ io_dir_in = 1;
+ goto do_io;
case 0xe6: /* outb */
case 0xe7: /* out */
- goto do_io_out;
+ port = c->src.val;
+ io_dir_in = 0;
+ goto do_io;
case 0xe8: /* call (near) */ {
long int rel = c->src.val;
c->src.val = (unsigned long) c->eip;
jmp_rel(c, rel);
- emulate_push(ctxt, ops);
+ emulate_push(ctxt);
break;
}
case 0xe9: /* jmp rel */
goto jmp;
- case 0xea: { /* jmp far */
- unsigned short sel;
- jump_far:
- memcpy(&sel, c->src.valptr + c->op_bytes, 2);
-
- if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS))
+ case 0xea: /* jmp far */
+ if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val,
+ VCPU_SREG_CS))
goto done;
- c->eip = 0;
- memcpy(&c->eip, c->src.valptr, c->op_bytes);
+ c->eip = c->src.val;
break;
- }
case 0xeb:
jmp: /* jmp rel short */
jmp_rel(c, c->src.val);
@@ -3330,30 +2361,25 @@ special_insn:
break;
case 0xec: /* in al,dx */
case 0xed: /* in (e/r)ax,dx */
- c->src.val = c->regs[VCPU_REGS_RDX];
- do_io_in:
- c->dst.bytes = min(c->dst.bytes, 4u);
- if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
- rc = emulate_gp(ctxt, 0);
+ port = c->regs[VCPU_REGS_RDX];
+ io_dir_in = 1;
+ goto do_io;
+ case 0xee: /* out al,dx */
+ case 0xef: /* out (e/r)ax,dx */
+ port = c->regs[VCPU_REGS_RDX];
+ io_dir_in = 0;
+ do_io:
+ if (!emulator_io_permited(ctxt, ops, port,
+ (c->d & ByteOp) ? 1 : c->op_bytes)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
goto done;
}
- if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
- &c->dst.val))
- goto done; /* IO is needed */
- break;
- case 0xee: /* out dx,al */
- case 0xef: /* out dx,(e/r)ax */
- c->dst.val = c->regs[VCPU_REGS_RDX];
- do_io_out:
- c->src.bytes = min(c->src.bytes, 4u);
- if (!emulator_io_permited(ctxt, ops, c->dst.val,
- c->src.bytes)) {
- rc = emulate_gp(ctxt, 0);
- goto done;
+ if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,
+ (c->d & ByteOp) ? 1 : c->op_bytes,
+ port) != 0) {
+ c->eip = saved_eip;
+ goto cannot_emulate;
}
- ops->pio_out_emulated(c->src.bytes, c->dst.val,
- &c->src.val, 1, ctxt->vcpu);
- c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xf4: /* hlt */
ctxt->vcpu->arch.halt_request = 1;
@@ -3361,101 +2387,64 @@ special_insn:
case 0xf5: /* cmc */
/* complement carry flag from eflags reg */
ctxt->eflags ^= EFLG_CF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xf6 ... 0xf7: /* Grp3 */
rc = emulate_grp3(ctxt, ops);
+ if (rc != 0)
+ goto done;
break;
case 0xf8: /* clc */
ctxt->eflags &= ~EFLG_CF;
- break;
- case 0xf9: /* stc */
- ctxt->eflags |= EFLG_CF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xfa: /* cli */
- if (emulator_bad_iopl(ctxt, ops)) {
- rc = emulate_gp(ctxt, 0);
- goto done;
- } else
+ if (emulator_bad_iopl(ctxt))
+ kvm_inject_gp(ctxt->vcpu, 0);
+ else {
ctxt->eflags &= ~X86_EFLAGS_IF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ }
break;
case 0xfb: /* sti */
- if (emulator_bad_iopl(ctxt, ops)) {
- rc = emulate_gp(ctxt, 0);
- goto done;
- } else {
- ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
+ if (emulator_bad_iopl(ctxt))
+ kvm_inject_gp(ctxt->vcpu, 0);
+ else {
+ toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
ctxt->eflags |= X86_EFLAGS_IF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
}
break;
case 0xfc: /* cld */
ctxt->eflags &= ~EFLG_DF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xfd: /* std */
ctxt->eflags |= EFLG_DF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
break;
- case 0xfe: /* Grp4 */
- grp45:
+ case 0xfe ... 0xff: /* Grp4/Grp5 */
rc = emulate_grp45(ctxt, ops);
+ if (rc != 0)
+ goto done;
break;
- case 0xff: /* Grp5 */
- if (c->modrm_reg == 5)
- goto jump_far;
- goto grp45;
- default:
- goto cannot_emulate;
}
- if (rc != X86EMUL_CONTINUE)
- goto done;
-
writeback:
rc = writeback(ctxt, ops);
- if (rc != X86EMUL_CONTINUE)
+ if (rc != 0)
goto done;
- /*
- * restore dst type in case the decoding will be reused
-	 * (happens for string instructions)
- */
- c->dst.type = saved_dst_type;
-
- if ((c->d & SrcMask) == SrcSI)
- string_addr_inc(ctxt, seg_override(ctxt, ops, c),
- VCPU_REGS_RSI, &c->src);
-
- if ((c->d & DstMask) == DstDI)
- string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
- &c->dst);
-
- if (c->rep_prefix && (c->d & String)) {
- struct read_cache *r = &ctxt->decode.io_read;
- register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
-
- if (!string_insn_completed(ctxt)) {
- /*
- * Re-enter guest when pio read ahead buffer is empty
-			 * or, if it is not used, after every 1024 iterations.
- */
- if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) &&
- (r->end == 0 || r->end != r->pos)) {
- /*
- * Reset read cache. Usually happens before
- * decode, but since instruction is restarted
- * we have to do it here.
- */
- ctxt->decode.mem_read.end = 0;
- return EMULATION_RESTART;
- }
- goto done; /* skip rip writeback */
- }
- }
-
- ctxt->eip = c->eip;
+ /* Commit shadow register state. */
+ memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
+ kvm_rip_write(ctxt->vcpu, c->eip);
done:
- if (rc == X86EMUL_PROPAGATE_FAULT)
- ctxt->have_exception = true;
- return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
+ if (rc == X86EMUL_UNHANDLEABLE) {
+ c->eip = saved_eip;
+ return -1;
+ }
+ return 0;
twobyte_insn:
switch (c->b) {
@@ -3469,18 +2458,18 @@ twobyte_insn:
goto cannot_emulate;
rc = kvm_fix_hypercall(ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
+ if (rc)
goto done;
/* Let the processor re-execute the fixed hypercall */
- c->eip = ctxt->eip;
+ c->eip = kvm_rip_read(ctxt->vcpu);
/* Disable writeback. */
c->dst.type = OP_NONE;
break;
case 2: /* lgdt */
- rc = read_descriptor(ctxt, ops, c->src.addr.mem,
+ rc = read_descriptor(ctxt, ops, c->src.ptr,
&size, &address, c->op_bytes);
- if (rc != X86EMUL_CONTINUE)
+ if (rc)
goto done;
realmode_lgdt(ctxt->vcpu, size, address);
/* Disable writeback. */
@@ -3491,15 +2480,17 @@ twobyte_insn:
switch (c->modrm_rm) {
case 1:
rc = kvm_fix_hypercall(ctxt->vcpu);
+ if (rc)
+ goto done;
break;
default:
goto cannot_emulate;
}
} else {
- rc = read_descriptor(ctxt, ops, c->src.addr.mem,
+ rc = read_descriptor(ctxt, ops, c->src.ptr,
&size, &address,
c->op_bytes);
- if (rc != X86EMUL_CONTINUE)
+ if (rc)
goto done;
realmode_lidt(ctxt->vcpu, size, address);
}
@@ -3508,20 +2499,15 @@ twobyte_insn:
break;
case 4: /* smsw */
c->dst.bytes = 2;
- c->dst.val = ops->get_cr(0, ctxt->vcpu);
+ c->dst.val = realmode_get_cr(ctxt->vcpu, 0);
break;
case 6: /* lmsw */
- ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0eul) |
- (c->src.val & 0x0f), ctxt->vcpu);
+ realmode_lmsw(ctxt->vcpu, (u16)c->src.val,
+ &ctxt->eflags);
c->dst.type = OP_NONE;
break;
- case 5: /* not defined */
- emulate_ud(ctxt);
- rc = X86EMUL_PROPAGATE_FAULT;
- goto done;
case 7: /* invlpg*/
- emulate_invlpg(ctxt->vcpu,
- linear(ctxt, c->src.addr.mem));
+ emulate_invlpg(ctxt->vcpu, memop);
/* Disable writeback. */
c->dst.type = OP_NONE;
break;
@@ -3530,93 +2516,91 @@ twobyte_insn:
}
break;
case 0x05: /* syscall */
- rc = emulate_syscall(ctxt, ops);
+ rc = emulate_syscall(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ else
+ goto writeback;
break;
case 0x06:
emulate_clts(ctxt->vcpu);
- break;
- case 0x09: /* wbinvd */
- kvm_emulate_wbinvd(ctxt->vcpu);
+ c->dst.type = OP_NONE;
break;
case 0x08: /* invd */
+ case 0x09: /* wbinvd */
case 0x0d: /* GrpP (prefetch) */
case 0x18: /* Grp16 (prefetch/nop) */
+ c->dst.type = OP_NONE;
break;
case 0x20: /* mov cr, reg */
- switch (c->modrm_reg) {
- case 1:
- case 5 ... 7:
- case 9 ... 15:
- emulate_ud(ctxt);
- rc = X86EMUL_PROPAGATE_FAULT;
- goto done;
- }
- c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu);
+ if (c->modrm_mod != 3)
+ goto cannot_emulate;
+ c->regs[c->modrm_rm] =
+ realmode_get_cr(ctxt->vcpu, c->modrm_reg);
+ c->dst.type = OP_NONE; /* no writeback */
break;
case 0x21: /* mov from dr to reg */
- if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
- (c->modrm_reg == 4 || c->modrm_reg == 5)) {
- emulate_ud(ctxt);
- rc = X86EMUL_PROPAGATE_FAULT;
- goto done;
- }
- ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu);
+ if (c->modrm_mod != 3)
+ goto cannot_emulate;
+ rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
+ if (rc)
+ goto cannot_emulate;
+ c->dst.type = OP_NONE; /* no writeback */
break;
case 0x22: /* mov reg, cr */
- if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) {
- emulate_gp(ctxt, 0);
- rc = X86EMUL_PROPAGATE_FAULT;
- goto done;
- }
+ if (c->modrm_mod != 3)
+ goto cannot_emulate;
+ realmode_set_cr(ctxt->vcpu,
+ c->modrm_reg, c->modrm_val, &ctxt->eflags);
c->dst.type = OP_NONE;
break;
case 0x23: /* mov from reg to dr */
- if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
- (c->modrm_reg == 4 || c->modrm_reg == 5)) {
- emulate_ud(ctxt);
- rc = X86EMUL_PROPAGATE_FAULT;
- goto done;
- }
-
- if (ops->set_dr(c->modrm_reg, c->src.val &
- ((ctxt->mode == X86EMUL_MODE_PROT64) ?
- ~0ULL : ~0U), ctxt->vcpu) < 0) {
- /* #UD condition is already handled by the code above */
- emulate_gp(ctxt, 0);
- rc = X86EMUL_PROPAGATE_FAULT;
- goto done;
- }
-
+ if (c->modrm_mod != 3)
+ goto cannot_emulate;
+ rc = emulator_set_dr(ctxt, c->modrm_reg,
+ c->regs[c->modrm_rm]);
+ if (rc)
+ goto cannot_emulate;
c->dst.type = OP_NONE; /* no writeback */
break;
case 0x30:
/* wrmsr */
msr_data = (u32)c->regs[VCPU_REGS_RAX]
| ((u64)c->regs[VCPU_REGS_RDX] << 32);
- if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
- emulate_gp(ctxt, 0);
- rc = X86EMUL_PROPAGATE_FAULT;
- goto done;
+ rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
+ if (rc) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ c->eip = kvm_rip_read(ctxt->vcpu);
}
rc = X86EMUL_CONTINUE;
+ c->dst.type = OP_NONE;
break;
case 0x32:
/* rdmsr */
- if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
- emulate_gp(ctxt, 0);
- rc = X86EMUL_PROPAGATE_FAULT;
- goto done;
+ rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
+ if (rc) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ c->eip = kvm_rip_read(ctxt->vcpu);
} else {
c->regs[VCPU_REGS_RAX] = (u32)msr_data;
c->regs[VCPU_REGS_RDX] = msr_data >> 32;
}
rc = X86EMUL_CONTINUE;
+ c->dst.type = OP_NONE;
break;
case 0x34: /* sysenter */
- rc = emulate_sysenter(ctxt, ops);
+ rc = emulate_sysenter(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ else
+ goto writeback;
break;
case 0x35: /* sysexit */
- rc = emulate_sysexit(ctxt, ops);
+ rc = emulate_sysexit(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ else
+ goto writeback;
break;
case 0x40 ... 0x4f: /* cmov */
c->dst.val = c->dst.orig_val = c->src.val;
@@ -3626,15 +2610,15 @@ twobyte_insn:
case 0x80 ... 0x8f: /* jnz rel, etc*/
if (test_cc(c->b, ctxt->eflags))
jmp_rel(c, c->src.val);
- break;
- case 0x90 ... 0x9f: /* setcc r/m8 */
- c->dst.val = test_cc(c->b, ctxt->eflags);
+ c->dst.type = OP_NONE;
break;
case 0xa0: /* push fs */
- emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
+ emulate_push_sreg(ctxt, VCPU_SREG_FS);
break;
case 0xa1: /* pop fs */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
+ if (rc != 0)
+ goto done;
break;
case 0xa3:
bt: /* bt */
@@ -3648,13 +2632,17 @@ twobyte_insn:
emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
break;
case 0xa8: /* push gs */
- emulate_push_sreg(ctxt, ops, VCPU_SREG_GS);
+ emulate_push_sreg(ctxt, VCPU_SREG_GS);
break;
case 0xa9: /* pop gs */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
+ if (rc != 0)
+ goto done;
break;
case 0xab:
bts: /* bts */
+ /* only subword offset */
+ c->src.val &= (c->dst.bytes << 3) - 1;
emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
break;
case 0xac: /* shrd imm8, r, r/m */
@@ -3677,22 +2665,15 @@ twobyte_insn:
} else {
/* Failure: write the value we saw to EAX. */
c->dst.type = OP_REG;
- c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+ c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
}
break;
- case 0xb2: /* lss */
- rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS);
- break;
case 0xb3:
btr: /* btr */
+ /* only subword offset */
+ c->src.val &= (c->dst.bytes << 3) - 1;
emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
break;
- case 0xb4: /* lfs */
- rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS);
- break;
- case 0xb5: /* lgs */
- rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS);
- break;
case 0xb6 ... 0xb7: /* movzx */
c->dst.bytes = c->op_bytes;
c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
@@ -3712,60 +2693,31 @@ twobyte_insn:
break;
case 0xbb:
btc: /* btc */
+ /* only subword offset */
+ c->src.val &= (c->dst.bytes << 3) - 1;
emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
break;
- case 0xbc: { /* bsf */
- u8 zf;
- __asm__ ("bsf %2, %0; setz %1"
- : "=r"(c->dst.val), "=q"(zf)
- : "r"(c->src.val));
- ctxt->eflags &= ~X86_EFLAGS_ZF;
- if (zf) {
- ctxt->eflags |= X86_EFLAGS_ZF;
- c->dst.type = OP_NONE; /* Disable writeback. */
- }
- break;
- }
- case 0xbd: { /* bsr */
- u8 zf;
- __asm__ ("bsr %2, %0; setz %1"
- : "=r"(c->dst.val), "=q"(zf)
- : "r"(c->src.val));
- ctxt->eflags &= ~X86_EFLAGS_ZF;
- if (zf) {
- ctxt->eflags |= X86_EFLAGS_ZF;
- c->dst.type = OP_NONE; /* Disable writeback. */
- }
- break;
- }
case 0xbe ... 0xbf: /* movsx */
c->dst.bytes = c->op_bytes;
c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
(s16) c->src.val;
break;
- case 0xc0 ... 0xc1: /* xadd */
- emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
- /* Write back the register source. */
- c->src.val = c->dst.orig_val;
- write_register_operand(&c->src);
- break;
case 0xc3: /* movnti */
c->dst.bytes = c->op_bytes;
c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
(u64) c->src.val;
break;
case 0xc7: /* Grp9 (cmpxchg8b) */
- rc = emulate_grp9(ctxt, ops);
+ rc = emulate_grp9(ctxt, ops, memop);
+ if (rc != 0)
+ goto done;
+ c->dst.type = OP_NONE;
break;
- default:
- goto cannot_emulate;
}
-
- if (rc != X86EMUL_CONTINUE)
- goto done;
-
goto writeback;
cannot_emulate:
+ DPRINTF("Cannot emulate %02x\n", c->b);
+ c->eip = saved_eip;
return -1;
}
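
Note (not part of the patch): the bts/btr/btc cases above now clamp the bit offset to the destination operand width before the arithmetic helper runs, mirroring how the hardware BT family treats a register operand. A minimal user-space sketch of that masking follows; mask_bit_offset and op_bytes are illustrative stand-ins, not names from the emulator.

#include <stdio.h>

/*
 * Sketch of the (c->dst.bytes << 3) - 1 mask added in the hunks above:
 * op_bytes << 3 is the operand width in bits, and only that many low
 * bits of the offset are kept.
 */
static unsigned long mask_bit_offset(unsigned long offset, int op_bytes)
{
        return offset & ((unsigned long)(op_bytes << 3) - 1);
}

int main(void)
{
        printf("%lu\n", mask_bit_offset(21, 2));   /* 16-bit operand: 21 wraps to 5 */
        printf("%lu\n", mask_bit_offset(37, 4));   /* 32-bit operand: 37 wraps to 5 */
        return 0;
}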
diff --git a/linux/x86/eventfd.c b/linux/x86/eventfd.c
index babaeed..c3cb2fc 100644
--- a/linux/x86/eventfd.c
+++ b/linux/x86/eventfd.c
@@ -42,7 +42,6 @@
* kvm eventfd support - use eventfd objects to signal various KVM events
*
* Copyright 2009 Novell. All Rights Reserved.
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Author:
* Gregory Haskins <ghaskins@novell.com>
@@ -85,19 +84,14 @@
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,33)
struct _irqfd {
- /* Used for MSI fast-path */
- struct kvm *kvm;
- wait_queue_t wait;
- /* Update side is protected by irqfds.lock */
- struct kvm_kernel_irq_routing_entry __rcu *irq_entry;
- /* Used for level IRQ fast-path */
- int gsi;
- struct work_struct inject;
- /* Used for setup/shutdown */
- struct eventfd_ctx *eventfd;
- struct list_head list;
- poll_table pt;
- struct work_struct shutdown;
+ struct kvm *kvm;
+ struct eventfd_ctx *eventfd;
+ int gsi;
+ struct list_head list;
+ poll_table pt;
+ wait_queue_t wait;
+ struct work_struct inject;
+ struct work_struct shutdown;
};
static struct workqueue_struct *irqfd_cleanup_wq;
@@ -171,22 +165,14 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
unsigned long flags = (unsigned long)key;
- struct kvm_kernel_irq_routing_entry *irq;
- struct kvm *kvm = irqfd->kvm;
- if (flags & POLLIN) {
- rcu_read_lock();
- irq = rcu_dereference(irqfd->irq_entry);
+ if (flags & POLLIN)
/* An event has been signaled, inject an interrupt */
- if (irq)
- kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1);
- else
- schedule_work(&irqfd->inject);
- rcu_read_unlock();
- }
+ schedule_work(&irqfd->inject);
if (flags & POLLHUP) {
/* The eventfd is closing, detach from KVM */
+ struct kvm *kvm = irqfd->kvm;
unsigned long flags;
spin_lock_irqsave(&kvm->irqfds.lock, flags);
@@ -217,31 +203,9 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
add_wait_queue(wqh, &irqfd->wait);
}
-/* Must be called under irqfds.lock */
-static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd,
- struct kvm_irq_routing_table *irq_rt)
-{
- struct kvm_kernel_irq_routing_entry *e;
- struct hlist_node *n;
-
- if (irqfd->gsi >= irq_rt->nr_rt_entries) {
- rcu_assign_pointer(irqfd->irq_entry, NULL);
- return;
- }
-
- hlist_for_each_entry(e, n, &irq_rt->map[irqfd->gsi], link) {
- /* Only fast-path MSI. */
- if (e->type == KVM_IRQ_ROUTING_MSI)
- rcu_assign_pointer(irqfd->irq_entry, e);
- else
- rcu_assign_pointer(irqfd->irq_entry, NULL);
- }
-}
-
static int
kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
{
- struct kvm_irq_routing_table *irq_rt;
struct _irqfd *irqfd, *tmp;
struct file *file = NULL;
struct eventfd_ctx *eventfd = NULL;
@@ -255,8 +219,8 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
irqfd->kvm = kvm;
irqfd->gsi = gsi;
INIT_LIST_HEAD(&irqfd->list);
- INIT_WORK(&irqfd->inject, irqfd_inject);
- INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
+ kvm_INIT_WORK(&irqfd->inject, irqfd_inject);
+ kvm_INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
file = eventfd_fget(fd);
if (IS_ERR(file)) {
@@ -291,13 +255,10 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
goto fail;
}
- irq_rt = rcu_dereference_protected(kvm->irq_routing,
- lockdep_is_held(&kvm->irqfds.lock));
- irqfd_update(kvm, irqfd, irq_rt);
-
events = file->f_op->poll(file, &irqfd->pt);
list_add_tail(&irqfd->list, &kvm->irqfds.items);
+ spin_unlock_irq(&kvm->irqfds.lock);
/*
* Check if there was an event already pending on the eventfd
@@ -306,8 +267,6 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
if (events & POLLIN)
schedule_work(&irqfd->inject);
- spin_unlock_irq(&kvm->irqfds.lock);
-
/*
* do not drop the file until the irqfd is fully initialized, otherwise
* we might race against the POLLHUP
@@ -351,17 +310,8 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
spin_lock_irq(&kvm->irqfds.lock);
list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
- if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) {
- /*
- * This rcu_assign_pointer is needed for when
- * another thread calls kvm_irqfd_update before
- * we flush workqueue below.
- * It is paired with synchronize_rcu done by caller
- * of that function.
- */
- rcu_assign_pointer(irqfd->irq_entry, NULL);
+ if (irqfd->eventfd == eventfd && irqfd->gsi == gsi)
irqfd_deactivate(irqfd);
- }
}
spin_unlock_irq(&kvm->irqfds.lock);
@@ -411,25 +361,6 @@ kvm_irqfd_release(struct kvm *kvm)
}
/*
- * Change irq_routing and irqfd.
- * Caller must invoke synchronize_rcu afterwards.
- */
-void kvm_irq_routing_update(struct kvm *kvm,
- struct kvm_irq_routing_table *irq_rt)
-{
- struct _irqfd *irqfd;
-
- spin_lock_irq(&kvm->irqfds.lock);
-
- rcu_assign_pointer(kvm->irq_routing, irq_rt);
-
- list_for_each_entry(irqfd, &kvm->irqfds.items, list)
- irqfd_update(kvm, irqfd, irq_rt);
-
- spin_unlock_irq(&kvm->irqfds.lock);
-}
-
-/*
* create a host-wide workqueue for issuing deferred shutdown requests
* aggregated from all vm* instances. We need our own isolated single-thread
* queue to prevent deadlock against flushing the normal work-queue.
@@ -697,9 +628,4 @@ kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
#else
void kvm_eventfd_init(struct kvm *kvm) { }
void kvm_irqfd_release(struct kvm *kvm) { }
-void kvm_irq_routing_update(struct kvm *kvm,
- struct kvm_irq_routing_table *irq_rt)
-{
- rcu_assign_pointer(kvm->irq_routing, irq_rt);
-}
#endif
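
For context on the eventfd hunk above: with the MSI fast path reverted, the wait-queue callback only decides which work item to queue. A minimal single-file sketch of that decision; queue_inject, queue_shutdown and the DEMO_* flags are hypothetical stand-ins for the real work items and poll flags.

#include <stdio.h>

#define DEMO_POLLIN  0x001
#define DEMO_POLLHUP 0x010

static void queue_inject(void)   { puts("queue inject work"); }
static void queue_shutdown(void) { puts("queue shutdown work on isolated wq"); }

/*
 * Rough model of irqfd_wakeup() after the revert: POLLIN always defers
 * injection to a work item, and POLLHUP defers teardown to the dedicated
 * cleanup workqueue so the wait-queue entry is never freed from inside
 * its own callback.
 */
static void irqfd_wakeup_sketch(unsigned long flags)
{
        if (flags & DEMO_POLLIN)
                queue_inject();
        if (flags & DEMO_POLLHUP)
                queue_shutdown();
}

int main(void)
{
        irqfd_wakeup_sketch(DEMO_POLLIN);
        irqfd_wakeup_sketch(DEMO_POLLHUP);
        return 0;
}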
diff --git a/linux/x86/i8254.c b/linux/x86/i8254.c
index 64aa827..13dc226 100644
--- a/linux/x86/i8254.c
+++ b/linux/x86/i8254.c
@@ -45,7 +45,6 @@
* Copyright (c) 2006 Intel Corporation
* Copyright (c) 2007 Keir Fraser, XenSource Inc
* Copyright (c) 2008 Intel Corporation
- * Copyright 2009 Red Hat, Inc. and/or its affiliates.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -74,7 +73,6 @@
#include <linux/kvm_host.h>
#include <linux/slab.h>
-#include <linux/workqueue.h>
#include "irq.h"
#include "i8254.h"
@@ -272,26 +270,24 @@ static void pit_latch_status(struct kvm *kvm, int channel)
}
}
+int pit_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pit *pit = vcpu->kvm->arch.vpit;
+
+ if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack)
+ return atomic_read(&pit->pit_state.pit_timer.pending);
+ return 0;
+}
+
static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
{
struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
irq_ack_notifier);
- int value;
-
- spin_lock(&ps->inject_lock);
- value = atomic_dec_return(&ps->pit_timer.pending);
- if (value < 0)
- /* spurious acks can be generated if, for example, the
- * PIC is being reset. Handle it gracefully here
- */
+ raw_spin_lock(&ps->inject_lock);
+ if (atomic_dec_return(&ps->pit_timer.pending) < 0)
atomic_inc(&ps->pit_timer.pending);
- else if (value > 0)
- /* in this case, we had multiple outstanding pit interrupts
- * that we needed to inject. Reinject
- */
- queue_work(ps->pit->wq, &ps->pit->expired);
ps->irq_ack = 1;
- spin_unlock(&ps->inject_lock);
+ raw_spin_unlock(&ps->inject_lock);
}
void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -303,14 +299,14 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
return;
timer = &pit->pit_state.pit_timer.timer;
- if (hrtimer_cancel(timer))
+ if (hrtimer_cancel_p(timer))
kvm_hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
-static void destroy_pit_timer(struct kvm_pit *pit)
+static void destroy_pit_timer(struct kvm_timer *pt)
{
- hrtimer_cancel(&pit->pit_state.pit_timer.timer);
- cancel_work_sync(&pit->expired);
+ pr_debug("pit: " "execute del timer!\n");
+ hrtimer_cancel_p(&pt->timer);
}
static bool kpit_is_periodic(struct kvm_timer *ktimer)
@@ -324,60 +320,6 @@ static struct kvm_timer_ops kpit_ops = {
.is_periodic = kpit_is_periodic,
};
-static void pit_do_work(struct work_struct *work)
-{
- struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
- struct kvm *kvm = pit->kvm;
- struct kvm_vcpu *vcpu;
- int i;
- struct kvm_kpit_state *ps = &pit->pit_state;
- int inject = 0;
-
- /* Try to inject pending interrupts when
- * last one has been acked.
- */
- spin_lock(&ps->inject_lock);
- if (ps->irq_ack) {
- ps->irq_ack = 0;
- inject = 1;
- }
- spin_unlock(&ps->inject_lock);
- if (inject) {
- kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
- kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
-
- /*
- * Provides NMI watchdog support via Virtual Wire mode.
- * The route is: PIT -> PIC -> LVT0 in NMI mode.
- *
- * Note: Our Virtual Wire implementation is simplified, only
- * propagating PIT interrupts to all VCPUs when they have set
- * LVT0 to NMI delivery. Other PIC interrupts are just sent to
- * VCPU0, and only if its LVT0 is in EXTINT mode.
- */
- if (kvm->arch.vapics_in_nmi_mode > 0)
- kvm_for_each_vcpu(i, vcpu, kvm)
- kvm_apic_nmi_wd_deliver(vcpu);
- }
-}
-
-static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
-{
- struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
- struct kvm_pit *pt = ktimer->kvm->arch.vpit;
-
- if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
- atomic_inc(&ktimer->pending);
- queue_work(pt->wq, &pt->expired);
- }
-
- if (ktimer->t_ops->is_periodic(ktimer)) {
- kvm_hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
- return HRTIMER_RESTART;
- } else
- return HRTIMER_NORESTART;
-}
-
static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
{
struct kvm_timer *pt = &ps->pit_timer;
@@ -388,19 +330,20 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
pr_debug("pit: " "create pit timer, interval is %llu nsec\n", interval);
/* TODO The new value only affected after the retriggered */
- hrtimer_cancel(&pt->timer);
- cancel_work_sync(&ps->pit->expired);
+ hrtimer_cancel_p(&pt->timer);
pt->period = interval;
ps->is_periodic = is_period;
- pt->timer.function = pit_timer_fn;
+ pt->timer.function = kvm_timer_fn;
+ hrtimer_data_pointer(&pt->timer);
pt->t_ops = &kpit_ops;
pt->kvm = ps->pit->kvm;
+ pt->vcpu = pt->kvm->bsp_vcpu;
atomic_set(&pt->pending, 0);
ps->irq_ack = 1;
- hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
+ hrtimer_start_p(&pt->timer, ktime_add_ns(ktime_get(), interval),
HRTIMER_MODE_ABS);
}
@@ -444,7 +387,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
}
break;
default:
- destroy_pit_timer(kvm->arch.vpit);
+ destroy_pit_timer(&ps->pit_timer);
}
}
@@ -723,23 +666,14 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
mutex_init(&pit->pit_state.lock);
mutex_lock(&pit->pit_state.lock);
- spin_lock_init(&pit->pit_state.inject_lock);
-
- pit->wq = create_singlethread_workqueue("kvm-pit-wq");
- if (!pit->wq) {
- mutex_unlock(&pit->pit_state.lock);
- kvm_free_irq_source_id(kvm, pit->irq_source_id);
- kfree(pit);
- return NULL;
- }
- INIT_WORK(&pit->expired, pit_do_work);
+ raw_spin_lock_init(&pit->pit_state.inject_lock);
kvm->arch.vpit = pit;
pit->kvm = kvm;
pit_state = &pit->pit_state;
pit_state->pit = pit;
- hrtimer_init(&pit_state->pit_timer.timer,
+ hrtimer_init_p(&pit_state->pit_timer.timer,
CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
pit_state->irq_ack_notifier.gsi = 0;
pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
@@ -774,7 +708,7 @@ fail:
kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
kvm_free_irq_source_id(kvm, pit->irq_source_id);
- destroy_workqueue(pit->wq);
+
kfree(pit);
return NULL;
}
@@ -784,20 +718,61 @@ void kvm_free_pit(struct kvm *kvm)
struct hrtimer *timer;
if (kvm->arch.vpit) {
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev);
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
- &kvm->arch.vpit->speaker_dev);
kvm_unregister_irq_mask_notifier(kvm, 0,
&kvm->arch.vpit->mask_notifier);
kvm_unregister_irq_ack_notifier(kvm,
&kvm->arch.vpit->pit_state.irq_ack_notifier);
mutex_lock(&kvm->arch.vpit->pit_state.lock);
timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
- hrtimer_cancel(timer);
- cancel_work_sync(&kvm->arch.vpit->expired);
+ hrtimer_cancel_p(timer);
kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- destroy_workqueue(kvm->arch.vpit->wq);
kfree(kvm->arch.vpit);
}
}
+
+static void __inject_pit_timer_intr(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
+ kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
+
+ /*
+ * Provides NMI watchdog support via Virtual Wire mode.
+ * The route is: PIT -> PIC -> LVT0 in NMI mode.
+ *
+ * Note: Our Virtual Wire implementation is simplified, only
+ * propagating PIT interrupts to all VCPUs when they have set
+ * LVT0 to NMI delivery. Other PIC interrupts are just sent to
+ * VCPU0, and only if its LVT0 is in EXTINT mode.
+ */
+ if (kvm->arch.vapics_in_nmi_mode > 0)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_apic_nmi_wd_deliver(vcpu);
+}
+
+void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pit *pit = vcpu->kvm->arch.vpit;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_kpit_state *ps;
+
+ if (pit) {
+ int inject = 0;
+ ps = &pit->pit_state;
+
+ /* Try to inject pending interrupts when
+ * last one has been acked.
+ */
+ raw_spin_lock(&ps->inject_lock);
+ if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
+ ps->irq_ack = 0;
+ inject = 1;
+ }
+ raw_spin_unlock(&ps->inject_lock);
+ if (inject)
+ __inject_pit_timer_intr(kvm);
+ }
+}
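
The i8254 hunks restore the pending/irq_ack handshake between the timer callback, the ack notifier and kvm_inject_pit_timer_irqs(). A single-threaded sketch of that bookkeeping follows; plain ints stand in for the atomic_t and the raw spinlock, and pit_state_sketch is not a driver type.

#include <stdio.h>

struct pit_state_sketch {
        int pending;   /* ticks not yet delivered (atomic_t in the driver) */
        int irq_ack;   /* last injected tick has been acked */
};

static void timer_fired(struct pit_state_sketch *ps)
{
        ps->pending++;
}

static void guest_acked(struct pit_state_sketch *ps)
{
        if (--ps->pending < 0)          /* spurious ack, e.g. PIC reset */
                ps->pending++;
        ps->irq_ack = 1;
}

static int try_inject(struct pit_state_sketch *ps)
{
        if (ps->pending && ps->irq_ack) {
                ps->irq_ack = 0;        /* wait for the next ack */
                return 1;               /* caller raises IRQ0 */
        }
        return 0;
}

int main(void)
{
        struct pit_state_sketch ps = { .pending = 0, .irq_ack = 1 };

        timer_fired(&ps);
        printf("inject=%d\n", try_inject(&ps));  /* 1: pending and acked */
        printf("inject=%d\n", try_inject(&ps));  /* 0: waiting for the ack */
        guest_acked(&ps);
        printf("inject=%d\n", try_inject(&ps));  /* 0: nothing pending */
        return 0;
}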
diff --git a/linux/x86/i8254.h b/linux/x86/i8254.h
index 46d08ca..900d6b0 100644
--- a/linux/x86/i8254.h
+++ b/linux/x86/i8254.h
@@ -27,7 +27,7 @@ struct kvm_kpit_state {
u32 speaker_data_on;
struct mutex lock;
struct kvm_pit *pit;
- spinlock_t inject_lock;
+ raw_spinlock_t inject_lock;
unsigned long irq_ack;
struct kvm_irq_ack_notifier irq_ack_notifier;
};
@@ -40,8 +40,6 @@ struct kvm_pit {
struct kvm_kpit_state pit_state;
int irq_source_id;
struct kvm_irq_mask_notifier mask_notifier;
- struct workqueue_struct *wq;
- struct work_struct expired;
};
#define KVM_PIT_BASE_ADDRESS 0x40
diff --git a/linux/x86/i8259.c b/linux/x86/i8259.c
index d316fc6..76c25b5 100644
--- a/linux/x86/i8259.c
+++ b/linux/x86/i8259.c
@@ -43,7 +43,6 @@
*
* Copyright (c) 2003-2004 Fabrice Bellard
* Copyright (c) 2007 Intel Corporation
- * Copyright 2009 Red Hat, Inc. and/or its affiliates.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -74,44 +73,6 @@
#include <linux/kvm_host.h>
#include "trace.h"
-static void pic_irq_request(struct kvm *kvm, int level);
-
-static void pic_lock(struct kvm_pic *s)
- __acquires(&s->lock)
-{
- spin_lock(&s->lock);
-}
-
-static void pic_unlock(struct kvm_pic *s)
- __releases(&s->lock)
-{
- bool wakeup = s->wakeup_needed;
- struct kvm_vcpu *vcpu, *found = NULL;
- int i;
-
- s->wakeup_needed = false;
-
- spin_unlock(&s->lock);
-
- if (wakeup) {
- kvm_for_each_vcpu(i, vcpu, s->kvm) {
- if (kvm_apic_accept_pic_intr(vcpu)) {
- found = vcpu;
- break;
- }
- }
-
- if (!found)
- found = s->kvm->bsp_vcpu;
-
- if (!found)
- return;
-
- kvm_make_request(KVM_REQ_EVENT, found);
- kvm_vcpu_kick(found);
- }
-}
-
static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
{
s->isr &= ~(1 << irq);
@@ -124,19 +85,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
* Other interrupt may be delivered to PIC while lock is dropped but
* it should be safe since PIC state is already updated at this stage.
*/
- pic_unlock(s->pics_state);
+ raw_spin_unlock(&s->pics_state->lock);
kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
- pic_lock(s->pics_state);
+ raw_spin_lock(&s->pics_state->lock);
}
void kvm_pic_clear_isr_ack(struct kvm *kvm)
{
struct kvm_pic *s = pic_irqchip(kvm);
- pic_lock(s);
+ raw_spin_lock(&s->lock);
s->pics[0].isr_ack = 0xff;
s->pics[1].isr_ack = 0xff;
- pic_unlock(s);
+ raw_spin_unlock(&s->lock);
}
/*
@@ -229,14 +190,17 @@ static void pic_update_irq(struct kvm_pic *s)
pic_set_irq1(&s->pics[0], 2, 0);
}
irq = pic_get_irq(&s->pics[0]);
- pic_irq_request(s->kvm, irq >= 0);
+ if (irq >= 0)
+ s->irq_request(s->irq_request_opaque, 1);
+ else
+ s->irq_request(s->irq_request_opaque, 0);
}
void kvm_pic_update_irq(struct kvm_pic *s)
{
- pic_lock(s);
+ raw_spin_lock(&s->lock);
pic_update_irq(s);
- pic_unlock(s);
+ raw_spin_unlock(&s->lock);
}
int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -244,14 +208,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
struct kvm_pic *s = opaque;
int ret = -1;
- pic_lock(s);
+ raw_spin_lock(&s->lock);
if (irq >= 0 && irq < PIC_NUM_PINS) {
ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
pic_update_irq(s);
trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
s->pics[irq >> 3].imr, ret == 0);
}
- pic_unlock(s);
+ raw_spin_unlock(&s->lock);
return ret;
}
@@ -281,7 +245,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
int irq, irq2, intno;
struct kvm_pic *s = pic_irqchip(kvm);
- pic_lock(s);
+ raw_spin_lock(&s->lock);
irq = pic_get_irq(&s->pics[0]);
if (irq >= 0) {
pic_intack(&s->pics[0], irq);
@@ -306,7 +270,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
intno = s->pics[0].irq_base + irq;
}
pic_update_irq(s);
- pic_unlock(s);
+ raw_spin_unlock(&s->lock);
return intno;
}
@@ -314,7 +278,8 @@ int kvm_pic_read_irq(struct kvm *kvm)
void kvm_pic_reset(struct kvm_kpic_state *s)
{
int irq;
- struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu;
+ struct kvm *kvm = s->pics_state->irq_request_opaque;
+ struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
u8 irr = s->irr, isr = s->imr;
s->last_irr = 0;
@@ -349,17 +314,14 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
addr &= 1;
if (addr == 0) {
if (val & 0x10) {
- s->init4 = val & 1;
- s->last_irr = 0;
- s->imr = 0;
- s->priority_add = 0;
- s->special_mask = 0;
- s->read_reg_select = 0;
- if (!s->init4) {
- s->special_fully_nested_mode = 0;
- s->auto_eoi = 0;
- }
+ kvm_pic_reset(s); /* init */
+ /*
+ * deassert a pending interrupt
+ */
+ s->pics_state->irq_request(s->pics_state->
+ irq_request_opaque, 0);
s->init_state = 1;
+ s->init4 = val & 1;
if (val & 0x02)
printk(KERN_ERR "single mode not supported");
if (val & 0x08)
@@ -411,20 +373,10 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
}
} else
switch (s->init_state) {
- case 0: { /* normal mode */
- u8 imr_diff = s->imr ^ val,
- off = (s == &s->pics_state->pics[0]) ? 0 : 8;
+ case 0: /* normal mode */
s->imr = val;
- for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
- if (imr_diff & (1 << irq))
- kvm_fire_mask_notifiers(
- s->pics_state->kvm,
- SELECT_PIC(irq + off),
- irq + off,
- !!(s->imr & (1 << irq)));
pic_update_irq(s->pics_state);
break;
- }
case 1:
s->irq_base = val & 0xf8;
s->init_state = 2;
@@ -532,7 +484,7 @@ static int picdev_write(struct kvm_io_device *this,
printk(KERN_ERR "PIC: non byte write\n");
return 0;
}
- pic_lock(s);
+ raw_spin_lock(&s->lock);
switch (addr) {
case 0x20:
case 0x21:
@@ -545,7 +497,7 @@ static int picdev_write(struct kvm_io_device *this,
elcr_ioport_write(&s->pics[addr & 1], addr, data);
break;
}
- pic_unlock(s);
+ raw_spin_unlock(&s->lock);
return 0;
}
@@ -562,7 +514,7 @@ static int picdev_read(struct kvm_io_device *this,
printk(KERN_ERR "PIC: non byte read\n");
return 0;
}
- pic_lock(s);
+ raw_spin_lock(&s->lock);
switch (addr) {
case 0x20:
case 0x21:
@@ -576,15 +528,16 @@ static int picdev_read(struct kvm_io_device *this,
break;
}
*(unsigned char *)val = data;
- pic_unlock(s);
+ raw_spin_unlock(&s->lock);
return 0;
}
/*
* callback when PIC0 irq status changed
*/
-static void pic_irq_request(struct kvm *kvm, int level)
+static void pic_irq_request(void *opaque, int level)
{
+ struct kvm *kvm = opaque;
struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
struct kvm_pic *s = pic_irqchip(kvm);
int irq = pic_get_irq(&s->pics[0]);
@@ -592,7 +545,7 @@ static void pic_irq_request(struct kvm *kvm, int level)
s->output = level;
if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
s->pics[0].isr_ack &= ~(1 << irq);
- s->wakeup_needed = true;
+ kvm_vcpu_kick(vcpu);
}
}
@@ -609,14 +562,14 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
if (!s)
return NULL;
- spin_lock_init(&s->lock);
+ raw_spin_lock_init(&s->lock);
s->kvm = kvm;
s->pics[0].elcr_mask = 0xf8;
s->pics[1].elcr_mask = 0xde;
+ s->irq_request = pic_irq_request;
+ s->irq_request_opaque = kvm;
s->pics[0].pics_state = s;
s->pics[1].pics_state = s;
- s->pics[0].isr_ack = 0xff;
- s->pics[1].isr_ack = 0xff;
/*
* Initialize PIO device
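
pic_clear_isr() above keeps the pattern of updating PIC state under the lock and dropping it around the ack notifier, now with raw_spin_unlock/raw_spin_lock directly. A rough user-space analogue of that pattern; a pthread mutex stands in for the raw spinlock, and clear_isr/notify_acked are illustrative only.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pic_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned char isr = 0xff;

static void notify_acked(int irq)
{
        /* hypothetical notifier: must not run under pic_lock */
        printf("irq %d acked\n", irq);
}

static void clear_isr(int irq)
{
        pthread_mutex_lock(&pic_lock);
        isr &= ~(1u << irq);            /* state update under the lock */
        pthread_mutex_unlock(&pic_lock);
        notify_acked(irq);              /* callback with the lock dropped */
        pthread_mutex_lock(&pic_lock);
        /* ...any remaining locked work... */
        pthread_mutex_unlock(&pic_lock);
}

int main(void)
{
        clear_isr(3);
        return 0;
}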
diff --git a/linux/x86/ioapic.c b/linux/x86/ioapic.c
index f19b670..c87f9c0 100644
--- a/linux/x86/ioapic.c
+++ b/linux/x86/ioapic.c
@@ -40,7 +40,6 @@
#endif
/*
* Copyright (C) 2001 MandrakeSoft S.A.
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* MandrakeSoft S.A.
* 43, rue d'Aboukir
@@ -192,7 +191,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
update_handled_vectors(ioapic);
mask_after = e->fields.mask;
if (mask_before != mask_after)
- kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
+ kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after);
if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
&& ioapic->irr & (1 << index))
ioapic_service(ioapic, index);
@@ -233,13 +232,12 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
{
- u32 old_irr;
+ u32 old_irr = ioapic->irr;
u32 mask = 1 << irq;
union kvm_ioapic_redirect_entry entry;
int ret = 1;
spin_lock(&ioapic->lock);
- old_irr = ioapic->irr;
if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
entry = ioapic->redirtbl[irq];
level ^= entry.fields.polarity;
diff --git a/linux/x86/iommu.c b/linux/x86/iommu.c
index 381142f..20bf46f 100644
--- a/linux/x86/iommu.c
+++ b/linux/x86/iommu.c
@@ -56,8 +56,6 @@
*
* Copyright (C) 2006-2008 Intel Corporation
* Copyright IBM Corporation, 2008
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
* Author: Allen M. Kay <allen.m.kay@intel.com>
* Author: Weidong Han <weidong.han@intel.com>
* Author: Ben-Ami Yassour <benami@il.ibm.com>
@@ -74,30 +72,12 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);
static void kvm_iommu_put_pages(struct kvm *kvm,
gfn_t base_gfn, unsigned long npages);
-static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
- gfn_t gfn, unsigned long size)
-{
- gfn_t end_gfn;
- pfn_t pfn;
-
- pfn = gfn_to_pfn_memslot(kvm, slot, gfn);
- end_gfn = gfn + (size >> PAGE_SHIFT);
- gfn += 1;
-
- if (is_error_pfn(pfn))
- return pfn;
-
- while (gfn < end_gfn)
- gfn_to_pfn_memslot(kvm, slot, gfn++);
-
- return pfn;
-}
-
int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
{
- gfn_t gfn, end_gfn;
+ gfn_t gfn = slot->base_gfn;
+ unsigned long npages = slot->npages;
pfn_t pfn;
- int r = 0;
+ int i, r = 0;
struct iommu_domain *domain = kvm->arch.iommu_domain;
int flags;
@@ -105,79 +85,46 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
if (!domain)
return 0;
- gfn = slot->base_gfn;
- end_gfn = gfn + slot->npages;
-
flags = IOMMU_READ | IOMMU_WRITE;
if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)
flags |= IOMMU_CACHE;
-
- while (gfn < end_gfn) {
- unsigned long page_size;
-
- /* Check if already mapped */
- if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) {
- gfn += 1;
- continue;
- }
-
- /* Get the page size we could use to map */
- page_size = kvm_host_page_size(kvm, gfn);
-
- /* Make sure the page_size does not exceed the memslot */
- while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn)
- page_size >>= 1;
-
- /* Make sure gfn is aligned to the page size we want to map */
- while ((gfn << PAGE_SHIFT) & (page_size - 1))
- page_size >>= 1;
-
- /*
- * Pin all pages we are about to map in memory. This is
- * important because we unmap and unpin in 4kb steps later.
- */
- pfn = kvm_pin_pages(kvm, slot, gfn, page_size);
- if (is_error_pfn(pfn)) {
- gfn += 1;
+ for (i = 0; i < npages; i++) {
+ /* check if already mapped */
+ if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn)))
continue;
- }
- /* Map into IO address space */
- r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn),
- get_order(page_size), flags);
+ pfn = gfn_to_pfn_memslot(kvm, slot, gfn);
+ r = iommu_map_range(domain,
+ gfn_to_gpa(gfn),
+ pfn_to_hpa(pfn),
+ PAGE_SIZE, flags);
if (r) {
printk(KERN_ERR "kvm_iommu_map_address:"
- "iommu failed to map pfn=%llx\n", pfn);
+ "iommu failed to map pfn=%lx\n", pfn);
goto unmap_pages;
}
-
- gfn += page_size >> PAGE_SHIFT;
-
-
+ gfn++;
}
-
return 0;
unmap_pages:
- kvm_iommu_put_pages(kvm, slot->base_gfn, gfn);
+ kvm_iommu_put_pages(kvm, slot->base_gfn, i);
return r;
}
static int kvm_iommu_map_memslots(struct kvm *kvm)
{
- int i, idx, r = 0;
+ int i, r = 0;
struct kvm_memslots *slots;
- idx = srcu_read_lock(&kvm->srcu);
- slots = kvm_memslots(kvm);
+ slots = rcu_dereference(kvm->memslots);
for (i = 0; i < slots->nmemslots; i++) {
r = kvm_iommu_map_pages(kvm, &slots->memslots[i]);
if (r)
break;
}
- srcu_read_unlock(&kvm->srcu, idx);
return r;
}
@@ -282,62 +229,40 @@ out_unmap:
return r;
}
-static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages)
-{
- unsigned long i;
-
- for (i = 0; i < npages; ++i)
- kvm_release_pfn_clean(pfn + i);
-}
-
static void kvm_iommu_put_pages(struct kvm *kvm,
gfn_t base_gfn, unsigned long npages)
{
- struct iommu_domain *domain;
- gfn_t end_gfn, gfn;
+ gfn_t gfn = base_gfn;
pfn_t pfn;
+ struct iommu_domain *domain = kvm->arch.iommu_domain;
+ unsigned long i;
u64 phys;
- domain = kvm->arch.iommu_domain;
- end_gfn = base_gfn + npages;
- gfn = base_gfn;
-
/* check if iommu exists and in use */
if (!domain)
return;
- while (gfn < end_gfn) {
- unsigned long unmap_pages;
- int order;
-
- /* Get physical address */
+ for (i = 0; i < npages; i++) {
phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn));
- pfn = phys >> PAGE_SHIFT;
-
- /* Unmap address from IO address space */
- order = iommu_unmap(domain, gfn_to_gpa(gfn), 0);
- unmap_pages = 1ULL << order;
-
- /* Unpin all pages we just unmapped to not leak any memory */
- kvm_unpin_pages(kvm, pfn, unmap_pages);
-
- gfn += unmap_pages;
+ pfn = phys >> PAGE_SHIFT;
+ kvm_release_pfn_clean(pfn);
+ gfn++;
}
+
+ iommu_unmap_range(domain, gfn_to_gpa(base_gfn), PAGE_SIZE * npages);
}
static int kvm_iommu_unmap_memslots(struct kvm *kvm)
{
- int i, idx;
+ int i;
struct kvm_memslots *slots;
- idx = srcu_read_lock(&kvm->srcu);
- slots = kvm_memslots(kvm);
+ slots = rcu_dereference(kvm->memslots);
for (i = 0; i < slots->nmemslots; i++) {
kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn,
slots->memslots[i].npages);
}
- srcu_read_unlock(&kvm->srcu, idx);
return 0;
}
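
The reverted kvm_iommu_map_pages() above maps one 4K frame per iteration and, on failure at frame i, tears down only the frames already established. A small sketch of that rollback bookkeeping; map_one and unmap_range are hypothetical stand-ins for iommu_map_range and kvm_iommu_put_pages.

#include <stdio.h>

static int map_one(unsigned long gfn)
{
        return gfn == 7 ? -1 : 0;       /* pretend frame 7 fails */
}

static void unmap_range(unsigned long base_gfn, unsigned long npages)
{
        printf("unmapping %lu frames starting at gfn %lu\n", npages, base_gfn);
}

static int map_pages(unsigned long base_gfn, unsigned long npages)
{
        unsigned long i;

        for (i = 0; i < npages; i++) {
                if (map_one(base_gfn + i)) {
                        unmap_range(base_gfn, i);   /* roll back what was mapped */
                        return -1;
                }
        }
        return 0;
}

int main(void)
{
        return map_pages(0, 16) ? 1 : 0;  /* fails at gfn 7, rolls back 7 frames */
}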
diff --git a/linux/x86/irq.c b/linux/x86/irq.c
index d1372c0..255aaf7 100644
--- a/linux/x86/irq.c
+++ b/linux/x86/irq.c
@@ -41,7 +41,6 @@
/*
* irq.c: API for in kernel interrupt controller
* Copyright (c) 2007, Intel Corporation.
- * Copyright 2009 Red Hat, Inc. and/or its affiliates.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
@@ -73,7 +72,12 @@
*/
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
{
- return apic_has_pending_timer(vcpu);
+ int ret;
+
+ ret = pit_has_pending_timer(vcpu);
+ ret |= apic_has_pending_timer(vcpu);
+
+ return ret;
}
EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
@@ -125,6 +129,7 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{
kvm_inject_apic_timer_irqs(vcpu);
+ kvm_inject_pit_timer_irqs(vcpu);
/* TODO: PIT, RTC etc. */
}
EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
diff --git a/linux/x86/irq.h b/linux/x86/irq.h
index 90f1923..a4cb03d 100644
--- a/linux/x86/irq.h
+++ b/linux/x86/irq.h
@@ -38,11 +38,14 @@
struct kvm;
struct kvm_vcpu;
+typedef void irq_request_func(void *opaque, int level);
+
struct kvm_kpic_state {
u8 last_irr; /* edge detection */
u8 irr; /* interrupt request register */
u8 imr; /* interrupt mask register */
u8 isr; /* interrupt service register */
+ u8 isr_ack; /* interrupt ack detection */
u8 priority_add; /* highest irq priority */
u8 irq_base;
u8 read_reg_select;
@@ -55,16 +58,16 @@ struct kvm_kpic_state {
u8 init4; /* true if 4 byte init */
u8 elcr; /* PIIX edge/trigger selection */
u8 elcr_mask;
- u8 isr_ack; /* interrupt ack detection */
struct kvm_pic *pics_state;
};
struct kvm_pic {
- spinlock_t lock;
- bool wakeup_needed;
+ raw_spinlock_t lock;
unsigned pending_acks;
struct kvm *kvm;
struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
+ irq_request_func *irq_request;
+ void *irq_request_opaque;
int output; /* intr from master PIC */
struct kvm_io_device dev;
void (*ack_notifier)(void *opaque, int irq);
diff --git a/linux/x86/irq_comm.c b/linux/x86/irq_comm.c
index 646d331..bbf8e05 100644
--- a/linux/x86/irq_comm.c
+++ b/linux/x86/irq_comm.c
@@ -57,7 +57,6 @@
* Authors:
* Yaozu (Eddie) Dong <Eddie.dong@intel.com>
*
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
*/
#include <linux/kvm_host.h>
@@ -140,7 +139,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
if (r < 0)
r = 0;
r += kvm_apic_set_irq(vcpu, irq);
- } else if (kvm_lapic_enabled(vcpu)) {
+ } else {
if (!lowest)
lowest = vcpu;
else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
@@ -154,8 +153,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
return r;
}
-int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level)
+static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level)
{
struct kvm_lapic_irq irq;
@@ -319,19 +318,15 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
synchronize_rcu();
}
-void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
- bool mask)
+void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask)
{
struct kvm_irq_mask_notifier *kimn;
struct hlist_node *n;
- int gsi;
rcu_read_lock();
- gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
- if (gsi != -1)
- hlist_for_each_entry_rcu(kimn, n, &kvm->mask_notifier_list, link)
- if (kimn->irq == gsi)
- kimn->func(kimn, mask);
+ hlist_for_each_entry_rcu(kimn, n, &kvm->mask_notifier_list, link)
+ if (kimn->irq == irq)
+ kimn->func(kimn, mask);
rcu_read_unlock();
}
@@ -449,9 +444,8 @@ int kvm_set_irq_routing(struct kvm *kvm,
mutex_lock(&kvm->irq_lock);
old = kvm->irq_routing;
- kvm_irq_routing_update(kvm, new);
+ rcu_assign_pointer(kvm->irq_routing, new);
mutex_unlock(&kvm->irq_lock);
-
synchronize_rcu();
new = old;
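
After the irq_comm change above, kvm_fire_mask_notifiers() is keyed directly by the irq number instead of a routing-table GSI lookup. A minimal sketch of walking such a notifier list; the RCU locking is elided and mask_notifier is a stand-in for struct kvm_irq_mask_notifier.

#include <stdio.h>
#include <stdbool.h>

struct mask_notifier {
        int irq;
        void (*func)(struct mask_notifier *n, bool mask);
        struct mask_notifier *next;
};

static void fire_mask_notifiers(struct mask_notifier *head, int irq, bool mask)
{
        struct mask_notifier *n;

        for (n = head; n; n = n->next)
                if (n->irq == irq)      /* call every notifier registered for this irq */
                        n->func(n, mask);
}

static void report(struct mask_notifier *n, bool mask)
{
        printf("irq %d is now %s\n", n->irq, mask ? "masked" : "unmasked");
}

int main(void)
{
        struct mask_notifier b = { .irq = 4, .func = report, .next = NULL };
        struct mask_notifier a = { .irq = 0, .func = report, .next = &b };

        fire_mask_notifiers(&a, 4, true);   /* only the irq-4 notifier fires */
        return 0;
}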
diff --git a/linux/x86/kvm_cache_regs.h b/linux/x86/kvm_cache_regs.h
index 3377d53..cff851c 100644
--- a/linux/x86/kvm_cache_regs.h
+++ b/linux/x86/kvm_cache_regs.h
@@ -36,20 +36,11 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
{
- might_sleep(); /* on svm */
-
if (!test_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_avail))
kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
- return vcpu->arch.walk_mmu->pdptrs[index];
-}
-
-static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index)
-{
- load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu));
-
- return mmu->pdptrs[index];
+ return vcpu->arch.pdptrs[index];
}
static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
@@ -73,37 +64,9 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
return vcpu->arch.cr4 & mask;
}
-static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
-{
- if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
- kvm_x86_ops->decache_cr3(vcpu);
- return vcpu->arch.cr3;
-}
-
static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
{
return kvm_read_cr4_bits(vcpu, ~0UL);
}
-static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
-{
- return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u)
- | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
-}
-
-static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.hflags |= HF_GUEST_MASK;
-}
-
-static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.hflags &= ~HF_GUEST_MASK;
-}
-
-static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.hflags & HF_GUEST_MASK;
-}
-
#endif
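
kvm_pdptr_read() above keeps the lazy read-through pattern: a regs_avail bit marks the cached copy valid, and a backend hook refills the cache on first use. A compact sketch of that pattern; vcpu_sketch and fetch_pdptrs are illustrative stand-ins for the vcpu state and the kvm_x86_ops->cache_reg hook.

#include <stdio.h>
#include <stdint.h>

#define EXREG_PDPTR 0

struct vcpu_sketch {
        unsigned long regs_avail;       /* bitmask of valid cached registers */
        uint64_t pdptrs[4];
};

static void fetch_pdptrs(struct vcpu_sketch *v)
{
        int i;

        for (i = 0; i < 4; i++)         /* pretend to read from the VMCS/VMCB */
                v->pdptrs[i] = 0x1000u * (i + 1);
        v->regs_avail |= 1ul << EXREG_PDPTR;
}

static uint64_t pdptr_read(struct vcpu_sketch *v, int index)
{
        if (!(v->regs_avail & (1ul << EXREG_PDPTR)))
                fetch_pdptrs(v);        /* refresh the cache on first use */
        return v->pdptrs[index];
}

int main(void)
{
        struct vcpu_sketch v = { 0 };

        printf("%llx\n", (unsigned long long)pdptr_read(&v, 2));  /* prints 3000 */
        return 0;
}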
diff --git a/linux/x86/kvm_main.c b/linux/x86/kvm_main.c
index 355948e..19f1924 100644
--- a/linux/x86/kvm_main.c
+++ b/linux/x86/kvm_main.c
@@ -45,7 +45,6 @@
* machines without emulation or binary translation.
*
* Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Avi Kivity <avi@qumranet.com>
@@ -95,12 +94,11 @@
#include <asm-generic/bitops/le.h>
#include "coalesced_mmio.h"
-#include "async_pf.h"
#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>
-MODULE_INFO(version, "kvm-kmod-2.6.38-rc7");
+MODULE_INFO(version, "kvm-kmod-2.6.34");
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
@@ -131,40 +129,15 @@ static void hardware_disable_all(void);
static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
-bool kvm_rebooting;
-EXPORT_SYMBOL_GPL(kvm_rebooting);
+static bool kvm_rebooting;
static bool largepages_enabled = true;
-static struct page *hwpoison_page;
-static pfn_t hwpoison_pfn;
-
-static struct page *fault_page;
-static pfn_t fault_pfn;
-
inline int kvm_is_mmio_pfn(pfn_t pfn)
{
if (pfn_valid(pfn)) {
- int reserved;
- struct page *tail = pfn_to_page(pfn);
- struct page *head = compound_trans_head(tail);
- reserved = PageReserved(head);
- if (head != tail) {
- /*
- * "head" is not a dangling pointer
- * (compound_trans_head takes care of that)
- * but the hugepage may have been splitted
- * from under us (and we may not hold a
- * reference count on the head page so it can
- * be reused before we run PageReferenced), so
- * we've to check PageTail before returning
- * what we just read.
- */
- smp_rmb();
- if (PageTail(tail))
- return reserved;
- }
- return PageReserved(tail);
+ struct page *page = compound_head(pfn_to_page(pfn));
+ return PageReserved(page);
}
return true;
@@ -210,7 +183,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
raw_spin_lock(&kvm->requests_lock);
me = smp_processor_id();
kvm_for_each_vcpu(i, vcpu, kvm) {
- if (kvm_make_check_request(req, vcpu))
+ if (test_and_set_bit(req, &vcpu->requests))
continue;
cpu = vcpu->cpu;
if (cpus != NULL && cpu != -1 && cpu != me)
@@ -229,12 +202,8 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
- int dirty_count = kvm->tlbs_dirty;
-
- smp_mb();
if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
++kvm->stat.remote_tlb_flush;
- cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
}
void kvm_reload_remote_mmus(struct kvm *kvm)
@@ -252,7 +221,6 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
vcpu->kvm = kvm;
vcpu->vcpu_id = id;
init_waitqueue_head(&vcpu->wq);
- kvm_async_pf_vcpu_init(vcpu);
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!page) {
@@ -311,12 +279,12 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
* pte after kvm_unmap_hva returned, without noticing the page
* is going to be freed.
*/
- idx = srcu_read_lock(&kvm->srcu);
+ idx = kvm_srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
kvm->mmu_notifier_seq++;
- need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
+ need_tlb_flush = kvm_unmap_hva(kvm, address);
spin_unlock(&kvm->mmu_lock);
- srcu_read_unlock(&kvm->srcu, idx);
+ kvm_srcu_read_unlock(&kvm->srcu, idx);
/* we've to flush the tlb before the pages can be freed */
if (need_tlb_flush)
@@ -335,12 +303,12 @@ void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
struct kvm *kvm = mmu_notifier_to_kvm(mn);
int idx;
- idx = srcu_read_lock(&kvm->srcu);
+ idx = kvm_srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
kvm->mmu_notifier_seq++;
kvm_set_spte_hva(kvm, address, pte);
spin_unlock(&kvm->mmu_lock);
- srcu_read_unlock(&kvm->srcu, idx);
+ kvm_srcu_read_unlock(&kvm->srcu, idx);
}
static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
@@ -351,7 +319,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
struct kvm *kvm = mmu_notifier_to_kvm(mn);
int need_tlb_flush = 0, idx;
- idx = srcu_read_lock(&kvm->srcu);
+ idx = kvm_srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
/*
* The count increase must become visible at unlock time as no
@@ -361,9 +329,8 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
kvm->mmu_notifier_count++;
for (; start < end; start += PAGE_SIZE)
need_tlb_flush |= kvm_unmap_hva(kvm, start);
- need_tlb_flush |= kvm->tlbs_dirty;
spin_unlock(&kvm->mmu_lock);
- srcu_read_unlock(&kvm->srcu, idx);
+ kvm_srcu_read_unlock(&kvm->srcu, idx);
/* we've to flush the tlb before the pages can be freed */
if (need_tlb_flush)
@@ -403,11 +370,11 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
struct kvm *kvm = mmu_notifier_to_kvm(mn);
int young, idx;
- idx = srcu_read_lock(&kvm->srcu);
+ idx = kvm_srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
young = kvm_age_hva(kvm, address);
spin_unlock(&kvm->mmu_lock);
- srcu_read_unlock(&kvm->srcu, idx);
+ kvm_srcu_read_unlock(&kvm->srcu, idx);
if (young)
kvm_flush_remote_tlbs(kvm);
@@ -415,33 +382,15 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
return young;
}
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,38)
-static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long address)
-{
- struct kvm *kvm = mmu_notifier_to_kvm(mn);
- int young, idx;
-
- idx = srcu_read_lock(&kvm->srcu);
- spin_lock(&kvm->mmu_lock);
- young = kvm_test_age_hva(kvm, address);
- spin_unlock(&kvm->mmu_lock);
- srcu_read_unlock(&kvm->srcu, idx);
-
- return young;
-}
-#endif
-
static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
struct mm_struct *mm)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
int idx;
- idx = srcu_read_lock(&kvm->srcu);
+ idx = kvm_srcu_read_lock(&kvm->srcu);
kvm_arch_flush_shadow(kvm);
- srcu_read_unlock(&kvm->srcu, idx);
+ kvm_srcu_read_unlock(&kvm->srcu, idx);
}
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
@@ -449,9 +398,6 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,38)
- .test_young = kvm_mmu_notifier_test_young,
-#endif
#ifdef MMU_NOTIFIER_HAS_CHANGE_PTE
.change_pte = kvm_mmu_notifier_change_pte,
#endif
@@ -475,15 +421,11 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
static struct kvm *kvm_create_vm(void)
{
- int r, i;
- struct kvm *kvm = kvm_arch_alloc_vm();
-
- if (!kvm)
- return ERR_PTR(-ENOMEM);
+ int r = 0, i;
+ struct kvm *kvm = kvm_arch_create_vm();
- r = kvm_arch_init_vm(kvm);
- if (r)
- goto out_err_nodisable;
+ if (IS_ERR(kvm))
+ goto out;
r = hardware_enable_all();
if (r)
@@ -497,19 +439,23 @@ static struct kvm *kvm_create_vm(void)
r = -ENOMEM;
kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
if (!kvm->memslots)
- goto out_err_nosrcu;
- if (init_srcu_struct(&kvm->srcu))
- goto out_err_nosrcu;
+ goto out_err;
+ if (kvm_init_srcu_struct(&kvm->srcu))
+ goto out_err;
for (i = 0; i < KVM_NR_BUSES; i++) {
kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
GFP_KERNEL);
- if (!kvm->buses[i])
+ if (!kvm->buses[i]) {
+ kvm_cleanup_srcu_struct(&kvm->srcu);
goto out_err;
+ }
}
r = kvm_init_mmu_notifier(kvm);
- if (r)
+ if (r) {
+ kvm_cleanup_srcu_struct(&kvm->srcu);
goto out_err;
+ }
kvm->mm = current->mm;
mmget(&kvm->mm->mm_count);
@@ -523,35 +469,22 @@ static struct kvm *kvm_create_vm(void)
spin_lock(&kvm_lock);
list_add(&kvm->vm_list, &vm_list);
spin_unlock(&kvm_lock);
-
+#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+ kvm_coalesced_mmio_init(kvm);
+#endif
+out:
return kvm;
out_err:
- cleanup_srcu_struct(&kvm->srcu);
-out_err_nosrcu:
hardware_disable_all();
out_err_nodisable:
for (i = 0; i < KVM_NR_BUSES; i++)
kfree(kvm->buses[i]);
kfree(kvm->memslots);
- kvm_arch_free_vm(kvm);
+ kfree(kvm);
return ERR_PTR(r);
}
-static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
-{
- if (!memslot->dirty_bitmap)
- return;
-
- if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE)
- vfree(memslot->dirty_bitmap_head);
- else
- kfree(memslot->dirty_bitmap_head);
-
- memslot->dirty_bitmap = NULL;
- memslot->dirty_bitmap_head = NULL;
-}
-
/*
* Free any memory in @free but not in @dont.
*/
@@ -564,7 +497,7 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
vfree(free->rmap);
if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
- kvm_destroy_dirty_bitmap(free);
+ vfree(free->dirty_bitmap);
for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
@@ -575,6 +508,7 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
}
free->npages = 0;
+ free->dirty_bitmap = NULL;
free->rmap = NULL;
}
@@ -608,9 +542,6 @@ static void kvm_destroy_vm(struct kvm *kvm)
kvm_arch_flush_shadow(kvm);
#endif
kvm_arch_destroy_vm(kvm);
- kvm_free_physmem(kvm);
- cleanup_srcu_struct(&kvm->srcu);
- kvm_arch_free_vm(kvm);
hardware_disable_all();
mmdrop(mm);
}
@@ -640,27 +571,6 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
}
/*
- * Allocation size is twice as large as the actual dirty bitmap size.
- * This makes it possible to do double buffering: see x86's
- * kvm_vm_ioctl_get_dirty_log().
- */
-static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
-{
- unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
-
- if (dirty_bytes > PAGE_SIZE)
- memslot->dirty_bitmap = vzalloc(dirty_bytes);
- else
- memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL);
-
- if (!memslot->dirty_bitmap)
- return -ENOMEM;
-
- memslot->dirty_bitmap_head = memslot->dirty_bitmap;
- return 0;
-}
-
-/*
* Allocate some memory and give it an address in the guest physical address
* space.
*
@@ -697,16 +607,11 @@ int __kvm_set_memory_region(struct kvm *kvm,
base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
npages = mem->memory_size >> PAGE_SHIFT;
- r = -EINVAL;
- if (npages > KVM_MEM_MAX_NR_PAGES)
- goto out;
-
if (!npages)
mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
new = old = *memslot;
- new.id = mem->slot;
new.base_gfn = base_gfn;
new.npages = npages;
new.flags = mem->flags;
@@ -737,11 +642,13 @@ int __kvm_set_memory_region(struct kvm *kvm,
/* Allocate if a slot is being created */
#ifndef CONFIG_S390
if (npages && !new.rmap) {
- new.rmap = vzalloc(npages * sizeof(*new.rmap));
+ new.rmap = vmalloc(npages * sizeof(struct page *));
if (!new.rmap)
goto out_free;
+ memset(new.rmap, 0, npages * sizeof(*new.rmap));
+
new.user_alloc = user_alloc;
new.userspace_addr = mem->userspace_addr;
}
@@ -760,18 +667,21 @@ int __kvm_set_memory_region(struct kvm *kvm,
if (new.lpage_info[i])
continue;
- lpages = 1 + ((base_gfn + npages - 1)
- >> KVM_HPAGE_GFN_SHIFT(level));
- lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
+ lpages = 1 + (base_gfn + npages - 1) /
+ KVM_PAGES_PER_HPAGE(level);
+ lpages -= base_gfn / KVM_PAGES_PER_HPAGE(level);
- new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i]));
+ new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i]));
if (!new.lpage_info[i])
goto out_free;
- if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
+ memset(new.lpage_info[i], 0,
+ lpages * sizeof(*new.lpage_info[i]));
+
+ if (base_gfn % KVM_PAGES_PER_HPAGE(level))
new.lpage_info[i][0].write_count = 1;
- if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
+ if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE(level))
new.lpage_info[i][lpages - 1].write_count = 1;
ugfn = new.userspace_addr >> PAGE_SHIFT;
/*
@@ -789,8 +699,12 @@ skip_lpage:
/* Allocate page dirty bitmap if needed */
if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
- if (kvm_create_dirty_bitmap(&new) < 0)
+ unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(&new);
+
+ new.dirty_bitmap = vmalloc(dirty_bytes);
+ if (!new.dirty_bitmap)
goto out_free;
+ memset(new.dirty_bitmap, 0, dirty_bytes);
/* destroy any largepage mappings for dirty tracking */
if (old.npages)
flush_shadow = 1;
@@ -809,7 +723,6 @@ skip_lpage:
memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
if (mem->slot >= slots->nmemslots)
slots->nmemslots = mem->slot + 1;
- slots->generation++;
slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;
old_memslots = kvm->memslots;
@@ -830,12 +743,14 @@ skip_lpage:
if (r)
goto out_free;
+#ifdef CONFIG_DMAR
/* map the pages in iommu page table */
if (npages) {
r = kvm_iommu_map_pages(kvm, &new);
if (r)
goto out_free;
}
+#endif
r = -ENOMEM;
slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
@@ -844,7 +759,6 @@ skip_lpage:
memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
if (mem->slot >= slots->nmemslots)
slots->nmemslots = mem->slot + 1;
- slots->generation++;
/* actual memory is freed via old in kvm_free_physmem_slot below */
if (!npages) {
@@ -942,28 +856,16 @@ EXPORT_SYMBOL_GPL(kvm_disable_largepages);
int is_error_page(struct page *page)
{
- return page == bad_page || page == hwpoison_page || page == fault_page;
+ return page == bad_page;
}
EXPORT_SYMBOL_GPL(is_error_page);
int is_error_pfn(pfn_t pfn)
{
- return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
+ return pfn == bad_pfn;
}
EXPORT_SYMBOL_GPL(is_error_pfn);
-int is_hwpoison_pfn(pfn_t pfn)
-{
- return pfn == hwpoison_pfn;
-}
-EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
-
-int is_fault_pfn(pfn_t pfn)
-{
- return pfn == fault_pfn;
-}
-EXPORT_SYMBOL_GPL(is_fault_pfn);
-
static inline unsigned long bad_hva(void)
{
return PAGE_OFFSET;
@@ -975,10 +877,10 @@ int kvm_is_error_hva(unsigned long addr)
}
EXPORT_SYMBOL_GPL(kvm_is_error_hva);
-static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots,
- gfn_t gfn)
+struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
{
int i;
+ struct kvm_memslots *slots = rcu_dereference(kvm->memslots);
for (i = 0; i < slots->nmemslots; ++i) {
struct kvm_memory_slot *memslot = &slots->memslots[i];
@@ -989,18 +891,20 @@ static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots,
}
return NULL;
}
+EXPORT_SYMBOL_GPL(gfn_to_memslot_unaliased);
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
- return __gfn_to_memslot(kvm_memslots(kvm), gfn);
+ gfn = unalias_gfn(kvm, gfn);
+ return gfn_to_memslot_unaliased(kvm, gfn);
}
-EXPORT_SYMBOL_GPL(gfn_to_memslot);
int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
int i;
- struct kvm_memslots *slots = kvm_memslots(kvm);
+ struct kvm_memslots *slots = rcu_dereference(kvm->memslots);
+ gfn = unalias_gfn_instantiation(kvm, gfn);
for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
struct kvm_memory_slot *memslot = &slots->memslots[i];
@@ -1042,9 +946,10 @@ out:
int memslot_id(struct kvm *kvm, gfn_t gfn)
{
int i;
- struct kvm_memslots *slots = kvm_memslots(kvm);
+ struct kvm_memslots *slots = rcu_dereference(kvm->memslots);
struct kvm_memory_slot *memslot = NULL;
+ gfn = unalias_gfn(kvm, gfn);
for (i = 0; i < slots->nmemslots; ++i) {
memslot = &slots->memslots[i];
@@ -1056,179 +961,76 @@ int memslot_id(struct kvm *kvm, gfn_t gfn)
return memslot - slots->memslots;
}
-static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
- gfn_t *nr_pages)
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
+ struct kvm_memory_slot *slot;
+
+ gfn = unalias_gfn_instantiation(kvm, gfn);
+ slot = gfn_to_memslot_unaliased(kvm, gfn);
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
return bad_hva();
-
- if (nr_pages)
- *nr_pages = slot->npages - (gfn - slot->base_gfn);
-
- return gfn_to_hva_memslot(slot, gfn);
-}
-
-unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
-{
- return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
+ return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);
-static pfn_t get_fault_pfn(void)
-{
- get_page(fault_page);
- return fault_pfn;
-}
-
-static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
- bool *async, bool write_fault, bool *writable)
+static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
{
struct page *page[1];
- int npages = 0;
+ int npages;
pfn_t pfn;
- /* we can do it either atomically or asynchronously, not both */
- BUG_ON(atomic && async);
-
- BUG_ON(!write_fault && !writable);
-
- if (writable)
- *writable = true;
+ might_sleep();
- if (atomic || async)
- npages = kvm___get_user_pages_fast(addr, 1, 1, page);
-
- if (unlikely(npages != 1) && !atomic) {
- might_sleep();
-
- if (writable)
- *writable = write_fault;
-
- npages = get_user_pages_fast(addr, 1, write_fault, page);
-
- /* map read fault as writable if possible */
- if (unlikely(!write_fault) && npages == 1) {
- struct page *wpage[1];
-
- npages = kvm___get_user_pages_fast(addr, 1, 1, wpage);
- if (npages == 1) {
- *writable = true;
- put_page(page[0]);
- page[0] = wpage[0];
- }
- npages = 1;
- }
- }
+ npages = get_user_pages_fast(addr, 1, 1, page);
if (unlikely(npages != 1)) {
struct vm_area_struct *vma;
- if (atomic)
- return get_fault_pfn();
-
down_read(&current->mm->mmap_sem);
- if (is_hwpoison_address(addr)) {
+ vma = find_vma(current->mm, addr);
+
+ if (vma == NULL || addr < vma->vm_start ||
+ !(vma->vm_flags & VM_PFNMAP)) {
up_read(&current->mm->mmap_sem);
- get_page(hwpoison_page);
- return page_to_pfn(hwpoison_page);
+ get_page(bad_page);
+ return page_to_pfn(bad_page);
}
- vma = find_vma_intersection(current->mm, addr, addr+1);
-
- if (vma == NULL)
- pfn = get_fault_pfn();
- else if ((vma->vm_flags & VM_PFNMAP)) {
- pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
- vma->vm_pgoff;
- BUG_ON(!kvm_is_mmio_pfn(pfn));
- } else {
- if (async && (vma->vm_flags & VM_WRITE))
- *async = true;
- pfn = get_fault_pfn();
- }
+ pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
up_read(&current->mm->mmap_sem);
+ BUG_ON(!kvm_is_mmio_pfn(pfn));
} else
pfn = page_to_pfn(page[0]);
return pfn;
}
-pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
-{
- return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
-}
-EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
-
-static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
- bool write_fault, bool *writable)
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
{
unsigned long addr;
- if (async)
- *async = false;
-
addr = gfn_to_hva(kvm, gfn);
if (kvm_is_error_hva(addr)) {
get_page(bad_page);
return page_to_pfn(bad_page);
}
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
- async = NULL;
-#endif
- return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
-}
-
-pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
-{
- return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
-}
-EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
-
-pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
- bool write_fault, bool *writable)
-{
- return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
-}
-EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
-
-pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
-{
- return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
+ return hva_to_pfn(kvm, addr);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn);
-pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
- bool *writable)
+static unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
{
- return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
+ return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
}
-EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn)
{
unsigned long addr = gfn_to_hva_memslot(slot, gfn);
- return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
+ return hva_to_pfn(kvm, addr);
}
-int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
- int nr_pages)
-{
- unsigned long addr;
- gfn_t entry;
-
- addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry);
- if (kvm_is_error_hva(addr))
- return -1;
-
- if (entry < nr_pages)
- return 0;
-
- return kvm___get_user_pages_fast(addr, nr_pages, 1, pages);
-}
-EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
-
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
pfn_t pfn;
@@ -1402,51 +1204,9 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
return 0;
}
-int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
- gpa_t gpa)
-{
- struct kvm_memslots *slots = kvm_memslots(kvm);
- int offset = offset_in_page(gpa);
- gfn_t gfn = gpa >> PAGE_SHIFT;
-
- ghc->gpa = gpa;
- ghc->generation = slots->generation;
- ghc->memslot = __gfn_to_memslot(slots, gfn);
- ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
- if (!kvm_is_error_hva(ghc->hva))
- ghc->hva += offset;
- else
- return -EFAULT;
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
-
-int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
- void *data, unsigned long len)
-{
- struct kvm_memslots *slots = kvm_memslots(kvm);
- int r;
-
- if (slots->generation != ghc->generation)
- kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
-
- if (kvm_is_error_hva(ghc->hva))
- return -EFAULT;
-
- r = copy_to_user((void *)ghc->hva, data, len);
- if (r)
- return -EFAULT;
- mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
-
int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
{
- return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
- offset, len);
+ return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
@@ -1469,24 +1229,24 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
}
EXPORT_SYMBOL_GPL(kvm_clear_guest);
-void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
- gfn_t gfn)
+void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
{
+ struct kvm_memory_slot *memslot;
+
+ gfn = unalias_gfn(kvm, gfn);
+ memslot = gfn_to_memslot_unaliased(kvm, gfn);
if (memslot && memslot->dirty_bitmap) {
unsigned long rel_gfn = gfn - memslot->base_gfn;
+ unsigned long *p = memslot->dirty_bitmap +
+ rel_gfn / BITS_PER_LONG;
+ int offset = rel_gfn % BITS_PER_LONG;
- generic___set_le_bit(rel_gfn, memslot->dirty_bitmap);
+ /* avoid RMW */
+ if (!generic_test_le_bit(offset, p))
+ generic___set_le_bit(offset, p);
}
}
-void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
-{
- struct kvm_memory_slot *memslot;
-
- memslot = gfn_to_memslot(kvm, gfn);
- mark_page_dirty_in_slot(kvm, memslot, gfn);
-}
-
/*
* The vCPU has executed a HLT instruction with in-kernel mode enabled.
*/
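
The reinstated mark_page_dirty() computes the dirty bit position by hand: the frame's offset within the slot picks a word of the dirty bitmap and a bit within that word, and the bit is tested before it is set so an already-dirty page costs no read-modify-write. A small sketch of that indexing, assuming a hypothetical fixed-size slot and ignoring the little-endian bitmap helpers the real code uses:

#include <stdio.h>
#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)
#define SLOT_PAGES    1024UL    /* hypothetical slot size */

static unsigned long dirty_bitmap[SLOT_PAGES / BITS_PER_LONG];

/* Mirrors the hunk above: rel_gfn selects the word and the bit offset. */
static void mark_dirty(unsigned long base_gfn, unsigned long gfn)
{
        unsigned long rel_gfn = gfn - base_gfn;
        unsigned long *p = dirty_bitmap + rel_gfn / BITS_PER_LONG;
        unsigned long mask = 1UL << (rel_gfn % BITS_PER_LONG);

        if (!(*p & mask))       /* avoid RMW when the bit is already set */
                *p |= mask;
}

int main(void)
{
        mark_dirty(0x1000, 0x1042);
        printf("word 1 = 0x%lx\n", dirty_bitmap[1]);    /* bit 2 set */
        return 0;
}
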
@@ -1498,7 +1258,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
if (kvm_arch_vcpu_runnable(vcpu)) {
- kvm_make_request(KVM_REQ_UNHALT, vcpu);
+ set_bit(KVM_REQ_UNHALT, &vcpu->requests);
break;
}
if (kvm_cpu_has_pending_timer(vcpu))
@@ -1558,7 +1318,7 @@ static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
static struct vm_operations_struct kvm_vcpu_vm_ops = {
- .fault = kvm_vcpu_fault,
+ .VMA_OPS_FAULT(fault) = VMA_OPS_FAULT_FUNC(kvm_vcpu_fault),
};
static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1580,9 +1340,6 @@ static struct file_operations kvm_vcpu_fops = {
.unlocked_ioctl = kvm_vcpu_ioctl,
.compat_ioctl = kvm_vcpu_ioctl,
.mmap = kvm_vcpu_mmap,
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35)
- .llseek = noop_llseek,
-#endif
};
/*
@@ -1590,7 +1347,7 @@ static struct file_operations kvm_vcpu_fops = {
*/
static int create_vcpu_fd(struct kvm_vcpu *vcpu)
{
- return kvm_anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
+ return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
}
/*
@@ -1672,25 +1429,12 @@ static long kvm_vcpu_ioctl(struct file *filp,
if (vcpu->kvm->mm != current->mm)
return -EIO;
-
-#if defined(CONFIG_S390) || defined(CONFIG_PPC)
- /*
- * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
- * so vcpu_load() would break it.
- */
- if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
- return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
-#endif
-
-
- vcpu_load(vcpu);
switch (ioctl) {
case KVM_RUN:
r = -EINVAL;
if (arg)
goto out;
r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
- trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
break;
case KVM_GET_REGS: {
struct kvm_regs *kvm_regs;
@@ -1827,7 +1571,7 @@ out_free2:
goto out;
p = &sigset;
}
- r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
+ r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
break;
}
case KVM_GET_FPU: {
@@ -1862,7 +1606,6 @@ out_free2:
r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
}
out:
- vcpu_put(vcpu);
kfree(fpu);
kfree(kvm_sregs);
return r;
@@ -1913,6 +1656,7 @@ static long kvm_vm_ioctl(struct file *filp,
r = -EFAULT;
if (copy_from_user(&zone, argp, sizeof zone))
goto out;
+ r = -ENXIO;
r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
if (r)
goto out;
@@ -1924,6 +1668,7 @@ static long kvm_vm_ioctl(struct file *filp,
r = -EFAULT;
if (copy_from_user(&zone, argp, sizeof zone))
goto out;
+ r = -ENXIO;
r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
if (r)
goto out;
@@ -2041,7 +1786,7 @@ static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
static struct vm_operations_struct kvm_vm_vm_ops = {
- .fault = kvm_vm_fault,
+ .VMA_OPS_FAULT(fault) = VMA_OPS_FAULT_FUNC(kvm_vm_fault),
};
static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
@@ -2057,31 +1802,21 @@ static struct file_operations kvm_vm_fops = {
.compat_ioctl = kvm_vm_compat_ioctl,
#endif
.mmap = kvm_vm_mmap,
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35)
- .llseek = noop_llseek,
-#endif
};
static int kvm_dev_ioctl_create_vm(void)
{
- int r;
+ int fd;
struct kvm *kvm;
kvm = kvm_create_vm();
if (IS_ERR(kvm))
return PTR_ERR(kvm);
-#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
- r = kvm_coalesced_mmio_init(kvm);
- if (r < 0) {
- kvm_put_kvm(kvm);
- return r;
- }
-#endif
- r = kvm_anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
- if (r < 0)
+ fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
+ if (fd < 0)
kvm_put_kvm(kvm);
- return r;
+ return fd;
}
static long kvm_dev_ioctl_check_extension_generic(long arg)
@@ -2153,9 +1888,6 @@ out:
static struct file_operations kvm_chardev_ops = {
.unlocked_ioctl = kvm_dev_ioctl,
.compat_ioctl = kvm_dev_ioctl,
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35)
- .llseek = noop_llseek,
-#endif
};
static struct miscdevice kvm_dev = {
@@ -2164,7 +1896,7 @@ static struct miscdevice kvm_dev = {
&kvm_chardev_ops,
};
-static void hardware_enable_nolock(void *junk)
+static void hardware_enable(void *junk)
{
int cpu = raw_smp_processor_id();
int r;
@@ -2184,14 +1916,7 @@ static void hardware_enable_nolock(void *junk)
}
}
-static void hardware_enable(void *junk)
-{
- spin_lock(&kvm_lock);
- hardware_enable_nolock(junk);
- spin_unlock(&kvm_lock);
-}
-
-static void hardware_disable_nolock(void *junk)
+static void hardware_disable(void *junk)
{
int cpu = raw_smp_processor_id();
@@ -2201,20 +1926,13 @@ static void hardware_disable_nolock(void *junk)
kvm_arch_hardware_disable(NULL);
}
-static void hardware_disable(void *junk)
-{
- spin_lock(&kvm_lock);
- hardware_disable_nolock(junk);
- spin_unlock(&kvm_lock);
-}
-
static void hardware_disable_all_nolock(void)
{
BUG_ON(!kvm_usage_count);
kvm_usage_count--;
if (!kvm_usage_count)
- kvm_on_each_cpu(hardware_disable_nolock, NULL, 1);
+ kvm_on_each_cpu(hardware_disable, NULL, 1);
}
static void hardware_disable_all(void)
@@ -2233,7 +1951,7 @@ static int hardware_enable_all(void)
kvm_usage_count++;
if (kvm_usage_count == 1) {
atomic_set(&hardware_enable_failed, 0);
- kvm_on_each_cpu(hardware_enable_nolock, NULL, 1);
+ kvm_on_each_cpu(hardware_enable, NULL, 1);
if (atomic_read(&hardware_enable_failed)) {
hardware_disable_all_nolock();
@@ -2261,30 +1979,31 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
cpu);
hardware_disable(NULL);
break;
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,28)
- case CPU_STARTING:
-#else
+ case CPU_UP_CANCELED:
+ printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
+ cpu);
+ smp_call_function_single(cpu, hardware_disable, NULL, 1);
+ break;
case CPU_ONLINE:
-#endif
printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
cpu);
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,28)
- hardware_enable(NULL);
-#else
smp_call_function_single(cpu, hardware_enable, NULL, 1);
-#endif
break;
}
return NOTIFY_OK;
}
-asmlinkage void kvm_spurious_fault(void)
+asmlinkage void kvm_handle_fault_on_reboot(void)
{
+ if (kvm_rebooting)
+ /* spin while reset goes on */
+ while (true)
+ ;
/* Fault while not rebooting. We want the trace. */
BUG();
}
-EXPORT_SYMBOL_GPL(kvm_spurious_fault);
+EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot);
static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
void *v)
@@ -2297,7 +2016,7 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
*/
printk(KERN_INFO "kvm: exiting hardware virtualization\n");
kvm_rebooting = true;
- kvm_on_each_cpu(hardware_disable_nolock, NULL, 1);
+ kvm_on_each_cpu(hardware_disable, NULL, 1);
return NOTIFY_OK;
}
@@ -2323,9 +2042,7 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
int len, const void *val)
{
int i;
- struct kvm_io_bus *bus;
-
- bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+ struct kvm_io_bus *bus = rcu_dereference(kvm->buses[bus_idx]);
for (i = 0; i < bus->dev_count; i++)
if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
return 0;
@@ -2337,9 +2054,8 @@ int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
int len, void *val)
{
int i;
- struct kvm_io_bus *bus;
+ struct kvm_io_bus *bus = rcu_dereference(kvm->buses[bus_idx]);
- bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
for (i = 0; i < bus->dev_count; i++)
if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
return 0;
@@ -2403,6 +2119,7 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
static struct notifier_block kvm_cpu_notifier = {
.notifier_call = kvm_cpu_hotplug,
+ .priority = 20, /* must be > scheduler priority */
};
static int __vm_stat_get(void *_offset, u64 *val)
@@ -2469,16 +2186,14 @@ static void kvm_exit_debug(void)
static int kvm_suspend(struct sys_device *dev, pm_message_t state)
{
if (kvm_usage_count)
- hardware_disable_nolock(NULL);
+ hardware_disable(NULL);
return 0;
}
static int kvm_resume(struct sys_device *dev)
{
- if (kvm_usage_count) {
- WARN_ON(spin_is_locked(&kvm_lock));
- hardware_enable_nolock(NULL);
- }
+ if (kvm_usage_count)
+ hardware_enable(NULL);
return 0;
}
@@ -2518,17 +2233,22 @@ static void kvm_sched_out(struct preempt_notifier *pn,
kvm_fire_urn();
}
-int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
+int kvm_init(void *opaque, unsigned int vcpu_size,
struct module *module)
{
int r;
int cpu;
- r = kvm_init_srcu();
+ r = kvm_init_anon_inodes();
if (r)
return r;
+ r = kvm_init_srcu();
+ if (r)
+ goto cleanup_anon_inodes;
+
preempt_notifier_sys_init();
+ hrtimer_kallsyms_resolve();
r = kvm_arch_init(opaque);
if (r)
@@ -2543,24 +2263,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
bad_pfn = page_to_pfn(bad_page);
- hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
- if (hwpoison_page == NULL) {
- r = -ENOMEM;
- goto out_free_0;
- }
-
- hwpoison_pfn = page_to_pfn(hwpoison_page);
-
- fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
- if (fault_page == NULL) {
- r = -ENOMEM;
- goto out_free_0;
- }
-
- fault_pfn = page_to_pfn(fault_page);
-
if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
r = -ENOMEM;
goto out_free_0;
@@ -2592,19 +2294,14 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
goto out_free_4;
/* A kmem cache lets us meet the alignment requirements of fx_save. */
- if (!vcpu_align)
- vcpu_align = __alignof__(struct kvm_vcpu);
- kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
+ kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
+ __alignof__(struct kvm_vcpu),
0, NULL);
if (!kvm_vcpu_cache) {
r = -ENOMEM;
goto out_free_5;
}
- r = kvm_async_pf_init();
- if (r)
- goto out_free;
-
kvm_chardev_ops.owner = module;
IF_ANON_INODES_DOES_REFCOUNTS( kvm_vm_fops.owner = module;)
IF_ANON_INODES_DOES_REFCOUNTS( kvm_vcpu_fops.owner = module;)
@@ -2612,7 +2309,7 @@ IF_ANON_INODES_DOES_REFCOUNTS( kvm_vcpu_fops.owner = module;)
r = misc_register(&kvm_dev);
if (r) {
printk(KERN_ERR "kvm: misc device register failed\n");
- goto out_unreg;
+ goto out_free;
}
kvm_preempt_ops.sched_in = kvm_sched_in;
@@ -2620,14 +2317,12 @@ IF_ANON_INODES_DOES_REFCOUNTS( kvm_vcpu_fops.owner = module;)
kvm_init_debug();
- printk("loaded kvm module (kvm-kmod-2.6.38-rc7)\n");
+ printk("loaded kvm module (kvm-kmod-2.6.34)\n");
kvm_clock_warn_suspend_bug();
return 0;
-out_unreg:
- kvm_async_pf_deinit();
out_free:
kmem_cache_destroy(kvm_vcpu_cache);
out_free_5:
@@ -2643,37 +2338,35 @@ out_free_1:
out_free_0a:
free_cpumask_var(cpus_hardware_enabled);
out_free_0:
- if (fault_page)
- __free_page(fault_page);
- if (hwpoison_page)
- __free_page(hwpoison_page);
__free_page(bad_page);
out:
kvm_arch_exit();
out_fail:
preempt_notifier_sys_exit();
kvm_exit_srcu();
+cleanup_anon_inodes:
+ kvm_exit_anon_inodes();
return r;
}
EXPORT_SYMBOL_GPL(kvm_init);
void kvm_exit(void)
{
+ tracepoint_synchronize_unregister();
kvm_exit_debug();
misc_deregister(&kvm_dev);
kmem_cache_destroy(kvm_vcpu_cache);
- kvm_async_pf_deinit();
sysdev_unregister(&kvm_sysdev);
sysdev_class_unregister(&kvm_sysdev_class);
unregister_reboot_notifier(&kvm_reboot_notifier);
unregister_cpu_notifier(&kvm_cpu_notifier);
- kvm_on_each_cpu(hardware_disable_nolock, NULL, 1);
+ kvm_on_each_cpu(hardware_disable, NULL, 1);
kvm_arch_hardware_unsetup();
kvm_arch_exit();
free_cpumask_var(cpus_hardware_enabled);
- __free_page(hwpoison_page);
__free_page(bad_page);
preempt_notifier_sys_exit();
kvm_exit_srcu();
+ kvm_exit_anon_inodes();
}
EXPORT_SYMBOL_GPL(kvm_exit);
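
The hunks above fold hardware_enable_nolock()/hardware_disable_nolock() back into hardware_enable()/hardware_disable(), but the usage counting is unchanged: virtualization is switched on across all CPUs only when the first VM appears and switched off again when the last one goes away. A minimal sketch of that refcounting pattern, with hypothetical print stubs standing in for the kvm_on_each_cpu() calls and with the locking and failure handling of the real code omitted:

#include <stdio.h>

static int kvm_usage_count;             /* number of live VMs */

static void enable_all_cpus(void)  { printf("VMX/SVM enabled on all CPUs\n"); }
static void disable_all_cpus(void) { printf("VMX/SVM disabled on all CPUs\n"); }

/* Called when a VM is created: enable only on the 0 -> 1 transition. */
static int hardware_enable_all(void)
{
        if (kvm_usage_count++ == 0)
                enable_all_cpus();
        return 0;
}

/* Called when a VM is destroyed: disable only on the 1 -> 0 transition. */
static void hardware_disable_all(void)
{
        if (--kvm_usage_count == 0)
                disable_all_cpus();
}

int main(void)
{
        hardware_enable_all();   /* first VM: enables */
        hardware_enable_all();   /* second VM: no-op */
        hardware_disable_all();  /* one VM still running */
        hardware_disable_all();  /* last VM gone: disables */
        return 0;
}
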
diff --git a/linux/x86/kvm_timer.h b/linux/x86/kvm_timer.h
index 64bc6ea..55c7524 100644
--- a/linux/x86/kvm_timer.h
+++ b/linux/x86/kvm_timer.h
@@ -10,7 +10,9 @@ struct kvm_timer {
};
struct kvm_timer_ops {
- bool (*is_periodic)(struct kvm_timer *);
+ bool (*is_periodic)(struct kvm_timer *);
};
+
enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
+
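
The kvm_timer.h change is whitespace only, but the ops table it declares is what the shared hrtimer callback uses to decide whether to re-arm: kvm_timer_fn() can ask is_periodic() and restart the timer only for periodic modes. A rough sketch of that kind of dispatch, with hypothetical stand-in types in place of struct hrtimer and the real struct kvm_timer:

#include <stdbool.h>
#include <stdio.h>

struct timer;                           /* hypothetical stand-in */
struct timer_ops { bool (*is_periodic)(struct timer *); };
struct timer { const struct timer_ops *t_ops; int period_ms; };

enum restart { NORESTART, RESTART };

/* Callback in the style of kvm_timer_fn(): re-arm only periodic timers. */
static enum restart timer_fn(struct timer *t)
{
        printf("timer fired\n");
        return t->t_ops->is_periodic(t) ? RESTART : NORESTART;
}

static bool lapic_is_periodic(struct timer *t) { return t->period_ms != 0; }
static const struct timer_ops lapic_ops = { .is_periodic = lapic_is_periodic };

int main(void)
{
        struct timer t = { .t_ops = &lapic_ops, .period_ms = 10 };
        printf("restart = %d\n", timer_fn(&t));
        return 0;
}
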
diff --git a/linux/x86/lapic.c b/linux/x86/lapic.c
index d3ddaa4..890f40f 100644
--- a/linux/x86/lapic.c
+++ b/linux/x86/lapic.c
@@ -45,7 +45,6 @@
* Copyright (C) 2006 Qumranet, Inc.
* Copyright (C) 2007 Novell
* Copyright (C) 2007 Intel
- * Copyright 2009 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Dor Laor <dor.laor@qumranet.com>
@@ -299,10 +298,9 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic)
static void apic_update_ppr(struct kvm_lapic *apic)
{
- u32 tpr, isrv, ppr, old_ppr;
+ u32 tpr, isrv, ppr;
int isr;
- old_ppr = apic_get_reg(apic, APIC_PROCPRI);
tpr = apic_get_reg(apic, APIC_TASKPRI);
isr = apic_find_highest_isr(apic);
isrv = (isr != -1) ? isr : 0;
@@ -315,11 +313,7 @@ static void apic_update_ppr(struct kvm_lapic *apic)
apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
apic, ppr, isr, isrv);
- if (old_ppr != ppr) {
- apic_set_reg(apic, APIC_PROCPRI, ppr);
- if (ppr < old_ppr)
- kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
- }
+ apic_set_reg(apic, APIC_PROCPRI, ppr);
}
static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
@@ -374,7 +368,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
"dest_mode 0x%x, short_hand 0x%x\n",
target, source, dest, dest_mode, short_hand);
- ASSERT(target);
+ ASSERT(!target);
switch (short_hand) {
case APIC_DEST_NOSHORT:
if (dest_mode == 0)
@@ -436,7 +430,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
break;
}
- kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
break;
@@ -462,7 +455,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
"INIT on a runnable vcpu %d\n",
vcpu->vcpu_id);
vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
} else {
apic_debug("Ignoring de-assert INIT to vcpu %d\n",
@@ -477,7 +469,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
result = 1;
vcpu->arch.sipi_vector = vector;
vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
}
break;
@@ -523,7 +514,6 @@ static void apic_set_eoi(struct kvm_lapic *apic)
trigger_mode = IOAPIC_EDGE_TRIG;
if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
- kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
}
static void apic_send_ipi(struct kvm_lapic *apic)
@@ -583,7 +573,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
struct kvm_vcpu *vcpu = apic->vcpu;
struct kvm_run *run = vcpu->run;
- kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu);
+ set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
run->tpr_access.rip = kvm_rip_read(vcpu);
run->tpr_access.is_write = write;
}
@@ -726,7 +716,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
apic->lapic_timer.period = NSEC_PER_MSEC/2;
}
- hrtimer_start(&apic->lapic_timer.timer,
+ hrtimer_start_p(&apic->lapic_timer.timer,
ktime_add_ns(now, apic->lapic_timer.period),
HRTIMER_MODE_ABS);
@@ -841,7 +831,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
break;
case APIC_TMICT:
- hrtimer_cancel(&apic->lapic_timer.timer);
+ hrtimer_cancel_p(&apic->lapic_timer.timer);
apic_set_reg(apic, APIC_TMICT, val);
start_apic_timer(apic);
break;
@@ -913,7 +903,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
if (!vcpu->arch.apic)
return;
- hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
+ hrtimer_cancel_p(&vcpu->arch.apic->lapic_timer.timer);
if (vcpu->arch.apic->regs_page)
__free_page(vcpu->arch.apic->regs_page);
@@ -989,7 +979,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
ASSERT(apic != NULL);
/* Stop the timer in case it's a reset to an active apic */
- hrtimer_cancel(&apic->lapic_timer.timer);
+ hrtimer_cancel_p(&apic->lapic_timer.timer);
apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
kvm_apic_set_version(apic->vcpu);
@@ -1105,16 +1095,17 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
vcpu->arch.apic = apic;
- apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
+ apic->regs_page = alloc_page(GFP_KERNEL);
if (apic->regs_page == NULL) {
printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
vcpu->vcpu_id);
goto nomem_free_apic;
}
apic->regs = page_address(apic->regs_page);
+ memset(apic->regs, 0, PAGE_SIZE);
apic->vcpu = vcpu;
- hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
+ hrtimer_init_p(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
HRTIMER_MODE_ABS);
apic->lapic_timer.timer.function = kvm_timer_fn;
apic->lapic_timer.t_ops = &lapic_timer_ops;
@@ -1155,11 +1146,13 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
int r = 0;
- if (!apic_hw_enabled(vcpu->arch.apic))
- r = 1;
- if ((lvt0 & APIC_LVT_MASKED) == 0 &&
- GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
- r = 1;
+ if (kvm_vcpu_is_bsp(vcpu)) {
+ if (!apic_hw_enabled(vcpu->arch.apic))
+ r = 1;
+ if ((lvt0 & APIC_LVT_MASKED) == 0 &&
+ GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
+ r = 1;
+ }
return r;
}
@@ -1196,11 +1189,10 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
kvm_apic_set_version(vcpu);
apic_update_ppr(apic);
- hrtimer_cancel(&apic->lapic_timer.timer);
+ hrtimer_cancel_p(&apic->lapic_timer.timer);
update_divide_count(apic);
start_apic_timer(apic);
apic->irr_pending = true;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
}
void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
@@ -1212,7 +1204,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
return;
timer = &apic->lapic_timer.timer;
- if (hrtimer_cancel(timer))
+ if (hrtimer_cancel_p(timer))
kvm_hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
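
The kvm_apic_accept_pic_intr() hunk above goes back to gating PIC injection on the bootstrap processor: only the BSP accepts an ExtINT, and then only if its local APIC is hardware-disabled or LVT0 is unmasked and programmed for ExtINT delivery. A compact sketch of that predicate, with hypothetical constants standing in for the APIC_LVT_MASKED and delivery-mode macros:

#include <stdbool.h>
#include <stdio.h>

#define LVT0_MASKED     (1u << 16)      /* hypothetical stand-in for APIC_LVT_MASKED */
#define MODE_EXTINT     0x7             /* hypothetical stand-in for APIC_MODE_EXTINT */

struct vcpu {
        bool is_bsp;
        bool apic_hw_enabled;
        unsigned int lvt0;
};

static unsigned int delivery_mode(unsigned int lvt0) { return (lvt0 >> 8) & 0x7; }

/* Mirrors the reverted logic: only the BSP ever takes PIC interrupts. */
static bool accept_pic_intr(const struct vcpu *v)
{
        if (!v->is_bsp)
                return false;
        if (!v->apic_hw_enabled)
                return true;
        return !(v->lvt0 & LVT0_MASKED) && delivery_mode(v->lvt0) == MODE_EXTINT;
}

int main(void)
{
        struct vcpu bsp = { .is_bsp = true, .apic_hw_enabled = true,
                            .lvt0 = MODE_EXTINT << 8 };
        struct vcpu ap  = { .is_bsp = false };

        printf("BSP accepts: %d, AP accepts: %d\n",
               accept_pic_intr(&bsp), accept_pic_intr(&ap));
        return 0;
}
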
diff --git a/linux/x86/mmu.c b/linux/x86/mmu.c
index 9713914..a1d4d0c 100644
--- a/linux/x86/mmu.c
+++ b/linux/x86/mmu.c
@@ -47,7 +47,6 @@
* MMU support
*
* Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Yaniv Kamay <yaniv@qumranet.com>
@@ -58,11 +57,9 @@
*
*/
-#include "irq.h"
#include "mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
-#include "x86.h"
#include <linux/kvm_host.h>
#include <asm/types.h>
@@ -75,7 +72,6 @@
#include <linux/srcu.h>
#include <linux/slab.h>
-#include <linux/uaccess.h>
#include <asm/page.h>
#include <asm/cmpxchg.h>
@@ -91,25 +87,15 @@
*/
bool tdp_enabled = false;
-enum {
- AUDIT_PRE_PAGE_FAULT,
- AUDIT_POST_PAGE_FAULT,
- AUDIT_PRE_PTE_WRITE,
- AUDIT_POST_PTE_WRITE,
- AUDIT_PRE_SYNC,
- AUDIT_POST_SYNC
-};
+#undef MMU_DEBUG
-char *audit_point_name[] = {
- "pre page fault",
- "post page fault",
- "pre pte write",
- "post pte write",
- "pre sync",
- "post sync"
-};
+#undef AUDIT
-#undef MMU_DEBUG
+#ifdef AUDIT
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
+#else
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
+#endif
#ifdef MMU_DEBUG
@@ -123,7 +109,7 @@ char *audit_point_name[] = {
#endif
-#ifdef MMU_DEBUG
+#if defined(MMU_DEBUG) || defined(AUDIT)
static int dbg = 0;
module_param(dbg, bool, 0644);
#endif
@@ -141,11 +127,11 @@ module_param(oos_shadow, bool, 0644);
}
#endif
-#define PTE_PREFETCH_NUM 8
-
#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52
+#define VALID_PAGE(x) ((x) != INVALID_PAGE)
+
#define PT64_LEVEL_BITS 9
#define PT64_LEVEL_SHIFT(level) \
@@ -202,6 +188,7 @@ module_param(oos_shadow, bool, 0644);
#include <trace/events/kvm.h>
+#undef TRACE_INCLUDE_FILE
#define CREATE_TRACE_POINTS
#include "mmutrace.h"
@@ -227,15 +214,20 @@ struct kvm_shadow_walk_iterator {
shadow_walk_okay(&(_walker)); \
shadow_walk_next(&(_walker)))
-typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
+
+struct kvm_unsync_walk {
+ int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
+};
+
+typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
-static struct percpu_counter kvm_total_used_mmu_pages;
static u64 __read_mostly shadow_trap_nonpresent_pte;
static u64 __read_mostly shadow_notrap_nonpresent_pte;
+static u64 __read_mostly shadow_base_present_pte;
static u64 __read_mostly shadow_nx_mask;
static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
@@ -254,6 +246,12 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
+void kvm_mmu_set_base_ptes(u64 base_pte)
+{
+ shadow_base_present_pte = base_pte;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
+
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask)
{
@@ -265,7 +263,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
-static bool is_write_protection(struct kvm_vcpu *vcpu)
+static int is_write_protection(struct kvm_vcpu *vcpu)
{
return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
}
@@ -329,70 +327,13 @@ static gfn_t pse36_gfn_delta(u32 gpte)
static void __set_spte(u64 *sptep, u64 spte)
{
- kvm_set_64bit(sptep, spte);
-}
-
-static u64 __xchg_spte(u64 *sptep, u64 new_spte)
-{
#ifdef CONFIG_X86_64
- return xchg(sptep, new_spte);
+ set_64bit((unsigned long *)sptep, spte);
#else
- u64 old_spte;
-
- do {
- old_spte = *sptep;
- } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
-
- return old_spte;
+ set_64bit((unsigned long long *)sptep, spte);
#endif
}
-static bool spte_has_volatile_bits(u64 spte)
-{
- if (!shadow_accessed_mask)
- return false;
-
- if (!is_shadow_present_pte(spte))
- return false;
-
- if ((spte & shadow_accessed_mask) &&
- (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
- return false;
-
- return true;
-}
-
-static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
-{
- return (old_spte & bit_mask) && !(new_spte & bit_mask);
-}
-
-static void update_spte(u64 *sptep, u64 new_spte)
-{
- u64 mask, old_spte = *sptep;
-
- WARN_ON(!is_rmap_spte(new_spte));
-
- new_spte |= old_spte & shadow_dirty_mask;
-
- mask = shadow_accessed_mask;
- if (is_writable_pte(old_spte))
- mask |= shadow_dirty_mask;
-
- if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
- __set_spte(sptep, new_spte);
- else
- old_spte = __xchg_spte(sptep, new_spte);
-
- if (!shadow_accessed_mask)
- return;
-
- if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
- kvm_set_pfn_accessed(spte_to_pfn(old_spte));
- if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
- kvm_set_pfn_dirty(spte_to_pfn(old_spte));
-}
-
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
struct kmem_cache *base_cache, int min)
{
@@ -409,11 +350,10 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
return 0;
}
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
- struct kmem_cache *cache)
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
while (mc->nobjs)
- kmem_cache_free(cache, mc->objects[--mc->nobjs]);
+ kfree(mc->objects[--mc->nobjs]);
}
static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
@@ -427,6 +367,7 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
page = alloc_page(GFP_KERNEL);
if (!page)
return -ENOMEM;
+ set_page_private(page, 0);
cache->objects[cache->nobjs++] = page_address(page);
}
return 0;
@@ -447,7 +388,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
if (r)
goto out;
r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
- rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
+ rmap_desc_cache, 4);
if (r)
goto out;
r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -461,11 +402,10 @@ out:
static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
- mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
- mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
- mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
- mmu_page_header_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
}
static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
@@ -486,7 +426,7 @@ static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
{
- kmem_cache_free(pte_chain_cache, pc);
+ kfree(pc);
}
static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
@@ -497,66 +437,53 @@ static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
{
- kmem_cache_free(rmap_desc_cache, rd);
-}
-
-static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
-{
- if (!sp->role.direct)
- return sp->gfns[index];
-
- return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
-}
-
-static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
-{
- if (sp->role.direct)
- BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
- else
- sp->gfns[index] = gfn;
+ kfree(rd);
}
/*
- * Return the pointer to the large page information for a given gfn,
- * handling slots that are not large page aligned.
+ * Return the pointer to the largepage write count for a given
+ * gfn, handling slots that are not large page aligned.
*/
-static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
- struct kvm_memory_slot *slot,
- int level)
+static int *slot_largepage_idx(gfn_t gfn,
+ struct kvm_memory_slot *slot,
+ int level)
{
unsigned long idx;
- idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
- (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
- return &slot->lpage_info[level - 2][idx];
+ idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
+ (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
+ return &slot->lpage_info[level - 2][idx].write_count;
}
static void account_shadowed(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *slot;
- struct kvm_lpage_info *linfo;
+ int *write_count;
int i;
- slot = gfn_to_memslot(kvm, gfn);
+ gfn = unalias_gfn(kvm, gfn);
+
+ slot = gfn_to_memslot_unaliased(kvm, gfn);
for (i = PT_DIRECTORY_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- linfo = lpage_info_slot(gfn, slot, i);
- linfo->write_count += 1;
+ write_count = slot_largepage_idx(gfn, slot, i);
+ *write_count += 1;
}
}
static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *slot;
- struct kvm_lpage_info *linfo;
+ int *write_count;
int i;
- slot = gfn_to_memslot(kvm, gfn);
+ gfn = unalias_gfn(kvm, gfn);
for (i = PT_DIRECTORY_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- linfo = lpage_info_slot(gfn, slot, i);
- linfo->write_count -= 1;
- WARN_ON(linfo->write_count < 0);
+ slot = gfn_to_memslot_unaliased(kvm, gfn);
+ write_count = slot_largepage_idx(gfn, slot, i);
+ *write_count -= 1;
+ WARN_ON(*write_count < 0);
}
}
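
account_shadowed() and unaccount_shadowed() above index the per-slot large-page write counters by huge-page frame: both the gfn and the slot base are divided by the number of small pages per huge page at the given level, and the difference selects the counter. A small sketch of that index calculation, assuming the usual x86 factor of 512 per level as a hypothetical KVM_PAGES_PER_HPAGE():

#include <stdio.h>

/* Small (4 KiB) pages per huge page at a given level: 512^(level-1). */
static unsigned long pages_per_hpage(int level)
{
        unsigned long n = 1;
        while (--level > 0)
                n *= 512;
        return n;
}

/* idx = gfn/KVM_PAGES_PER_HPAGE(level) - base_gfn/KVM_PAGES_PER_HPAGE(level) */
static unsigned long lpage_idx(unsigned long base_gfn, unsigned long gfn, int level)
{
        return gfn / pages_per_hpage(level) - base_gfn / pages_per_hpage(level);
}

int main(void)
{
        /* A slot starting at gfn 0x100000; gfn 0x100400 lands in 2 MiB entry 2. */
        printf("level 2 idx = %lu\n", lpage_idx(0x100000, 0x100400, 2));
        printf("level 3 idx = %lu\n", lpage_idx(0x100000, 0x100400, 3));
        return 0;
}
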
@@ -565,12 +492,13 @@ static int has_wrprotected_page(struct kvm *kvm,
int level)
{
struct kvm_memory_slot *slot;
- struct kvm_lpage_info *linfo;
+ int *largepage_idx;
- slot = gfn_to_memslot(kvm, gfn);
+ gfn = unalias_gfn(kvm, gfn);
+ slot = gfn_to_memslot_unaliased(kvm, gfn);
if (slot) {
- linfo = lpage_info_slot(gfn, slot, level);
- return linfo->write_count;
+ largepage_idx = slot_largepage_idx(gfn, slot, level);
+ return *largepage_idx;
}
return 1;
@@ -594,18 +522,14 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
return ret;
}
-static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
struct kvm_memory_slot *slot;
+ int host_level, level, max_level;
+
slot = gfn_to_memslot(vcpu->kvm, large_gfn);
if (slot && slot->dirty_bitmap)
- return true;
- return false;
-}
-
-static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
-{
- int host_level, level, max_level;
+ return PT_PAGE_TABLE_LEVEL;
host_level = host_mapping_level(vcpu->kvm, large_gfn);
@@ -624,20 +548,22 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
/*
* Take gfn and return the reverse mapping to it.
+ * Note: gfn must be unaliased before this function gets called
*/
static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
{
struct kvm_memory_slot *slot;
- struct kvm_lpage_info *linfo;
+ unsigned long idx;
slot = gfn_to_memslot(kvm, gfn);
if (likely(level == PT_PAGE_TABLE_LEVEL))
return &slot->rmap[gfn - slot->base_gfn];
- linfo = lpage_info_slot(gfn, slot, level);
+ idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
+ (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
- return &linfo->rmap_pde;
+ return &slot->lpage_info[level - 2][idx].rmap_pde;
}
/*
@@ -662,8 +588,9 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
if (!is_rmap_spte(*spte))
return count;
+ gfn = unalias_gfn(vcpu->kvm, gfn);
sp = page_header(__pa(spte));
- kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
+ sp->gfns[spte - sp->spt] = gfn;
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
if (!*rmapp) {
rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
@@ -674,7 +601,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
desc->sptes[0] = (u64 *)*rmapp;
desc->sptes[1] = spte;
*rmapp = (unsigned long)desc | 1;
- ++count;
} else {
rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
@@ -687,7 +613,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
desc = desc->more;
}
for (i = 0; desc->sptes[i]; ++i)
- ++count;
+ ;
desc->sptes[i] = spte;
}
return count;
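
rmap_add() above keeps each reverse-map slot as a tagged value: 0 means no mapping, a bare pointer means exactly one spte, and once a second mapping arrives the slot holds a descriptor address with bit 0 set. A minimal sketch of that encoding, using a single hypothetical fixed-size descriptor instead of the kernel's chained kvm_rmap_desc list:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define RMAP_EXT 4

/* Hypothetical descriptor; the kernel chains these via a 'more' pointer. */
struct rmap_desc { uint64_t *sptes[RMAP_EXT]; };

/* Add one spte to a reverse-map slot using the low-bit tag. */
static void rmap_add(unsigned long *rmapp, uint64_t *spte)
{
        if (!*rmapp) {                          /* 0 -> 1: store the spte itself */
                *rmapp = (unsigned long)spte;
        } else if (!(*rmapp & 1)) {             /* 1 -> many: switch to a descriptor */
                struct rmap_desc *desc = calloc(1, sizeof(*desc));
                desc->sptes[0] = (uint64_t *)*rmapp;
                desc->sptes[1] = spte;
                *rmapp = (unsigned long)desc | 1;
        } else {                                /* many -> many: append to descriptor */
                struct rmap_desc *desc = (struct rmap_desc *)(*rmapp & ~1ul);
                int i;
                for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
                        ;
                if (i < RMAP_EXT)
                        desc->sptes[i] = spte;  /* chaining omitted in this sketch */
        }
}

int main(void)
{
        unsigned long rmap = 0;
        uint64_t a = 1, b = 2, c = 3;

        rmap_add(&rmap, &a);
        rmap_add(&rmap, &b);
        rmap_add(&rmap, &c);
        printf("rmap tagged: %lu\n", rmap & 1);         /* prints 1 */
        return 0;
}
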
@@ -721,25 +647,32 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
struct kvm_rmap_desc *desc;
struct kvm_rmap_desc *prev_desc;
struct kvm_mmu_page *sp;
- gfn_t gfn;
+ pfn_t pfn;
unsigned long *rmapp;
int i;
+ if (!is_rmap_spte(*spte))
+ return;
sp = page_header(__pa(spte));
- gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
- rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
+ pfn = spte_to_pfn(*spte);
+ if (*spte & shadow_accessed_mask)
+ kvm_set_pfn_accessed(pfn);
+ if (is_writable_pte(*spte))
+ kvm_set_pfn_dirty(pfn);
+ rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
if (!*rmapp) {
- printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
+ printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
BUG();
} else if (!(*rmapp & 1)) {
- rmap_printk("rmap_remove: %p 1->0\n", spte);
+ rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
if ((u64 *)*rmapp != spte) {
- printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte);
+ printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
+ spte, *spte);
BUG();
}
*rmapp = 0;
} else {
- rmap_printk("rmap_remove: %p many->many\n", spte);
+ rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
prev_desc = NULL;
while (desc) {
@@ -753,41 +686,15 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
prev_desc = desc;
desc = desc->more;
}
- pr_err("rmap_remove: %p many->many\n", spte);
+ pr_err("rmap_remove: %p %llx many->many\n", spte, *spte);
BUG();
}
}
-static int set_spte_track_bits(u64 *sptep, u64 new_spte)
-{
- pfn_t pfn;
- u64 old_spte = *sptep;
-
- if (!spte_has_volatile_bits(old_spte))
- __set_spte(sptep, new_spte);
- else
- old_spte = __xchg_spte(sptep, new_spte);
-
- if (!is_rmap_spte(old_spte))
- return 0;
-
- pfn = spte_to_pfn(old_spte);
- if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
- kvm_set_pfn_accessed(pfn);
- if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
- kvm_set_pfn_dirty(pfn);
- return 1;
-}
-
-static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
-{
- if (set_spte_track_bits(sptep, new_spte))
- rmap_remove(kvm, sptep);
-}
-
static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
{
struct kvm_rmap_desc *desc;
+ struct kvm_rmap_desc *prev_desc;
u64 *prev_spte;
int i;
@@ -799,6 +706,7 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
return NULL;
}
desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+ prev_desc = NULL;
prev_spte = NULL;
while (desc) {
for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
@@ -817,6 +725,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
u64 *spte;
int i, write_protected = 0;
+ gfn = unalias_gfn(kvm, gfn);
rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
spte = rmap_next(kvm, rmapp, NULL);
@@ -825,11 +734,18 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
if (is_writable_pte(*spte)) {
- update_spte(spte, *spte & ~PT_WRITABLE_MASK);
+ __set_spte(spte, *spte & ~PT_WRITABLE_MASK);
write_protected = 1;
}
spte = rmap_next(kvm, rmapp, spte);
}
+ if (write_protected) {
+ pfn_t pfn;
+
+ spte = rmap_next(kvm, rmapp, NULL);
+ pfn = spte_to_pfn(*spte);
+ kvm_set_pfn_dirty(pfn);
+ }
/* check for huge page mappings */
for (i = PT_DIRECTORY_LEVEL;
@@ -842,9 +758,9 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
if (is_writable_pte(*spte)) {
- drop_spte(kvm, spte,
- shadow_trap_nonpresent_pte);
+ rmap_remove(kvm, spte);
--kvm->stat.lpages;
+ __set_spte(spte, shadow_trap_nonpresent_pte);
spte = NULL;
write_protected = 1;
}
@@ -864,7 +780,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
while ((spte = rmap_next(kvm, rmapp, NULL))) {
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
- drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+ rmap_remove(kvm, spte);
+ __set_spte(spte, shadow_trap_nonpresent_pte);
need_tlb_flush = 1;
}
return need_tlb_flush;
@@ -886,7 +803,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
need_flush = 1;
if (pte_write(*ptep)) {
- drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+ rmap_remove(kvm, spte);
+ __set_spte(spte, shadow_trap_nonpresent_pte);
spte = rmap_next(kvm, rmapp, NULL);
} else {
new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -894,8 +812,9 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
new_spte &= ~PT_WRITABLE_MASK;
new_spte &= ~SPTE_HOST_WRITEABLE;
- new_spte &= ~shadow_accessed_mask;
- set_spte_track_bits(spte, new_spte);
+ if (is_writable_pte(*spte))
+ kvm_set_pfn_dirty(spte_to_pfn(*spte));
+ __set_spte(spte, new_spte);
spte = rmap_next(kvm, rmapp, spte);
}
}
@@ -915,7 +834,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
int retval = 0;
struct kvm_memslots *slots;
- slots = kvm_memslots(kvm);
+ slots = rcu_dereference(kvm->memslots);
for (i = 0; i < slots->nmemslots; i++) {
struct kvm_memory_slot *memslot = &slots->memslots[i];
@@ -925,16 +844,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
end = start + (memslot->npages << PAGE_SHIFT);
if (hva >= start && hva < end) {
gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
- gfn_t gfn = memslot->base_gfn + gfn_offset;
ret = handler(kvm, &memslot->rmap[gfn_offset], data);
for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
- struct kvm_lpage_info *linfo;
-
- linfo = lpage_info_slot(gfn, memslot,
- PT_DIRECTORY_LEVEL + j);
- ret |= handler(kvm, &linfo->rmap_pde, data);
+ int idx = gfn_offset;
+ idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
+ ret |= handler(kvm,
+ &memslot->lpage_info[j][idx].rmap_pde,
+ data);
}
trace_kvm_age_page(hva, memslot, ret);
retval |= ret;
@@ -985,35 +903,6 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
return young;
}
-static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
-{
- u64 *spte;
- int young = 0;
-
- /*
- * If there's no access bit in the secondary pte set by the
- * hardware it's up to gup-fast/gup to set the access bit in
- * the primary pte or in the page structure.
- */
- if (!shadow_accessed_mask)
- goto out;
-
- spte = rmap_next(kvm, rmapp, NULL);
- while (spte) {
- u64 _spte = *spte;
- BUG_ON(!(_spte & PT_PRESENT_MASK));
- young = _spte & PT_ACCESSED_MASK;
- if (young) {
- young = 1;
- break;
- }
- spte = rmap_next(kvm, rmapp, spte);
- }
-out:
- return young;
-}
-
#define RMAP_RECYCLE_THRESHOLD 1000
static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -1023,6 +912,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
sp = page_header(__pa(spte));
+ gfn = unalias_gfn(vcpu->kvm, gfn);
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
@@ -1034,11 +924,6 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva)
return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
}
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
-{
- return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
-}
-
#ifdef MMU_DEBUG
static int is_empty_shadow_page(u64 *spt)
{
@@ -1055,28 +940,14 @@ static int is_empty_shadow_page(u64 *spt)
}
#endif
-/*
- * This value is the sum of all of the kvm instances's
- * kvm->arch.n_used_mmu_pages values. We need a global,
- * aggregate version in order to make the slab shrinker
- * faster
- */
-static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
-{
- kvm->arch.n_used_mmu_pages += nr;
- percpu_counter_add(&kvm_total_used_mmu_pages, nr);
-}
-
static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
ASSERT(is_empty_shadow_page(sp->spt));
- hlist_del(&sp->hash_link);
list_del(&sp->link);
__free_page(virt_to_page(sp->spt));
- if (!sp->role.direct)
- __free_page(virt_to_page(sp->gfns));
- kmem_cache_free(mmu_page_header_cache, sp);
- kvm_mod_used_mmu_pages(kvm, -1);
+ __free_page(virt_to_page(sp->gfns));
+ kfree(sp);
+ ++kvm->arch.n_free_mmu_pages;
}
static unsigned kvm_page_table_hashfn(gfn_t gfn)
@@ -1085,21 +956,20 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
}
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
- u64 *parent_pte, int direct)
+ u64 *parent_pte)
{
struct kvm_mmu_page *sp;
sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
- if (!direct)
- sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
- PAGE_SIZE);
+ sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
+ INIT_LIST_HEAD(&sp->oos_link);
bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
sp->multimapped = 0;
sp->parent_pte = parent_pte;
- kvm_mod_used_mmu_pages(vcpu->kvm, +1);
+ --vcpu->kvm->arch.n_free_mmu_pages;
return sp;
}
@@ -1178,7 +1048,9 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
BUG();
}
-static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
+
+static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ mmu_parent_walk_fn fn)
{
struct kvm_pte_chain *pte_chain;
struct hlist_node *node;
@@ -1187,37 +1059,64 @@ static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
if (!sp->multimapped && sp->parent_pte) {
parent_sp = page_header(__pa(sp->parent_pte));
- fn(parent_sp, sp->parent_pte);
+ fn(vcpu, parent_sp);
+ mmu_parent_walk(vcpu, parent_sp, fn);
return;
}
-
hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
- u64 *spte = pte_chain->parent_ptes[i];
-
- if (!spte)
+ if (!pte_chain->parent_ptes[i])
break;
- parent_sp = page_header(__pa(spte));
- fn(parent_sp, spte);
+ parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
+ fn(vcpu, parent_sp);
+ mmu_parent_walk(vcpu, parent_sp, fn);
}
}
-static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
-static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
+static void kvm_mmu_update_unsync_bitmap(u64 *spte)
{
- mmu_parent_walk(sp, mark_unsync);
+ unsigned int index;
+ struct kvm_mmu_page *sp = page_header(__pa(spte));
+
+ index = spte - sp->spt;
+ if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
+ sp->unsync_children++;
+ WARN_ON(!sp->unsync_children);
}
-static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
+static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
{
- unsigned int index;
+ struct kvm_pte_chain *pte_chain;
+ struct hlist_node *node;
+ int i;
- index = spte - sp->spt;
- if (__test_and_set_bit(index, sp->unsync_child_bitmap))
+ if (!sp->parent_pte)
return;
- if (sp->unsync_children++)
+
+ if (!sp->multimapped) {
+ kvm_mmu_update_unsync_bitmap(sp->parent_pte);
return;
- kvm_mmu_mark_parents_unsync(sp);
+ }
+
+ hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+ for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+ if (!pte_chain->parent_ptes[i])
+ break;
+ kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
+ }
+}
+
+static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ kvm_mmu_update_parents_unsync(sp);
+ return 1;
+}
+
+static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ mmu_parent_walk(vcpu, sp, unsync_walk_fn);
+ kvm_mmu_update_parents_unsync(sp);
}
static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
@@ -1276,40 +1175,35 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
int i, ret, nr_unsync_leaf = 0;
for_each_unsync_children(sp->unsync_child_bitmap, i) {
- struct kvm_mmu_page *child;
u64 ent = sp->spt[i];
- if (!is_shadow_present_pte(ent) || is_large_pte(ent))
- goto clear_child_bitmap;
-
- child = page_header(ent & PT64_BASE_ADDR_MASK);
-
- if (child->unsync_children) {
- if (mmu_pages_add(pvec, child, i))
- return -ENOSPC;
-
- ret = __mmu_unsync_walk(child, pvec);
- if (!ret)
- goto clear_child_bitmap;
- else if (ret > 0)
- nr_unsync_leaf += ret;
- else
- return ret;
- } else if (child->unsync) {
- nr_unsync_leaf++;
- if (mmu_pages_add(pvec, child, i))
- return -ENOSPC;
- } else
- goto clear_child_bitmap;
-
- continue;
+ if (is_shadow_present_pte(ent) && !is_large_pte(ent)) {
+ struct kvm_mmu_page *child;
+ child = page_header(ent & PT64_BASE_ADDR_MASK);
+
+ if (child->unsync_children) {
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
+
+ ret = __mmu_unsync_walk(child, pvec);
+ if (!ret)
+ __clear_bit(i, sp->unsync_child_bitmap);
+ else if (ret > 0)
+ nr_unsync_leaf += ret;
+ else
+ return ret;
+ }
-clear_child_bitmap:
- __clear_bit(i, sp->unsync_child_bitmap);
- sp->unsync_children--;
- WARN_ON((int)sp->unsync_children < 0);
+ if (child->unsync) {
+ nr_unsync_leaf++;
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
+ }
+ }
}
+ if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
+ sp->unsync_children = 0;
return nr_unsync_leaf;
}
@@ -1324,44 +1218,48 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
return __mmu_unsync_walk(sp, pvec);
}
+static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
+{
+ unsigned index;
+ struct hlist_head *bucket;
+ struct kvm_mmu_page *sp;
+ struct hlist_node *node;
+
+ pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
+ index = kvm_page_table_hashfn(gfn);
+ bucket = &kvm->arch.mmu_page_hash[index];
+ hlist_for_each_entry(sp, node, bucket, hash_link)
+ if (sp->gfn == gfn && !sp->role.direct
+ && !sp->role.invalid) {
+ pgprintk("%s: found role %x\n",
+ __func__, sp->role.word);
+ return sp;
+ }
+ return NULL;
+}
+
static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
WARN_ON(!sp->unsync);
- trace_kvm_mmu_sync_page(sp);
sp->unsync = 0;
--kvm->stat.mmu_unsync;
}
-static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
- struct list_head *invalid_list);
-static void kvm_mmu_commit_zap_page(struct kvm *kvm,
- struct list_head *invalid_list);
-
-#define for_each_gfn_sp(kvm, sp, gfn, pos) \
- hlist_for_each_entry(sp, pos, \
- &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
- if ((sp)->gfn != (gfn)) {} else
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
-#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \
- hlist_for_each_entry(sp, pos, \
- &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
- if ((sp)->gfn != (gfn) || (sp)->role.direct || \
- (sp)->role.invalid) {} else
-
-/* @sp->gfn should be write-protected at the call site */
-static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- struct list_head *invalid_list, bool clear_unsync)
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
- if (sp->role.cr4_pae != !!is_pae(vcpu)) {
- kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
+ if (sp->role.glevels != vcpu->arch.mmu.root_level) {
+ kvm_mmu_zap_page(vcpu->kvm, sp);
return 1;
}
- if (clear_unsync)
- kvm_unlink_unsync_page(vcpu->kvm, sp);
-
+ trace_kvm_mmu_sync_page(sp);
+ if (rmap_write_protect(vcpu->kvm, sp->gfn))
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ kvm_unlink_unsync_page(vcpu->kvm, sp);
if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
- kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
+ kvm_mmu_zap_page(vcpu->kvm, sp);
return 1;
}
@@ -1369,52 +1267,6 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return 0;
}
-static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
-{
- LIST_HEAD(invalid_list);
- int ret;
-
- ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
- if (ret)
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-
- return ret;
-}
-
-static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- struct list_head *invalid_list)
-{
- return __kvm_sync_page(vcpu, sp, invalid_list, true);
-}
-
-/* @gfn should be write-protected at the call site */
-static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- struct kvm_mmu_page *s;
- struct hlist_node *node;
- LIST_HEAD(invalid_list);
- bool flush = false;
-
- for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
- if (!s->unsync)
- continue;
-
- WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
- kvm_unlink_unsync_page(vcpu->kvm, s);
- if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
- (vcpu->arch.mmu.sync_page(vcpu, s))) {
- kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
- continue;
- }
- flush = true;
- }
-
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- if (flush)
- kvm_mmu_flush_tlb(vcpu);
-}
-
struct mmu_page_path {
struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
unsigned int idx[PT64_ROOT_LEVEL-1];
@@ -1481,7 +1333,6 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp;
struct mmu_page_path parents;
struct kvm_mmu_pages pages;
- LIST_HEAD(invalid_list);
kvm_mmu_pages_init(parent, &parents, &pages);
while (mmu_unsync_walk(parent, &pages)) {
@@ -1494,10 +1345,9 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
kvm_flush_remote_tlbs(vcpu->kvm);
for_each_sp(pages, sp, parents, i) {
- kvm_sync_page(vcpu, sp, &invalid_list);
+ kvm_sync_page(vcpu, sp);
mmu_pages_clear_parents(&parents);
}
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
cond_resched_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_pages_init(parent, &parents, &pages);
}
@@ -1512,57 +1362,50 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
u64 *parent_pte)
{
union kvm_mmu_page_role role;
+ unsigned index;
unsigned quadrant;
+ struct hlist_head *bucket;
struct kvm_mmu_page *sp;
- struct hlist_node *node;
- bool need_sync = false;
+ struct hlist_node *node, *tmp;
role = vcpu->arch.mmu.base_role;
role.level = level;
role.direct = direct;
- if (role.direct)
- role.cr4_pae = 0;
role.access = access;
- if (!vcpu->arch.mmu.direct_map
- && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+ if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
}
- for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
- if (!need_sync && sp->unsync)
- need_sync = true;
-
- if (sp->role.word != role.word)
- continue;
-
- if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
- break;
+ index = kvm_page_table_hashfn(gfn);
+ bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+ hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
+ if (sp->gfn == gfn) {
+ if (sp->unsync)
+ if (kvm_sync_page(vcpu, sp))
+ continue;
- mmu_page_add_parent_pte(vcpu, sp, parent_pte);
- if (sp->unsync_children) {
- kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
- kvm_mmu_mark_parents_unsync(sp);
- } else if (sp->unsync)
- kvm_mmu_mark_parents_unsync(sp);
+ if (sp->role.word != role.word)
+ continue;
- trace_kvm_mmu_get_page(sp, false);
- return sp;
- }
+ mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+ if (sp->unsync_children) {
+ set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
+ kvm_mmu_mark_parents_unsync(vcpu, sp);
+ }
+ trace_kvm_mmu_get_page(sp, false);
+ return sp;
+ }
++vcpu->kvm->stat.mmu_cache_miss;
- sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
+ sp = kvm_mmu_alloc_page(vcpu, parent_pte);
if (!sp)
return sp;
sp->gfn = gfn;
sp->role = role;
- hlist_add_head(&sp->hash_link,
- &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
+ hlist_add_head(&sp->hash_link, bucket);
if (!direct) {
if (rmap_write_protect(vcpu->kvm, gfn))
kvm_flush_remote_tlbs(vcpu->kvm);
- if (level > PT_PAGE_TABLE_LEVEL && need_sync)
- kvm_sync_pages(vcpu, gfn);
-
account_shadowed(vcpu->kvm, gfn);
}
if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
@@ -1579,12 +1422,6 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
iterator->addr = addr;
iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
iterator->level = vcpu->arch.mmu.shadow_root_level;
-
- if (iterator->level == PT64_ROOT_LEVEL &&
- vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
- !vcpu->arch.mmu.direct_map)
- --iterator->level;
-
if (iterator->level == PT32E_ROOT_LEVEL) {
iterator->shadow_addr
= vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
@@ -1615,47 +1452,6 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
--iterator->level;
}
-static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
-{
- u64 spte;
-
- spte = __pa(sp->spt)
- | PT_PRESENT_MASK | PT_ACCESSED_MASK
- | PT_WRITABLE_MASK | PT_USER_MASK;
- __set_spte(sptep, spte);
-}
-
-static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
-{
- if (is_large_pte(*sptep)) {
- drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
- kvm_flush_remote_tlbs(vcpu->kvm);
- }
-}
-
-static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned direct_access)
-{
- if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
- struct kvm_mmu_page *child;
-
- /*
- * For the direct sp, if the guest pte's dirty bit
- * changed form clean to dirty, it will corrupt the
- * sp's access: allow writable in the read-only sp,
- * so we should update the spte at this point to get
- * a new sp with the correct access.
- */
- child = page_header(*sptep & PT64_BASE_ADDR_MASK);
- if (child->role.access == direct_access)
- return;
-
- mmu_page_remove_parent_pte(child, sptep);
- __set_spte(sptep, shadow_trap_nonpresent_pte);
- kvm_flush_remote_tlbs(vcpu->kvm);
- }
-}
-
static void kvm_mmu_page_unlink_children(struct kvm *kvm,
struct kvm_mmu_page *sp)
{
@@ -1676,8 +1472,7 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
} else {
if (is_large_pte(ent))
--kvm->stat.lpages;
- drop_spte(kvm, &pt[i],
- shadow_trap_nonpresent_pte);
+ rmap_remove(kvm, &pt[i]);
}
}
pt[i] = shadow_trap_nonpresent_pte;
@@ -1719,8 +1514,7 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
}
static int mmu_zap_unsync_children(struct kvm *kvm,
- struct kvm_mmu_page *parent,
- struct list_head *invalid_list)
+ struct kvm_mmu_page *parent)
{
int i, zapped = 0;
struct mmu_page_path parents;
@@ -1734,7 +1528,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
struct kvm_mmu_page *sp;
for_each_sp(pages, sp, parents, i) {
- kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+ kvm_mmu_zap_page(kvm, sp);
mmu_pages_clear_parents(&parents);
zapped++;
}
@@ -1744,113 +1538,110 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
return zapped;
}
-static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
- struct list_head *invalid_list)
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
int ret;
- trace_kvm_mmu_prepare_zap_page(sp);
+ trace_kvm_mmu_zap_page(sp);
++kvm->stat.mmu_shadow_zapped;
- ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
+ ret = mmu_zap_unsync_children(kvm, sp);
kvm_mmu_page_unlink_children(kvm, sp);
kvm_mmu_unlink_parents(kvm, sp);
+ kvm_flush_remote_tlbs(kvm);
if (!sp->role.invalid && !sp->role.direct)
unaccount_shadowed(kvm, sp->gfn);
if (sp->unsync)
kvm_unlink_unsync_page(kvm, sp);
if (!sp->root_count) {
- /* Count self */
- ret++;
- list_move(&sp->link, invalid_list);
+ hlist_del(&sp->hash_link);
+ kvm_mmu_free_page(kvm, sp);
} else {
+ sp->role.invalid = 1;
list_move(&sp->link, &kvm->arch.active_mmu_pages);
kvm_reload_remote_mmus(kvm);
}
-
- sp->role.invalid = 1;
kvm_mmu_reset_last_pte_updated(kvm);
return ret;
}
-static void kvm_mmu_commit_zap_page(struct kvm *kvm,
- struct list_head *invalid_list)
-{
- struct kvm_mmu_page *sp;
-
- if (list_empty(invalid_list))
- return;
-
- kvm_flush_remote_tlbs(kvm);
-
- do {
- sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
- WARN_ON(!sp->role.invalid || sp->root_count);
- kvm_mmu_free_page(kvm, sp);
- } while (!list_empty(invalid_list));
-
-}
-
/*
* Changing the number of mmu pages allocated to the vm
- * Note: if goal_nr_mmu_pages is too small, you will get dead lock
+ * Note: if kvm_nr_mmu_pages is too small, you will get dead lock
*/
-void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
{
- LIST_HEAD(invalid_list);
+ int used_pages;
+
+ used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
+ used_pages = max(0, used_pages);
+
/*
* If we set the number of mmu pages to be smaller than the
* number of active pages, we must free some mmu pages before we
* change the value
*/
- if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
- while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
+ if (used_pages > kvm_nr_mmu_pages) {
+ while (used_pages > kvm_nr_mmu_pages &&
!list_empty(&kvm->arch.active_mmu_pages)) {
struct kvm_mmu_page *page;
page = container_of(kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
- kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ used_pages -= kvm_mmu_zap_page(kvm, page);
+ used_pages--;
}
- goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
+ kvm_nr_mmu_pages = used_pages;
+ kvm->arch.n_free_mmu_pages = 0;
}
+ else
+ kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
+ - kvm->arch.n_alloc_mmu_pages;
- kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
+ kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
}
static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
{
+ unsigned index;
+ struct hlist_head *bucket;
struct kvm_mmu_page *sp;
- struct hlist_node *node;
- LIST_HEAD(invalid_list);
+ struct hlist_node *node, *n;
int r;
- pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
+ pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
r = 0;
-
- for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
- pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
- sp->role.word);
- r = 1;
- kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
- }
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ index = kvm_page_table_hashfn(gfn);
+ bucket = &kvm->arch.mmu_page_hash[index];
+ hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
+ if (sp->gfn == gfn && !sp->role.direct) {
+ pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
+ sp->role.word);
+ r = 1;
+ if (kvm_mmu_zap_page(kvm, sp))
+ n = bucket->first;
+ }
return r;
}
static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
{
+ unsigned index;
+ struct hlist_head *bucket;
struct kvm_mmu_page *sp;
- struct hlist_node *node;
- LIST_HEAD(invalid_list);
-
- for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
- pgprintk("%s: zap %llx %x\n",
- __func__, gfn, sp->role.word);
- kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ struct hlist_node *node, *nn;
+
+ index = kvm_page_table_hashfn(gfn);
+ bucket = &kvm->arch.mmu_page_hash[index];
+ hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
+ if (sp->gfn == gfn && !sp->role.direct
+ && !sp->role.invalid) {
+ pgprintk("%s: zap %lx %x\n",
+ __func__, gfn, sp->role.word);
+ if (kvm_mmu_zap_page(kvm, sp))
+ nn = bucket->first;
+ }
}
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
}
static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
@@ -1875,6 +1666,20 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp)
}
}
+struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ struct page *page;
+
+ gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
+
+ if (gpa == UNMAPPED_GVA)
+ return NULL;
+
+ page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+
+ return page;
+}
+
/*
* The function is based on mtrr_type_lookup() in
* arch/x86/kernel/cpu/mtrr/generic.c
@@ -1980,51 +1785,47 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
-static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
+ unsigned index;
+ struct hlist_head *bucket;
+ struct kvm_mmu_page *s;
+ struct hlist_node *node, *n;
+
trace_kvm_mmu_unsync_page(sp);
+ index = kvm_page_table_hashfn(sp->gfn);
+ bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+ /* don't unsync if pagetable is shadowed with multiple roles */
+ hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
+ if (s->gfn != sp->gfn || s->role.direct)
+ continue;
+ if (s->role.word != sp->role.word)
+ return 1;
+ }
++vcpu->kvm->stat.mmu_unsync;
sp->unsync = 1;
- kvm_mmu_mark_parents_unsync(sp);
- mmu_convert_notrap(sp);
-}
-
-static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- struct kvm_mmu_page *s;
- struct hlist_node *node;
+ kvm_mmu_mark_parents_unsync(vcpu, sp);
- for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
- if (s->unsync)
- continue;
- WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
- __kvm_unsync_page(vcpu, s);
- }
+ mmu_convert_notrap(sp);
+ return 0;
}
static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
bool can_unsync)
{
- struct kvm_mmu_page *s;
- struct hlist_node *node;
- bool need_unsync = false;
-
- for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
- if (!can_unsync)
- return 1;
+ struct kvm_mmu_page *shadow;
- if (s->role.level != PT_PAGE_TABLE_LEVEL)
+ shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
+ if (shadow) {
+ if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
return 1;
-
- if (!need_unsync && !s->unsync) {
- if (!oos_shadow)
- return 1;
- need_unsync = true;
- }
+ if (shadow->unsync)
+ return 0;
+ if (can_unsync && oos_shadow)
+ return kvm_unsync_page(vcpu, shadow);
+ return 1;
}
- if (need_unsync)
- kvm_unsync_pages(vcpu, gfn);
return 0;
}
@@ -2032,9 +1833,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int user_fault,
int write_fault, int dirty, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
- bool can_unsync, bool host_writable)
+ bool can_unsync, bool reset_host_protection)
{
- u64 spte, entry = *sptep;
+ u64 spte;
int ret = 0;
/*
@@ -2042,7 +1843,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
* whether the guest actually used the pte (in order to detect
* demand paging).
*/
- spte = PT_PRESENT_MASK;
+ spte = shadow_base_present_pte | shadow_dirty_mask;
if (!speculative)
spte |= shadow_accessed_mask;
if (!dirty)
@@ -2059,30 +1860,23 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
kvm_is_mmio_pfn(pfn));
- if (host_writable)
+ if (reset_host_protection)
spte |= SPTE_HOST_WRITEABLE;
- else
- pte_access &= ~ACC_WRITE_MASK;
spte |= (u64)pfn << PAGE_SHIFT;
if ((pte_access & ACC_WRITE_MASK)
- || (!vcpu->arch.mmu.direct_map && write_fault
- && !is_write_protection(vcpu) && !user_fault)) {
+ || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
if (level > PT_PAGE_TABLE_LEVEL &&
has_wrprotected_page(vcpu->kvm, gfn, level)) {
ret = 1;
- drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
- goto done;
+ spte = shadow_trap_nonpresent_pte;
+ goto set_pte;
}
spte |= PT_WRITABLE_MASK;
- if (!vcpu->arch.mmu.direct_map
- && !(pte_access & ACC_WRITE_MASK))
- spte &= ~PT_USER_MASK;
-
/*
* Optimization: for pte sync, if spte was writable the hash
* lookup is unnecessary (and expensive). Write protection
@@ -2093,7 +1887,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
goto set_pte;
if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
- pgprintk("%s: found shadow page for %llx, marking ro\n",
+ pgprintk("%s: found shadow page for %lx, marking ro\n",
__func__, gfn);
ret = 1;
pte_access &= ~ACC_WRITE_MASK;
@@ -2106,16 +1900,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
mark_page_dirty(vcpu->kvm, gfn);
set_pte:
- update_spte(sptep, spte);
- /*
- * If we overwrite a writable spte with a read-only one we
- * should flush remote TLBs. Otherwise rmap_write_protect
- * will find a read-only spte, even though the writable spte
- * might be cached on a CPU's TLB.
- */
- if (is_writable_pte(entry) && !is_writable_pte(*sptep))
- kvm_flush_remote_tlbs(vcpu->kvm);
-done:
+ __set_spte(sptep, spte);
return ret;
}
@@ -2124,13 +1909,14 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
int user_fault, int write_fault, int dirty,
int *ptwrite, int level, gfn_t gfn,
pfn_t pfn, bool speculative,
- bool host_writable)
+ bool reset_host_protection)
{
int was_rmapped = 0;
+ int was_writable = is_writable_pte(*sptep);
int rmap_count;
pgprintk("%s: spte %llx access %x write_fault %d"
- " user_fault %d gfn %llx\n",
+ " user_fault %d gfn %lx\n",
__func__, *sptep, pt_access,
write_fault, user_fault, gfn);
@@ -2146,27 +1932,24 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
child = page_header(pte & PT64_BASE_ADDR_MASK);
mmu_page_remove_parent_pte(child, sptep);
- __set_spte(sptep, shadow_trap_nonpresent_pte);
- kvm_flush_remote_tlbs(vcpu->kvm);
} else if (pfn != spte_to_pfn(*sptep)) {
- pgprintk("hfn old %llx new %llx\n",
+ pgprintk("hfn old %lx new %lx\n",
spte_to_pfn(*sptep), pfn);
- drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
- kvm_flush_remote_tlbs(vcpu->kvm);
+ rmap_remove(vcpu->kvm, sptep);
} else
was_rmapped = 1;
}
if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
dirty, level, gfn, pfn, speculative, true,
- host_writable)) {
+ reset_host_protection)) {
if (write_fault)
*ptwrite = 1;
- kvm_mmu_flush_tlb(vcpu);
+ kvm_x86_ops->tlb_flush(vcpu);
}
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
- pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
+ pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
is_large_pte(*sptep)? "2MB" : "4kB",
*sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
*sptep, sptep);
@@ -2176,10 +1959,15 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
page_header_update_slot(vcpu->kvm, sptep, gfn);
if (!was_rmapped) {
rmap_count = rmap_add(vcpu, sptep, gfn);
+ kvm_release_pfn_clean(pfn);
if (rmap_count > RMAP_RECYCLE_THRESHOLD)
rmap_recycle(vcpu, sptep, gfn);
+ } else {
+ if (was_writable)
+ kvm_release_pfn_dirty(pfn);
+ else
+ kvm_release_pfn_clean(pfn);
}
- kvm_release_pfn_clean(pfn);
if (speculative) {
vcpu->arch.last_pte_updated = sptep;
vcpu->arch.last_pte_gfn = gfn;
@@ -2190,108 +1978,8 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
-static struct kvm_memory_slot *
-pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
-{
- struct kvm_memory_slot *slot;
-
- slot = gfn_to_memslot(vcpu->kvm, gfn);
- if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
- (no_dirty_log && slot->dirty_bitmap))
- slot = NULL;
-
- return slot;
-}
-
-static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
- bool no_dirty_log)
-{
- struct kvm_memory_slot *slot;
- unsigned long hva;
-
- slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log);
- if (!slot) {
- get_page(bad_page);
- return page_to_pfn(bad_page);
- }
-
- hva = gfn_to_hva_memslot(slot, gfn);
-
- return hva_to_pfn_atomic(vcpu->kvm, hva);
-}
-
-static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp,
- u64 *start, u64 *end)
-{
- struct page *pages[PTE_PREFETCH_NUM];
- unsigned access = sp->role.access;
- int i, ret;
- gfn_t gfn;
-
- gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
- if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK))
- return -1;
-
- ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
- if (ret <= 0)
- return -1;
-
- for (i = 0; i < ret; i++, gfn++, start++)
- mmu_set_spte(vcpu, start, ACC_ALL,
- access, 0, 0, 1, NULL,
- sp->role.level, gfn,
- page_to_pfn(pages[i]), true, true);
-
- return 0;
-}
-
-static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *sptep)
-{
- u64 *spte, *start = NULL;
- int i;
-
- WARN_ON(!sp->role.direct);
-
- i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
- spte = sp->spt + i;
-
- for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
- if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
- if (!start)
- continue;
- if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
- break;
- start = NULL;
- } else if (!start)
- start = spte;
- }
-}
-
-static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
-{
- struct kvm_mmu_page *sp;
-
- /*
- * Since there is no accessed bit on EPT, there is no way to
- * distinguish between actually accessed translations
- * and prefetched ones, so disable pte prefetch if EPT is
- * enabled.
- */
- if (!shadow_accessed_mask)
- return;
-
- sp = page_header(__pa(sptep));
- if (sp->role.level > PT_PAGE_TABLE_LEVEL)
- return;
-
- __direct_pte_prefetch(vcpu, sp, sptep);
-}
-
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
- int map_writable, int level, gfn_t gfn, pfn_t pfn,
- bool prefault)
+ int level, gfn_t gfn, pfn_t pfn)
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
@@ -2300,21 +1988,15 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
if (iterator.level == level) {
- unsigned pte_access = ACC_ALL;
-
- mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
+ mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
0, write, 1, &pt_write,
- level, gfn, pfn, prefault, map_writable);
- direct_pte_prefetch(vcpu, iterator.sptep);
+ level, gfn, pfn, false, true);
++vcpu->stat.pf_fixed;
break;
}
if (*iterator.sptep == shadow_trap_nonpresent_pte) {
- u64 base_addr = iterator.addr;
-
- base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
- pseudo_gfn = base_addr >> PAGE_SHIFT;
+ pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
iterator.level - 1,
1, ACC_ALL, iterator.sptep);
@@ -2327,126 +2009,45 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
__set_spte(iterator.sptep,
__pa(sp->spt)
| PT_PRESENT_MASK | PT_WRITABLE_MASK
- | shadow_user_mask | shadow_x_mask
- | shadow_accessed_mask);
+ | shadow_user_mask | shadow_x_mask);
}
}
return pt_write;
}
-static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
-{
- kvm_siginfo_t info;
-
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_MCEERR_AR;
- info.si_addr = (void *)address;
- info.si_addr_lsb = PAGE_SHIFT;
-
- send_sig_info(SIGBUS, (siginfo_t *)&info, tsk);
-}
-
-static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
-{
- kvm_release_pfn_clean(pfn);
- if (is_hwpoison_pfn(pfn)) {
- kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
- return 0;
- } else if (is_fault_pfn(pfn))
- return -EFAULT;
-
- return 1;
-}
-
-static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
- gfn_t *gfnp, pfn_t *pfnp, int *levelp)
-{
- pfn_t pfn = *pfnp;
- gfn_t gfn = *gfnp;
- int level = *levelp;
-
- /*
- * Check if it's a transparent hugepage. If this were a
- * hugetlbfs page, level wouldn't be set to
- * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
- * here.
- */
- if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
- level == PT_PAGE_TABLE_LEVEL &&
- PageTransCompound(pfn_to_page(pfn)) &&
- !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
- unsigned long mask;
- /*
- * mmu_notifier_retry was successful and we hold the
- * mmu_lock here, so the pmd can't become splitting
- * from under us, and in turn
- * __split_huge_page_refcount() can't run from under
- * us and we can safely transfer the refcount from
- * PG_tail to PG_head as we switch the pfn from tail to
- * head.
- */
- *levelp = level = PT_DIRECTORY_LEVEL;
- mask = KVM_PAGES_PER_HPAGE(level) - 1;
- VM_BUG_ON((gfn & mask) != (pfn & mask));
- if (pfn & mask) {
- gfn &= ~mask;
- *gfnp = gfn;
- kvm_release_pfn_clean(pfn);
- pfn &= ~mask;
- if (!get_page_unless_zero(pfn_to_page(pfn)))
- BUG();
- *pfnp = pfn;
- }
- }
-}
-
-static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
- gva_t gva, pfn_t *pfn, bool write, bool *writable);
-
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
- bool prefault)
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
{
int r;
int level;
- int force_pt_level;
pfn_t pfn;
unsigned long mmu_seq;
- bool map_writable;
- force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
- if (likely(!force_pt_level)) {
- level = mapping_level(vcpu, gfn);
- /*
- * This path builds a PAE pagetable - so we can map
- * 2mb pages at maximum. Therefore check if the level
- * is larger than that.
- */
- if (level > PT_DIRECTORY_LEVEL)
- level = PT_DIRECTORY_LEVEL;
+ level = mapping_level(vcpu, gfn);
+
+ /*
+ * This path builds a PAE pagetable - so we can map 2mb pages at
+ * maximum. Therefore check if the level is larger than that.
+ */
+ if (level > PT_DIRECTORY_LEVEL)
+ level = PT_DIRECTORY_LEVEL;
- gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
- } else
- level = PT_PAGE_TABLE_LEVEL;
+ gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
-
- if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
- return 0;
+ pfn = gfn_to_pfn(vcpu->kvm, gfn);
/* mmio */
- if (is_error_pfn(pfn))
- return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ return 1;
+ }
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
- if (likely(!force_pt_level))
- transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
- r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
- prefault);
+ r = __direct_map(vcpu, v, write, level, gfn, pfn);
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -2463,22 +2064,17 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
int i;
struct kvm_mmu_page *sp;
- LIST_HEAD(invalid_list);
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
spin_lock(&vcpu->kvm->mmu_lock);
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
- (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
- vcpu->arch.mmu.direct_map)) {
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
sp = page_header(root);
--sp->root_count;
- if (!sp->root_count && sp->role.invalid) {
- kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- }
+ if (!sp->root_count && sp->role.invalid)
+ kvm_mmu_zap_page(vcpu->kvm, sp);
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
spin_unlock(&vcpu->kvm->mmu_lock);
return;
@@ -2491,12 +2087,10 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
sp = page_header(root);
--sp->root_count;
if (!sp->root_count && sp->role.invalid)
- kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
- &invalid_list);
+ kvm_mmu_zap_page(vcpu->kvm, sp);
}
vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
}
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
}
@@ -2506,170 +2100,79 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
int ret = 0;
if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
ret = 1;
}
return ret;
}
-static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_page *sp;
- unsigned i;
-
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- spin_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_free_some_pages(vcpu);
- sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
- 1, ACC_ALL, NULL);
- ++sp->root_count;
- spin_unlock(&vcpu->kvm->mmu_lock);
- vcpu->arch.mmu.root_hpa = __pa(sp->spt);
- } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
-
- ASSERT(!VALID_PAGE(root));
- spin_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_free_some_pages(vcpu);
- sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
- i << 30,
- PT32_ROOT_LEVEL, 1, ACC_ALL,
- NULL);
- root = __pa(sp->spt);
- ++sp->root_count;
- spin_unlock(&vcpu->kvm->mmu_lock);
- vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
- }
- vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
- } else
- BUG();
-
- return 0;
-}
-
-static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
+static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
{
- struct kvm_mmu_page *sp;
- u64 pdptr, pm_mask;
- gfn_t root_gfn;
int i;
+ gfn_t root_gfn;
+ struct kvm_mmu_page *sp;
+ int direct = 0;
+ u64 pdptr;
- root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
-
- if (mmu_check_root(vcpu, root_gfn))
- return 1;
+ root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
- /*
- * Do we shadow a long mode page table? If so we need to
- * write-protect the guests page table root.
- */
- if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
ASSERT(!VALID_PAGE(root));
-
- spin_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_free_some_pages(vcpu);
- sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
- 0, ACC_ALL, NULL);
+ if (tdp_enabled)
+ direct = 1;
+ if (mmu_check_root(vcpu, root_gfn))
+ return 1;
+ sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
+ PT64_ROOT_LEVEL, direct,
+ ACC_ALL, NULL);
root = __pa(sp->spt);
++sp->root_count;
- spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = root;
return 0;
}
-
- /*
- * We shadow a 32 bit page table. This may be a legacy 2-level
- * or a PAE 3-level page table. In either case we need to be aware that
- * the shadow page table may be a PAE or a long mode page table.
- */
- pm_mask = PT_PRESENT_MASK;
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
- pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
-
+ direct = !is_paging(vcpu);
+ if (tdp_enabled)
+ direct = 1;
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];
ASSERT(!VALID_PAGE(root));
if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
- pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i);
+ pdptr = kvm_pdptr_read(vcpu, i);
if (!is_present_gpte(pdptr)) {
vcpu->arch.mmu.pae_root[i] = 0;
continue;
}
root_gfn = pdptr >> PAGE_SHIFT;
- if (mmu_check_root(vcpu, root_gfn))
- return 1;
- }
- spin_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_free_some_pages(vcpu);
+ } else if (vcpu->arch.mmu.root_level == 0)
+ root_gfn = 0;
+ if (mmu_check_root(vcpu, root_gfn))
+ return 1;
sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
- PT32_ROOT_LEVEL, 0,
+ PT32_ROOT_LEVEL, direct,
ACC_ALL, NULL);
root = __pa(sp->spt);
++sp->root_count;
- spin_unlock(&vcpu->kvm->mmu_lock);
-
- vcpu->arch.mmu.pae_root[i] = root | pm_mask;
+ vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
}
vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
-
- /*
- * If we shadow a 32 bit page table with a long mode page
- * table we enter this path.
- */
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- if (vcpu->arch.mmu.lm_root == NULL) {
- /*
- * The additional page necessary for this is only
- * allocated on demand.
- */
-
- u64 *lm_root;
-
- lm_root = (void*)get_zeroed_page(GFP_KERNEL);
- if (lm_root == NULL)
- return 1;
-
- lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
-
- vcpu->arch.mmu.lm_root = lm_root;
- }
-
- vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
- }
-
return 0;
}
-static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
-{
- if (vcpu->arch.mmu.direct_map)
- return mmu_alloc_direct_roots(vcpu);
- else
- return mmu_alloc_shadow_roots(vcpu);
-}
-
static void mmu_sync_roots(struct kvm_vcpu *vcpu)
{
int i;
struct kvm_mmu_page *sp;
- if (vcpu->arch.mmu.direct_map)
- return;
-
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
-
- trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
- if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
sp = page_header(root);
mmu_sync_children(vcpu, sp);
- trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
return;
}
for (i = 0; i < 4; ++i) {
@@ -2681,7 +2184,6 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
mmu_sync_children(vcpu, sp);
}
}
- trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
}
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -2692,24 +2194,15 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
}
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
- u32 access, struct x86_exception *exception)
+ u32 access, u32 *error)
{
- if (exception)
- exception->error_code = 0;
+ if (error)
+ *error = 0;
return vaddr;
}
-static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
- u32 access,
- struct x86_exception *exception)
-{
- if (exception)
- exception->error_code = 0;
- return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
-}
-
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
- u32 error_code, bool prefault)
+ u32 error_code)
{
gfn_t gfn;
int r;
@@ -2725,68 +2218,17 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
gfn = gva >> PAGE_SHIFT;
return nonpaging_map(vcpu, gva & PAGE_MASK,
- error_code & PFERR_WRITE_MASK, gfn, prefault);
-}
-
-static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
-{
- struct kvm_arch_async_pf arch;
-
- arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
- arch.gfn = gfn;
- arch.direct_map = vcpu->arch.mmu.direct_map;
- arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
-
- return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
-}
-
-static bool can_do_async_pf(struct kvm_vcpu *vcpu)
-{
- if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
- kvm_event_needs_reinjection(vcpu)))
- return false;
-
- return kvm_x86_ops->interrupt_allowed(vcpu);
-}
-
-static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
- gva_t gva, pfn_t *pfn, bool write, bool *writable)
-{
- bool async;
-
- *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
-
- if (!async)
- return false; /* *pfn has correct page already */
-
- put_page(pfn_to_page(*pfn));
-
- if (!prefault && can_do_async_pf(vcpu)) {
- trace_kvm_try_async_get_page(gva, gfn);
- if (kvm_find_async_pf_gfn(vcpu, gfn)) {
- trace_kvm_async_pf_doublefault(gva, gfn);
- kvm_make_request(KVM_REQ_APF_HALT, vcpu);
- return true;
- } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
- return true;
- }
-
- *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
-
- return false;
+ error_code & PFERR_WRITE_MASK, gfn);
}
-static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
- bool prefault)
+static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
+ u32 error_code)
{
pfn_t pfn;
int r;
int level;
- int force_pt_level;
gfn_t gfn = gpa >> PAGE_SHIFT;
unsigned long mmu_seq;
- int write = error_code & PFERR_WRITE_MASK;
- bool map_writable;
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -2795,30 +2237,23 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
if (r)
return r;
- force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
- if (likely(!force_pt_level)) {
- level = mapping_level(vcpu, gfn);
- gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
- } else
- level = PT_PAGE_TABLE_LEVEL;
+ level = mapping_level(vcpu, gfn);
+
+ gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
-
- if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
- return 0;
-
- /* mmio */
- if (is_error_pfn(pfn))
- return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
+ pfn = gfn_to_pfn(vcpu->kvm, gfn);
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ return 1;
+ }
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
- if (likely(!force_pt_level))
- transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
- r = __direct_map(vcpu, gpa, write, map_writable,
- level, gfn, pfn, prefault);
+ r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
+ level, gfn, pfn);
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
@@ -2834,9 +2269,10 @@ static void nonpaging_free(struct kvm_vcpu *vcpu)
mmu_free_roots(vcpu);
}
-static int nonpaging_init_context(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
+static int nonpaging_init_context(struct kvm_vcpu *vcpu)
{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = nonpaging_page_fault;
context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -2847,32 +2283,26 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,
context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE;
- context->direct_map = true;
- context->nx = false;
return 0;
}
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ kvm_x86_ops->tlb_flush(vcpu);
}
static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
- pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
+ pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
mmu_free_roots(vcpu);
}
-static unsigned long get_cr3(struct kvm_vcpu *vcpu)
-{
- return kvm_read_cr3(vcpu);
-}
-
static void inject_page_fault(struct kvm_vcpu *vcpu,
- struct x86_exception *fault)
+ u64 addr,
+ u32 err_code)
{
- vcpu->arch.mmu.inject_page_fault(vcpu, fault);
+ kvm_inject_page_fault(vcpu, addr, err_code);
}
static void paging_free(struct kvm_vcpu *vcpu)
@@ -2880,12 +2310,12 @@ static void paging_free(struct kvm_vcpu *vcpu)
nonpaging_free(vcpu);
}
-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
+static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
{
int bit7;
bit7 = (gpte >> 7) & 1;
- return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
+ return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0;
}
#define PTTYPE 64
@@ -2896,33 +2326,26 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
#include "paging_tmpl.h"
#undef PTTYPE
-static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context,
- int level)
+static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
int maxphyaddr = cpuid_maxphyaddr(vcpu);
u64 exb_bit_rsvd = 0;
- if (!context->nx)
+ if (!is_nx(vcpu))
exb_bit_rsvd = rsvd_bits(63, 63);
switch (level) {
case PT32_ROOT_LEVEL:
/* no rsvd bits for 2 level 4K page table entries */
context->rsvd_bits_mask[0][1] = 0;
context->rsvd_bits_mask[0][0] = 0;
- context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
-
- if (!is_pse(vcpu)) {
- context->rsvd_bits_mask[1][1] = 0;
- break;
- }
-
if (is_cpuid_PSE36())
/* 36bits PSE 4MB page */
context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
else
/* 32 bits PSE 4MB page */
context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
+ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
break;
case PT32E_ROOT_LEVEL:
context->rsvd_bits_mask[0][2] =
@@ -2935,7 +2358,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 62) |
rsvd_bits(13, 20); /* large page */
- context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
break;
case PT64_ROOT_LEVEL:
context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
@@ -2953,18 +2376,14 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51) |
rsvd_bits(13, 20); /* large page */
- context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
break;
}
}
-static int paging64_init_context_common(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context,
- int level)
+static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
{
- context->nx = is_nx(vcpu);
-
- reset_rsvds_bits_mask(vcpu, context, level);
+ struct kvm_mmu *context = &vcpu->arch.mmu;
ASSERT(is_pae(vcpu));
context->new_cr3 = paging_new_cr3;
@@ -2977,23 +2396,20 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
context->root_level = level;
context->shadow_root_level = level;
context->root_hpa = INVALID_PAGE;
- context->direct_map = false;
return 0;
}
-static int paging64_init_context(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
+static int paging64_init_context(struct kvm_vcpu *vcpu)
{
- return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
+ reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
+ return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
}
-static int paging32_init_context(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
+static int paging32_init_context(struct kvm_vcpu *vcpu)
{
- context->nx = false;
-
- reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+ reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
context->new_cr3 = paging_new_cr3;
context->page_fault = paging32_page_fault;
context->gva_to_gpa = paging32_gva_to_gpa;
@@ -3004,21 +2420,19 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
context->root_level = PT32_ROOT_LEVEL;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE;
- context->direct_map = false;
return 0;
}
-static int paging32E_init_context(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
+static int paging32E_init_context(struct kvm_vcpu *vcpu)
{
- return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
+ reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
+ return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
}
static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
- struct kvm_mmu *context = vcpu->arch.walk_mmu;
+ struct kvm_mmu *context = &vcpu->arch.mmu;
- context->base_role.word = 0;
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = tdp_page_fault;
context->free = nonpaging_free;
@@ -3027,29 +2441,20 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->invlpg = nonpaging_invlpg;
context->shadow_root_level = kvm_x86_ops->get_tdp_level();
context->root_hpa = INVALID_PAGE;
- context->direct_map = true;
- context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
- context->get_cr3 = get_cr3;
- context->inject_page_fault = kvm_inject_page_fault;
- context->nx = is_nx(vcpu);
if (!is_paging(vcpu)) {
- context->nx = false;
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->root_level = 0;
} else if (is_long_mode(vcpu)) {
- context->nx = is_nx(vcpu);
- reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
+ reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
context->gva_to_gpa = paging64_gva_to_gpa;
context->root_level = PT64_ROOT_LEVEL;
} else if (is_pae(vcpu)) {
- context->nx = is_nx(vcpu);
- reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
+ reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
context->gva_to_gpa = paging64_gva_to_gpa;
context->root_level = PT32E_ROOT_LEVEL;
} else {
- context->nx = false;
- reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
+ reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
context->gva_to_gpa = paging32_gva_to_gpa;
context->root_level = PT32_ROOT_LEVEL;
}
@@ -3057,83 +2462,32 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
return 0;
}
-int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
int r;
+
ASSERT(vcpu);
ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
if (!is_paging(vcpu))
- r = nonpaging_init_context(vcpu, context);
+ r = nonpaging_init_context(vcpu);
else if (is_long_mode(vcpu))
- r = paging64_init_context(vcpu, context);
+ r = paging64_init_context(vcpu);
else if (is_pae(vcpu))
- r = paging32E_init_context(vcpu, context);
+ r = paging32E_init_context(vcpu);
else
- r = paging32_init_context(vcpu, context);
+ r = paging32_init_context(vcpu);
- vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
- vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
+ vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level;
return r;
}
-EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
-
-static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
-{
- int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
-
- vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3;
- vcpu->arch.walk_mmu->get_cr3 = get_cr3;
- vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
-
- return r;
-}
-
-static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
-
- g_context->get_cr3 = get_cr3;
- g_context->inject_page_fault = kvm_inject_page_fault;
-
- /*
- * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
- * translation of l2_gpa to l1_gpa addresses is done using the
- * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
- * functions between mmu and nested_mmu are swapped.
- */
- if (!is_paging(vcpu)) {
- g_context->nx = false;
- g_context->root_level = 0;
- g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
- } else if (is_long_mode(vcpu)) {
- g_context->nx = is_nx(vcpu);
- reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
- g_context->root_level = PT64_ROOT_LEVEL;
- g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
- } else if (is_pae(vcpu)) {
- g_context->nx = is_nx(vcpu);
- reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
- g_context->root_level = PT32E_ROOT_LEVEL;
- g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
- } else {
- g_context->nx = false;
- reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
- g_context->root_level = PT32_ROOT_LEVEL;
- g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
- }
-
- return 0;
-}
static int init_kvm_mmu(struct kvm_vcpu *vcpu)
{
vcpu->arch.update_pte.pfn = bad_pfn;
- if (mmu_is_nested(vcpu))
- return init_kvm_nested_mmu(vcpu);
- else if (tdp_enabled)
+ if (tdp_enabled)
return init_kvm_tdp_mmu(vcpu);
else
return init_kvm_softmmu(vcpu);
@@ -3142,9 +2496,10 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
- if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
- /* mmu.free() should set root_hpa = INVALID_PAGE */
+ if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
vcpu->arch.mmu.free(vcpu);
+ vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ }
}
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -3161,14 +2516,15 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
r = mmu_topup_memory_caches(vcpu);
if (r)
goto out;
- r = mmu_alloc_roots(vcpu);
spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
+ r = mmu_alloc_roots(vcpu);
mmu_sync_roots(vcpu);
spin_unlock(&vcpu->kvm->mmu_lock);
if (r)
goto out;
/* set_cr3() should ensure TLB has been flushed */
- vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
+ kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
out:
return r;
}
@@ -3178,7 +2534,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
mmu_free_roots(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_mmu_unload);
static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
@@ -3190,7 +2545,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
pte = *spte;
if (is_shadow_present_pte(pte)) {
if (is_last_spte(pte, sp->role.level))
- drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
+ rmap_remove(vcpu->kvm, spte);
else {
child = page_header(pte & PT64_BASE_ADDR_MASK);
mmu_page_remove_parent_pte(child, spte);
@@ -3212,7 +2567,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
}
++vcpu->kvm->stat.mmu_pte_updated;
- if (!sp->role.cr4_pae)
+ if (sp->role.glevels == PT32_ROOT_LEVEL)
paging32_update_pte(vcpu, sp, spte, new);
else
paging64_update_pte(vcpu, sp, spte, new);
@@ -3231,15 +2586,11 @@ static bool need_remote_flush(u64 old, u64 new)
return (old & ~new & PT64_PERM_MASK) != 0;
}
-static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
- bool remote_flush, bool local_flush)
+static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
{
- if (zap_page)
- return;
-
- if (remote_flush)
+ if (need_remote_flush(old, new))
kvm_flush_remote_tlbs(vcpu->kvm);
- else if (local_flush)
+ else
kvm_mmu_flush_tlb(vcpu);
}
@@ -3251,11 +2602,36 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
}
static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- u64 gpte)
+ const u8 *new, int bytes)
{
gfn_t gfn;
+ int r;
+ u64 gpte = 0;
pfn_t pfn;
+ if (bytes != 4 && bytes != 8)
+ return;
+
+ /*
+ * Assume that the pte write is on a page table of the same type
+ * as the current vcpu paging mode. This is nearly always true
+ * (might be false while changing modes). Note it is verified later
+ * by update_pte().
+ */
+ if (is_pae(vcpu)) {
+ /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
+ if ((bytes == 4) && (gpa % 4 == 0)) {
+ r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
+ if (r)
+ return;
+ memcpy((void *)&gpte + (gpa % 8), new, 4);
+ } else if ((bytes == 8) && (gpa % 8 == 0)) {
+ memcpy((void *)&gpte, new, 8);
+ }
+ } else {
+ if ((bytes == 4) && (gpa % 4 == 0))
+ memcpy((void *)&gpte, new, 4);
+ }
if (!is_present_gpte(gpte))
return;
gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -3289,10 +2665,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
bool guest_initiated)
{
gfn_t gfn = gpa >> PAGE_SHIFT;
- union kvm_mmu_page_role mask = { .word = 0 };
struct kvm_mmu_page *sp;
- struct hlist_node *node;
- LIST_HEAD(invalid_list);
+ struct hlist_node *node, *n;
+ struct hlist_head *bucket;
+ unsigned index;
u64 entry, gentry;
u64 *spte;
unsigned offset = offset_in_page(gpa);
@@ -3304,53 +2680,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
int flooded = 0;
int npte;
int r;
- int invlpg_counter;
- bool remote_flush, local_flush, zap_page;
-
- zap_page = remote_flush = local_flush = false;
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
-
- invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
-
- /*
- * Assume that the pte write is on a page table of the same type
- * as the current vcpu paging mode. This is nearly always true
- * (might be false while changing modes). Note it is verified later
- * by update_pte().
- */
- if ((is_pae(vcpu) && bytes == 4) || !new) {
- /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
- if (is_pae(vcpu)) {
- gpa &= ~(gpa_t)7;
- bytes = 8;
- }
- r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
- if (r)
- gentry = 0;
- new = (const u8 *)&gentry;
- }
-
- switch (bytes) {
- case 4:
- gentry = *(const u32 *)new;
- break;
- case 8:
- gentry = *(const u64 *)new;
- break;
- default:
- gentry = 0;
- break;
- }
-
- mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
+ mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
spin_lock(&vcpu->kvm->mmu_lock);
- if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
- gentry = 0;
kvm_mmu_access_page(vcpu, gfn);
kvm_mmu_free_some_pages(vcpu);
++vcpu->kvm->stat.mmu_pte_write;
- trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
+ kvm_mmu_audit(vcpu, "pre pte write");
if (guest_initiated) {
if (gfn == vcpu->arch.last_pt_write_gfn
&& !last_updated_pte_accessed(vcpu)) {
@@ -3363,10 +2700,12 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
vcpu->arch.last_pte_updated = NULL;
}
}
-
- mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
- for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
- pte_size = sp->role.cr4_pae ? 8 : 4;
+ index = kvm_page_table_hashfn(gfn);
+ bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+ hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
+ if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
+ continue;
+ pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
misaligned |= bytes < 4;
if (misaligned || flooded) {
@@ -3382,15 +2721,15 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
*/
pgprintk("misaligned: gpa %llx bytes %d role %x\n",
gpa, bytes, sp->role.word);
- zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
- &invalid_list);
+ if (kvm_mmu_zap_page(vcpu->kvm, sp))
+ n = bucket->first;
++vcpu->kvm->stat.mmu_flooded;
continue;
}
page_offset = offset;
level = sp->role.level;
npte = 1;
- if (!sp->role.cr4_pae) {
+ if (sp->role.glevels == PT32_ROOT_LEVEL) {
page_offset <<= 1; /* 32->64 */
/*
* A 32-bit pde maps 4MB while the shadow pdes map
@@ -3407,23 +2746,26 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if (quadrant != sp->role.quadrant)
continue;
}
- local_flush = true;
spte = &sp->spt[page_offset / sizeof(*spte)];
+ if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
+ gentry = 0;
+ r = kvm_read_guest_atomic(vcpu->kvm,
+ gpa & ~(u64)(pte_size - 1),
+ &gentry, pte_size);
+ new = (const void *)&gentry;
+ if (r < 0)
+ new = NULL;
+ }
while (npte--) {
entry = *spte;
mmu_pte_write_zap_pte(vcpu, sp, spte);
- if (gentry &&
- !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
- & mask.word))
- mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
- if (!remote_flush && need_remote_flush(entry, *spte))
- remote_flush = true;
+ if (new)
+ mmu_pte_write_new_pte(vcpu, sp, spte, new);
+ mmu_pte_write_flush_tlb(vcpu, entry, *spte);
++spte;
}
}
- mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
+ kvm_mmu_audit(vcpu, "post pte write");
spin_unlock(&vcpu->kvm->mmu_lock);
if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
@@ -3436,7 +2778,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
gpa_t gpa;
int r;
- if (vcpu->arch.mmu.direct_map)
+ if (tdp_enabled)
return 0;
gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
@@ -3450,27 +2792,23 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
- LIST_HEAD(invalid_list);
-
- while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
+ while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES &&
!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
struct kvm_mmu_page *sp;
sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
- kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ kvm_mmu_zap_page(vcpu->kvm, sp);
++vcpu->kvm->stat.mmu_recycled;
}
}
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
- void *insn, int insn_len)
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
{
int r;
enum emulation_result er;
- r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
+ r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
if (r < 0)
goto out;
@@ -3483,15 +2821,18 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
if (r)
goto out;
- er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);
+ er = emulate_instruction(vcpu, cr2, error_code, 0);
switch (er) {
case EMULATE_DONE:
return 1;
case EMULATE_DO_MMIO:
++vcpu->stat.mmio_exits;
- /* fall through */
+ return 0;
case EMULATE_FAIL:
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
return 0;
default:
BUG();
@@ -3524,8 +2865,6 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
free_page((unsigned long)vcpu->arch.mmu.pae_root);
- if (vcpu->arch.mmu.lm_root != NULL)
- free_page((unsigned long)vcpu->arch.mmu.lm_root);
}
static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
@@ -3567,6 +2906,15 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
return init_kvm_mmu(vcpu);
}
+void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+ ASSERT(vcpu);
+
+ destroy_kvm_mmu(vcpu);
+ free_mmu_pages(vcpu);
+ mmu_free_memory_caches(vcpu);
+}
+
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
struct kvm_mmu_page *sp;
@@ -3578,14 +2926,11 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
if (!test_bit(slot, sp->slot_bitmap))
continue;
- if (sp->role.level != PT_PAGE_TABLE_LEVEL)
- continue;
-
pt = sp->spt;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
/* avoid RMW */
- if (is_writable_pte(pt[i]))
- update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
+ if (pt[i] & PT_WRITABLE_MASK)
+ pt[i] &= ~PT_WRITABLE_MASK;
}
kvm_flush_remote_tlbs(kvm);
}
@@ -3593,67 +2938,58 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
void kvm_mmu_zap_all(struct kvm *kvm)
{
struct kvm_mmu_page *sp, *node;
- LIST_HEAD(invalid_list);
spin_lock(&kvm->mmu_lock);
-restart:
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
- if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
- goto restart;
-
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ if (kvm_mmu_zap_page(kvm, sp))
+ node = container_of(kvm->arch.active_mmu_pages.next,
+ struct kvm_mmu_page, link);
spin_unlock(&kvm->mmu_lock);
+
+ kvm_flush_remote_tlbs(kvm);
}
-static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
- struct list_head *invalid_list)
+static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm)
{
struct kvm_mmu_page *page;
page = container_of(kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
- return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
+ kvm_mmu_zap_page(kvm, page);
}
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35)
-static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
-#else
static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
-#endif
{
struct kvm *kvm;
struct kvm *kvm_freed = NULL;
-
- if (nr_to_scan == 0)
- goto out;
+ int cache_count = 0;
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
- int idx, freed_pages;
- LIST_HEAD(invalid_list);
+ int npages, idx;
- idx = srcu_read_lock(&kvm->srcu);
+ idx = kvm_srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
- if (!kvm_freed && nr_to_scan > 0 &&
- kvm->arch.n_used_mmu_pages > 0) {
- freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
- &invalid_list);
+ npages = kvm->arch.n_alloc_mmu_pages -
+ kvm->arch.n_free_mmu_pages;
+ cache_count += npages;
+ if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
+ kvm_mmu_remove_one_alloc_mmu_page(kvm);
+ cache_count--;
kvm_freed = kvm;
}
nr_to_scan--;
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
- srcu_read_unlock(&kvm->srcu, idx);
+ kvm_srcu_read_unlock(&kvm->srcu, idx);
}
if (kvm_freed)
list_move_tail(&kvm_freed->vm_list, &vm_list);
spin_unlock(&kvm_lock);
-out:
- return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
+ return cache_count;
}
static struct shrinker mmu_shrinker = {
@@ -3671,6 +3007,12 @@ static void mmu_destroy_caches(void)
kmem_cache_destroy(mmu_page_header_cache);
}
+void kvm_mmu_module_exit(void)
+{
+ mmu_destroy_caches();
+ unregister_shrinker(&mmu_shrinker);
+}
+
int kvm_mmu_module_init(void)
{
pte_chain_cache = kmem_cache_create("kvm_pte_chain",
@@ -3690,9 +3032,6 @@ int kvm_mmu_module_init(void)
if (!mmu_page_header_cache)
goto nomem;
- if (percpu_counter_init(&kvm_total_used_mmu_pages, 0))
- goto nomem;
-
register_shrinker(&mmu_shrinker);
return 0;
@@ -3712,8 +3051,7 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
unsigned int nr_pages = 0;
struct kvm_memslots *slots;
- slots = kvm_memslots(kvm);
-
+ slots = rcu_dereference(kvm->memslots);
for (i = 0; i < slots->nmemslots; i++)
nr_pages += slots->memslots[i].npages;
@@ -3767,7 +3105,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
- (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
+ kvm_set_cr3(vcpu, vcpu->arch.cr3);
return 1;
}
@@ -3863,25 +3201,272 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
}
EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
-void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
+#ifdef AUDIT
+
+static const char *audit_msg;
+
+static gva_t canonicalize(gva_t gva)
{
- ASSERT(vcpu);
+#ifdef CONFIG_X86_64
+ gva = (long long)(gva << 16) >> 16;
+#endif
+ return gva;
+}
- destroy_kvm_mmu(vcpu);
- free_mmu_pages(vcpu);
- mmu_free_memory_caches(vcpu);
+
+typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp,
+ u64 *sptep);
+
+static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
+ inspect_spte_fn fn)
+{
+ int i;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ u64 ent = sp->spt[i];
+
+ if (is_shadow_present_pte(ent)) {
+ if (!is_last_spte(ent, sp->role.level)) {
+ struct kvm_mmu_page *child;
+ child = page_header(ent & PT64_BASE_ADDR_MASK);
+ __mmu_spte_walk(kvm, child, fn);
+ } else
+ fn(kvm, sp, &sp->spt[i]);
+ }
+ }
}
-#ifdef CONFIG_KVM_MMU_AUDIT
-#include "mmu_audit.c"
-#else
-static void mmu_audit_disable(void) { }
-#endif
+static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
+{
+ int i;
+ struct kvm_mmu_page *sp;
-void kvm_mmu_module_exit(void)
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+ sp = page_header(root);
+ __mmu_spte_walk(vcpu->kvm, sp, fn);
+ return;
+ }
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ if (root && VALID_PAGE(root)) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ __mmu_spte_walk(vcpu->kvm, sp, fn);
+ }
+ }
+ return;
+}
+
+static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
+ gva_t va, int level)
{
- mmu_destroy_caches();
- percpu_counter_destroy(&kvm_total_used_mmu_pages);
- unregister_shrinker(&mmu_shrinker);
- mmu_audit_disable();
+ u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
+ int i;
+ gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
+ u64 ent = pt[i];
+
+ if (ent == shadow_trap_nonpresent_pte)
+ continue;
+
+ va = canonicalize(va);
+ if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
+ audit_mappings_page(vcpu, ent, va, level - 1);
+ else {
+ gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
+ hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
+
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ continue;
+ }
+
+ if (is_shadow_present_pte(ent)
+ && (ent & PT64_BASE_ADDR_MASK) != hpa)
+ printk(KERN_ERR "xx audit error: (%s) levels %d"
+ " gva %lx gpa %llx hpa %llx ent %llx %d\n",
+ audit_msg, vcpu->arch.mmu.root_level,
+ va, gpa, hpa, ent,
+ is_shadow_present_pte(ent));
+ else if (ent == shadow_notrap_nonpresent_pte
+ && !is_error_hpa(hpa))
+ printk(KERN_ERR "audit: (%s) notrap shadow,"
+ " valid guest gva %lx\n", audit_msg, va);
+ kvm_release_pfn_clean(pfn);
+
+ }
+ }
+}
+
+static void audit_mappings(struct kvm_vcpu *vcpu)
+{
+ unsigned i;
+
+ if (vcpu->arch.mmu.root_level == 4)
+ audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
+ else
+ for (i = 0; i < 4; ++i)
+ if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
+ audit_mappings_page(vcpu,
+ vcpu->arch.mmu.pae_root[i],
+ i << 30,
+ 2);
}
+
+static int count_rmaps(struct kvm_vcpu *vcpu)
+{
+ int nmaps = 0;
+ int i, j, k, idx;
+
+ idx = kvm_srcu_read_lock(&kvm->srcu);
+ slots = rcu_dereference(kvm->memslots);
+ for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
+ struct kvm_memory_slot *m = &slots->memslots[i];
+ struct kvm_rmap_desc *d;
+
+ for (j = 0; j < m->npages; ++j) {
+ unsigned long *rmapp = &m->rmap[j];
+
+ if (!*rmapp)
+ continue;
+ if (!(*rmapp & 1)) {
+ ++nmaps;
+ continue;
+ }
+ d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+ while (d) {
+ for (k = 0; k < RMAP_EXT; ++k)
+ if (d->sptes[k])
+ ++nmaps;
+ else
+ break;
+ d = d->more;
+ }
+ }
+ }
+ kvm_srcu_read_unlock(&kvm->srcu, idx);
+ return nmaps;
+}
+
+void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
+{
+ unsigned long *rmapp;
+ struct kvm_mmu_page *rev_sp;
+ gfn_t gfn;
+
+ if (*sptep & PT_WRITABLE_MASK) {
+ rev_sp = page_header(__pa(sptep));
+ gfn = rev_sp->gfns[sptep - rev_sp->spt];
+
+ if (!gfn_to_memslot(kvm, gfn)) {
+ if (!printk_ratelimit())
+ return;
+ printk(KERN_ERR "%s: no memslot for gfn %ld\n",
+ audit_msg, gfn);
+ printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
+ audit_msg, sptep - rev_sp->spt,
+ rev_sp->gfn);
+ dump_stack();
+ return;
+ }
+
+ rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
+ is_large_pte(*sptep));
+ if (!*rmapp) {
+ if (!printk_ratelimit())
+ return;
+ printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
+ audit_msg, *sptep);
+ dump_stack();
+ }
+ }
+
+}
+
+void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
+{
+ mmu_spte_walk(vcpu, inspect_spte_has_rmap);
+}
+
+static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *sp;
+ int i;
+
+ list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+ u64 *pt = sp->spt;
+
+ if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+ continue;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ u64 ent = pt[i];
+
+ if (!(ent & PT_PRESENT_MASK))
+ continue;
+ if (!(ent & PT_WRITABLE_MASK))
+ continue;
+ inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]);
+ }
+ }
+ return;
+}
+
+static void audit_rmap(struct kvm_vcpu *vcpu)
+{
+ check_writable_mappings_rmap(vcpu);
+ count_rmaps(vcpu);
+}
+
+static void audit_write_protection(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *sp;
+ struct kvm_memory_slot *slot;
+ unsigned long *rmapp;
+ u64 *spte;
+ gfn_t gfn;
+
+ list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+ if (sp->role.direct)
+ continue;
+ if (sp->unsync)
+ continue;
+
+ gfn = unalias_gfn(vcpu->kvm, sp->gfn);
+ slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
+ rmapp = &slot->rmap[gfn - slot->base_gfn];
+
+ spte = rmap_next(vcpu->kvm, rmapp, NULL);
+ while (spte) {
+ if (*spte & PT_WRITABLE_MASK)
+ printk(KERN_ERR "%s: (%s) shadow page has "
+ "writable mappings: gfn %lx role %x\n",
+ __func__, audit_msg, sp->gfn,
+ sp->role.word);
+ spte = rmap_next(vcpu->kvm, rmapp, spte);
+ }
+ }
+}
+
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
+{
+ int olddbg = dbg;
+
+ dbg = 0;
+ audit_msg = msg;
+ audit_rmap(vcpu);
+ audit_write_protection(vcpu);
+ if (strcmp("pre pte write", audit_msg) != 0)
+ audit_mappings(vcpu);
+ audit_writable_sptes_have_rmaps(vcpu);
+ dbg = olddbg;
+}
+
+#endif
diff --git a/linux/x86/mmu.h b/linux/x86/mmu.h
index 7086ca8..be66759 100644
--- a/linux/x86/mmu.h
+++ b/linux/x86/mmu.h
@@ -49,17 +49,10 @@
#define PFERR_FETCH_MASK (1U << 4)
int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
-int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
-
-static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
-{
- return kvm->arch.n_max_mmu_pages -
- kvm->arch.n_used_mmu_pages;
-}
static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
- if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES))
+ if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
__kvm_mmu_free_some_pages(vcpu);
}
diff --git a/linux/x86/mmu_audit.c b/linux/x86/mmu_audit.c
deleted file mode 100644
index 3b2d201..0000000
--- a/linux/x86/mmu_audit.c
+++ /dev/null
@@ -1,344 +0,0 @@
-#ifndef KVM_UNIFDEF_H
-#define KVM_UNIFDEF_H
-
-#ifdef __i386__
-#ifndef CONFIG_X86_32
-#define CONFIG_X86_32 1
-#endif
-#endif
-
-#ifdef __x86_64__
-#ifndef CONFIG_X86_64
-#define CONFIG_X86_64 1
-#endif
-#endif
-
-#if defined(__i386__) || defined (__x86_64__)
-#ifndef CONFIG_X86
-#define CONFIG_X86 1
-#endif
-#endif
-
-#ifdef __ia64__
-#ifndef CONFIG_IA64
-#define CONFIG_IA64 1
-#endif
-#endif
-
-#ifdef __PPC__
-#ifndef CONFIG_PPC
-#define CONFIG_PPC 1
-#endif
-#endif
-
-#ifdef __s390__
-#ifndef CONFIG_S390
-#define CONFIG_S390 1
-#endif
-#endif
-
-#endif
-/*
- * mmu_audit.c:
- *
- * Audit code for KVM MMU
- *
- * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * Authors:
- * Yaniv Kamay <yaniv@qumranet.com>
- * Avi Kivity <avi@qumranet.com>
- * Marcelo Tosatti <mtosatti@redhat.com>
- * Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include <linux/ratelimit.h>
-
-#define audit_printk(kvm, fmt, args...) \
- printk(KERN_ERR "audit: (%s) error: " \
- fmt, audit_point_name[kvm->arch.audit_point], ##args)
-
-typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
-
-static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- inspect_spte_fn fn, int level)
-{
- int i;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- u64 *ent = sp->spt;
-
- fn(vcpu, ent + i, level);
-
- if (is_shadow_present_pte(ent[i]) &&
- !is_last_spte(ent[i], level)) {
- struct kvm_mmu_page *child;
-
- child = page_header(ent[i] & PT64_BASE_ADDR_MASK);
- __mmu_spte_walk(vcpu, child, fn, level - 1);
- }
- }
-}
-
-static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
-{
- int i;
- struct kvm_mmu_page *sp;
-
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return;
-
- if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
-
- sp = page_header(root);
- __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL);
- return;
- }
-
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
-
- if (root && VALID_PAGE(root)) {
- root &= PT64_BASE_ADDR_MASK;
- sp = page_header(root);
- __mmu_spte_walk(vcpu, sp, fn, 2);
- }
- }
-
- return;
-}
-
-typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp);
-
-static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
-{
- struct kvm_mmu_page *sp;
-
- list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
- fn(kvm, sp);
-}
-
-static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
-{
- struct kvm_mmu_page *sp;
- gfn_t gfn;
- pfn_t pfn;
- hpa_t hpa;
-
- sp = page_header(__pa(sptep));
-
- if (sp->unsync) {
- if (level != PT_PAGE_TABLE_LEVEL) {
- audit_printk(vcpu->kvm, "unsync sp: %p "
- "level = %d\n", sp, level);
- return;
- }
-
- if (*sptep == shadow_notrap_nonpresent_pte) {
- audit_printk(vcpu->kvm, "notrap spte in unsync "
- "sp: %p\n", sp);
- return;
- }
- }
-
- if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
- audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n",
- sp);
- return;
- }
-
- if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
- return;
-
- gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
- pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
-
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- return;
- }
-
- hpa = pfn << PAGE_SHIFT;
- if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
- audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
- "ent %llxn", vcpu->arch.mmu.root_level, pfn,
- hpa, *sptep);
-}
-
-static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
-{
- unsigned long *rmapp;
- struct kvm_mmu_page *rev_sp;
- gfn_t gfn;
-
-
- rev_sp = page_header(__pa(sptep));
- gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
-
- if (!gfn_to_memslot(kvm, gfn)) {
- if (!printk_ratelimit())
- return;
- audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
- audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
- (long int)(sptep - rev_sp->spt), rev_sp->gfn);
- dump_stack();
- return;
- }
-
- rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
- if (!*rmapp) {
- if (!printk_ratelimit())
- return;
- audit_printk(kvm, "no rmap for writable spte %llx\n",
- *sptep);
- dump_stack();
- }
-}
-
-static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level)
-{
- if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level))
- inspect_spte_has_rmap(vcpu->kvm, sptep);
-}
-
-static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
-{
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
-
- if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync)
- audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync "
- "root.\n", sp);
-}
-
-static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- int i;
-
- if (sp->role.level != PT_PAGE_TABLE_LEVEL)
- return;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (!is_rmap_spte(sp->spt[i]))
- continue;
-
- inspect_spte_has_rmap(kvm, sp->spt + i);
- }
-}
-
-static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- struct kvm_memory_slot *slot;
- unsigned long *rmapp;
- u64 *spte;
-
- if (sp->role.direct || sp->unsync || sp->role.invalid)
- return;
-
- slot = gfn_to_memslot(kvm, sp->gfn);
- rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
-
- spte = rmap_next(kvm, rmapp, NULL);
- while (spte) {
- if (is_writable_pte(*spte))
- audit_printk(kvm, "shadow page has writable "
- "mappings: gfn %llx role %x\n",
- sp->gfn, sp->role.word);
- spte = rmap_next(kvm, rmapp, spte);
- }
-}
-
-static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- check_mappings_rmap(kvm, sp);
- audit_write_protection(kvm, sp);
-}
-
-static void audit_all_active_sps(struct kvm *kvm)
-{
- walk_all_active_sps(kvm, audit_sp);
-}
-
-static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level)
-{
- audit_sptes_have_rmaps(vcpu, sptep, level);
- audit_mappings(vcpu, sptep, level);
- audit_spte_after_sync(vcpu, sptep, level);
-}
-
-static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
-{
- mmu_spte_walk(vcpu, audit_spte);
-}
-
-static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
-{
- static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
-
- if (!__ratelimit(&ratelimit_state))
- return;
-
- vcpu->kvm->arch.audit_point = point;
- audit_all_active_sps(vcpu->kvm);
- audit_vcpu_spte(vcpu);
-}
-
-static bool mmu_audit;
-
-static void mmu_audit_enable(void)
-{
- int ret;
-
- if (mmu_audit)
- return;
-
- ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
- WARN_ON(ret);
-
- mmu_audit = true;
-}
-
-static void mmu_audit_disable(void)
-{
- if (!mmu_audit)
- return;
-
- unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
- tracepoint_synchronize_unregister();
- mmu_audit = false;
-}
-
-static int mmu_audit_set(const char *val, const struct kernel_param *kp)
-{
- int ret;
- unsigned long enable;
-
- ret = strict_strtoul(val, 10, &enable);
- if (ret < 0)
- return -EINVAL;
-
- switch (enable) {
- case 0:
- mmu_audit_disable();
- break;
- case 1:
- mmu_audit_enable();
- break;
- default:
- return -EINVAL;
- }
-
- return 0;
-}
-
-static struct kernel_param_ops audit_param_ops = {
- .set = mmu_audit_set,
- .get = param_get_bool,
-};
-
-module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
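
The mmu_audit toggle being removed here is wired up through the kernel's module_param_cb() mechanism: a custom .set callback parses the 0/1 value and enables or disables the feature, while .get simply reuses param_get_bool. A minimal sketch of that pattern, with hypothetical names (feature_on, feature_set) and kstrtoul in place of the older strict_strtoul, is:

    #include <linux/kernel.h>
    #include <linux/module.h>
    #include <linux/moduleparam.h>

    static bool feature_on;

    /* Stand-ins for mmu_audit_enable()/mmu_audit_disable(). */
    static void feature_enable(void)  { feature_on = true;  }
    static void feature_disable(void) { feature_on = false; }

    static int feature_set(const char *val, const struct kernel_param *kp)
    {
            unsigned long enable;

            if (kstrtoul(val, 10, &enable))
                    return -EINVAL;

            switch (enable) {
            case 0:
                    feature_disable();
                    break;
            case 1:
                    feature_enable();
                    break;
            default:
                    return -EINVAL;
            }
            return 0;
    }

    static struct kernel_param_ops feature_param_ops = {
            .set = feature_set,
            .get = param_get_bool,
    };

    /* Exposes /sys/module/<module>/parameters/feature, mode 0644. */
    module_param_cb(feature, &feature_param_ops, &feature_on, 0644);
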
diff --git a/linux/x86/mmutrace.h b/linux/x86/mmutrace.h
index b60b4fd..3e4a5c6 100644
--- a/linux/x86/mmutrace.h
+++ b/linux/x86/mmutrace.h
@@ -6,12 +6,14 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvmmmu
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE mmutrace
#define KVM_MMU_PAGE_FIELDS \
__field(__u64, gfn) \
__field(__u32, role) \
__field(__u32, root_count) \
- __field(bool, unsync)
+ __field(__u32, unsync)
#define KVM_MMU_PAGE_ASSIGN(sp) \
__entry->gfn = sp->gfn; \
@@ -28,14 +30,14 @@
\
role.word = __entry->role; \
\
- trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" \
+ trace_seq_printf(p, "sp gfn %llx %u/%u q%u%s %s%s %spge" \
" %snxe root %u %s%c", \
- __entry->gfn, role.level, \
- role.cr4_pae ? " pae" : "", \
+ __entry->gfn, role.level, role.glevels, \
role.quadrant, \
role.direct ? " direct" : "", \
access_str[role.access], \
role.invalid ? " invalid" : "", \
+ role.cr4_pge ? "" : "!", \
role.nxe ? "" : "!", \
__entry->root_count, \
__entry->unsync ? "unsync" : "sync", 0); \
@@ -92,15 +94,15 @@ TRACE_EVENT(
TP_printk("pte %llx level %u", __entry->pte, __entry->level)
);
-DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class,
-
+/* We set a pte accessed bit */
+TRACE_EVENT(
+ kvm_mmu_set_accessed_bit,
TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
-
TP_ARGS(table_gfn, index, size),
TP_STRUCT__entry(
__field(__u64, gpa)
- ),
+ ),
TP_fast_assign(
__entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
@@ -110,20 +112,22 @@ DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class,
TP_printk("gpa %llx", __entry->gpa)
);
-/* We set a pte accessed bit */
-DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_accessed_bit,
-
+/* We set a pte dirty bit */
+TRACE_EVENT(
+ kvm_mmu_set_dirty_bit,
TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+ TP_ARGS(table_gfn, index, size),
- TP_ARGS(table_gfn, index, size)
-);
-
-/* We set a pte dirty bit */
-DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_dirty_bit,
+ TP_STRUCT__entry(
+ __field(__u64, gpa)
+ ),
- TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+ TP_fast_assign(
+ __entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
+ + index * size;
+ ),
- TP_ARGS(table_gfn, index, size)
+ TP_printk("gpa %llx", __entry->gpa)
);
TRACE_EVENT(
@@ -162,64 +166,55 @@ TRACE_EVENT(
__entry->created ? "new" : "existing")
);
-DECLARE_EVENT_CLASS(kvm_mmu_page_class,
-
+TRACE_EVENT(
+ kvm_mmu_sync_page,
TP_PROTO(struct kvm_mmu_page *sp),
TP_ARGS(sp),
TP_STRUCT__entry(
KVM_MMU_PAGE_FIELDS
- ),
+ ),
TP_fast_assign(
KVM_MMU_PAGE_ASSIGN(sp)
- ),
+ ),
TP_printk("%s", KVM_MMU_PAGE_PRINTK())
);
-DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_sync_page,
- TP_PROTO(struct kvm_mmu_page *sp),
-
- TP_ARGS(sp)
-);
-
-DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
+TRACE_EVENT(
+ kvm_mmu_unsync_page,
TP_PROTO(struct kvm_mmu_page *sp),
+ TP_ARGS(sp),
- TP_ARGS(sp)
-);
+ TP_STRUCT__entry(
+ KVM_MMU_PAGE_FIELDS
+ ),
-DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
- TP_PROTO(struct kvm_mmu_page *sp),
+ TP_fast_assign(
+ KVM_MMU_PAGE_ASSIGN(sp)
+ ),
- TP_ARGS(sp)
+ TP_printk("%s", KVM_MMU_PAGE_PRINTK())
);
TRACE_EVENT(
- kvm_mmu_audit,
- TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
- TP_ARGS(vcpu, audit_point),
+ kvm_mmu_zap_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+ TP_ARGS(sp),
TP_STRUCT__entry(
- __field(struct kvm_vcpu *, vcpu)
- __field(int, audit_point)
- ),
+ KVM_MMU_PAGE_FIELDS
+ ),
TP_fast_assign(
- __entry->vcpu = vcpu;
- __entry->audit_point = audit_point;
- ),
+ KVM_MMU_PAGE_ASSIGN(sp)
+ ),
- TP_printk("vcpu:%d %s", __entry->vcpu->cpu,
- audit_point_name[__entry->audit_point])
+ TP_printk("%s", KVM_MMU_PAGE_PRINTK())
);
-#endif /* _TRACE_KVMMMU_H */
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH .
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_FILE mmutrace
+#endif /* _TRACE_KVMMMU_H */
/* This part must be outside protection */
#include <trace/define_trace.h>
diff --git a/linux/x86/paging_tmpl.h b/linux/x86/paging_tmpl.h
index 6bccc24..81eab9a 100644
--- a/linux/x86/paging_tmpl.h
+++ b/linux/x86/paging_tmpl.h
@@ -7,7 +7,6 @@
* MMU support
*
* Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Yaniv Kamay <yaniv@qumranet.com>
@@ -67,12 +66,11 @@ struct guest_walker {
int level;
gfn_t table_gfn[PT_MAX_FULL_LEVELS];
pt_element_t ptes[PT_MAX_FULL_LEVELS];
- pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
unsigned pt_access;
unsigned pte_access;
gfn_t gfn;
- struct x86_exception fault;
+ u32 error_code;
};
static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
@@ -105,7 +103,7 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
#if PTTYPE == 64
- if (vcpu->arch.mmu.nx)
+ if (is_nx(vcpu))
access &= ~(gpte >> PT64_NX_SHIFT);
#endif
return access;
@@ -114,42 +112,32 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
/*
* Fetch a guest pte for a guest virtual address
*/
-static int FNAME(walk_addr_generic)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gva_t addr, u32 access)
+static int FNAME(walk_addr)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, gva_t addr,
+ int write_fault, int user_fault, int fetch_fault)
{
pt_element_t pte;
gfn_t table_gfn;
- unsigned index, pt_access, uninitialized_var(pte_access);
+ unsigned index, pt_access, pte_access;
gpa_t pte_gpa;
- bool eperm, present, rsvd_fault;
- int offset, write_fault, user_fault, fetch_fault;
-
- write_fault = access & PFERR_WRITE_MASK;
- user_fault = access & PFERR_USER_MASK;
- fetch_fault = access & PFERR_FETCH_MASK;
+ int rsvd_fault = 0;
trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
fetch_fault);
walk:
- present = true;
- eperm = rsvd_fault = false;
- walker->level = mmu->root_level;
- pte = mmu->get_cr3(vcpu);
-
+ walker->level = vcpu->arch.mmu.root_level;
+ pte = vcpu->arch.cr3;
#if PTTYPE == 64
- if (walker->level == PT32E_ROOT_LEVEL) {
- pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3);
+ if (!is_long_mode(vcpu)) {
+ pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
trace_kvm_mmu_paging_element(pte, walker->level);
- if (!is_present_gpte(pte)) {
- present = false;
- goto error;
- }
+ if (!is_present_gpte(pte))
+ goto not_present;
--walker->level;
}
#endif
ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
- (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
+ (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
pt_access = ACC_ALL;
@@ -157,49 +145,42 @@ walk:
index = PT_INDEX(addr, walker->level);
table_gfn = gpte_to_gfn(pte);
- offset = index * sizeof(pt_element_t);
- pte_gpa = gfn_to_gpa(table_gfn) + offset;
+ pte_gpa = gfn_to_gpa(table_gfn);
+ pte_gpa += index * sizeof(pt_element_t);
walker->table_gfn[walker->level - 1] = table_gfn;
walker->pte_gpa[walker->level - 1] = pte_gpa;
- if (kvm_read_guest_page_mmu(vcpu, mmu, table_gfn, &pte,
- offset, sizeof(pte),
- PFERR_USER_MASK|PFERR_WRITE_MASK)) {
- present = false;
- break;
- }
+ if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)))
+ goto not_present;
trace_kvm_mmu_paging_element(pte, walker->level);
- if (!is_present_gpte(pte)) {
- present = false;
- break;
- }
+ if (!is_present_gpte(pte))
+ goto not_present;
- if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) {
- rsvd_fault = true;
- break;
- }
+ rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
+ if (rsvd_fault)
+ goto access_error;
if (write_fault && !is_writable_pte(pte))
if (user_fault || is_write_protection(vcpu))
- eperm = true;
+ goto access_error;
if (user_fault && !(pte & PT_USER_MASK))
- eperm = true;
+ goto access_error;
#if PTTYPE == 64
- if (fetch_fault && (pte & PT64_NX_MASK))
- eperm = true;
+ if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK))
+ goto access_error;
#endif
- if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) {
+ if (!(pte & PT_ACCESSED_MASK)) {
trace_kvm_mmu_set_accessed_bit(table_gfn, index,
sizeof(pte));
+ mark_page_dirty(vcpu->kvm, table_gfn);
if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
index, pte, pte|PT_ACCESSED_MASK))
goto walk;
- mark_page_dirty(vcpu->kvm, table_gfn);
pte |= PT_ACCESSED_MASK;
}
@@ -209,32 +190,21 @@ walk:
if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
((walker->level == PT_DIRECTORY_LEVEL) &&
- is_large_pte(pte) &&
+ (pte & PT_PAGE_SIZE_MASK) &&
(PTTYPE == 64 || is_pse(vcpu))) ||
((walker->level == PT_PDPE_LEVEL) &&
- is_large_pte(pte) &&
- mmu->root_level == PT64_ROOT_LEVEL)) {
+ (pte & PT_PAGE_SIZE_MASK) &&
+ is_long_mode(vcpu))) {
int lvl = walker->level;
- gpa_t real_gpa;
- gfn_t gfn;
- u32 ac;
- gfn = gpte_to_gfn_lvl(pte, lvl);
- gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
+ walker->gfn = gpte_to_gfn_lvl(pte, lvl);
+ walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl))
+ >> PAGE_SHIFT;
if (PTTYPE == 32 &&
walker->level == PT_DIRECTORY_LEVEL &&
is_cpuid_PSE36())
- gfn += pse36_gfn_delta(pte);
-
- ac = write_fault | fetch_fault | user_fault;
-
- real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn),
- ac);
- if (real_gpa == UNMAPPED_GVA)
- return 0;
-
- walker->gfn = real_gpa >> PAGE_SHIFT;
+ walker->gfn += pse36_gfn_delta(pte);
break;
}
@@ -243,18 +213,15 @@ walk:
--walker->level;
}
- if (!present || eperm || rsvd_fault)
- goto error;
-
if (write_fault && !is_dirty_gpte(pte)) {
bool ret;
trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
+ mark_page_dirty(vcpu->kvm, table_gfn);
ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
pte|PT_DIRTY_MASK);
if (ret)
goto walk;
- mark_page_dirty(vcpu->kvm, table_gfn);
pte |= PT_DIRTY_MASK;
walker->ptes[walker->level - 1] = pte;
}
@@ -262,71 +229,30 @@ walk:
walker->pt_access = pt_access;
walker->pte_access = pte_access;
pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
- __func__, (u64)pte, pte_access, pt_access);
+ __func__, (u64)pte, pt_access, pte_access);
return 1;
-error:
- walker->fault.vector = PF_VECTOR;
- walker->fault.error_code_valid = true;
- walker->fault.error_code = 0;
- if (present)
- walker->fault.error_code |= PFERR_PRESENT_MASK;
+not_present:
+ walker->error_code = 0;
+ goto err;
- walker->fault.error_code |= write_fault | user_fault;
+access_error:
+ walker->error_code = PFERR_PRESENT_MASK;
- if (fetch_fault && mmu->nx)
- walker->fault.error_code |= PFERR_FETCH_MASK;
+err:
+ if (write_fault)
+ walker->error_code |= PFERR_WRITE_MASK;
+ if (user_fault)
+ walker->error_code |= PFERR_USER_MASK;
+ if (fetch_fault)
+ walker->error_code |= PFERR_FETCH_MASK;
if (rsvd_fault)
- walker->fault.error_code |= PFERR_RSVD_MASK;
-
- walker->fault.address = addr;
- walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
-
- trace_kvm_mmu_walker_error(walker->fault.error_code);
+ walker->error_code |= PFERR_RSVD_MASK;
+ trace_kvm_mmu_walker_error(walker->error_code);
return 0;
}
-static int FNAME(walk_addr)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, gva_t addr, u32 access)
-{
- return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
- access);
-}
-
-static int FNAME(walk_addr_nested)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, gva_t addr,
- u32 access)
-{
- return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
- addr, access);
-}
-
-static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *spte,
- pt_element_t gpte)
-{
- u64 nonpresent = shadow_trap_nonpresent_pte;
-
- if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
- goto no_present;
-
- if (!is_present_gpte(gpte)) {
- if (!sp->unsync)
- nonpresent = shadow_notrap_nonpresent_pte;
- goto no_present;
- }
-
- if (!(gpte & PT_ACCESSED_MASK))
- goto no_present;
-
- return false;
-
-no_present:
- drop_spte(vcpu->kvm, spte, nonpresent);
- return true;
-}
-
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
u64 *spte, const void *pte)
{
pt_element_t gpte;
@@ -334,11 +260,13 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
pfn_t pfn;
gpte = *(const pt_element_t *)pte;
- if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
+ if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
+ if (!is_present_gpte(gpte))
+ __set_spte(spte, shadow_notrap_nonpresent_pte);
return;
-
+ }
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
return;
pfn = vcpu->arch.update_pte.pfn;
@@ -348,181 +276,91 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return;
kvm_get_pfn(pfn);
/*
- * we call mmu_set_spte() with host_writable = true beacuse that
+	 * we call mmu_set_spte() with reset_host_protection = true because that
* vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
*/
- mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
- is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL,
+ mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
+ gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL,
gpte_to_gfn(gpte), pfn, true, true);
}
-static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
- struct guest_walker *gw, int level)
-{
- pt_element_t curr_pte;
- gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
- u64 mask;
- int r, index;
-
- if (level == PT_PAGE_TABLE_LEVEL) {
- mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
- base_gpa = pte_gpa & ~mask;
- index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
-
- r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
- gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
- curr_pte = gw->prefetch_ptes[index];
- } else
- r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa,
- &curr_pte, sizeof(curr_pte));
-
- return r || curr_pte != gw->ptes[level - 1];
-}
-
-static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
- u64 *sptep)
-{
- struct kvm_mmu_page *sp;
- pt_element_t *gptep = gw->prefetch_ptes;
- u64 *spte;
- int i;
-
- sp = page_header(__pa(sptep));
-
- if (sp->role.level > PT_PAGE_TABLE_LEVEL)
- return;
-
- if (sp->role.direct)
- return __direct_pte_prefetch(vcpu, sp, sptep);
-
- i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
- spte = sp->spt + i;
-
- for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
- pt_element_t gpte;
- unsigned pte_access;
- gfn_t gfn;
- pfn_t pfn;
- bool dirty;
-
- if (spte == sptep)
- continue;
-
- if (*spte != shadow_trap_nonpresent_pte)
- continue;
-
- gpte = gptep[i];
-
- if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
- continue;
-
- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
- gfn = gpte_to_gfn(gpte);
- dirty = is_dirty_gpte(gpte);
- pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
- (pte_access & ACC_WRITE_MASK) && dirty);
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- break;
- }
-
- mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
- dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
- pfn, true, true);
- }
-}
-
/*
* Fetch a shadow pte for a specific level in the paging hierarchy.
*/
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
int user_fault, int write_fault, int hlevel,
- int *ptwrite, pfn_t pfn, bool map_writable,
- bool prefault)
+ int *ptwrite, pfn_t pfn)
{
unsigned access = gw->pt_access;
- struct kvm_mmu_page *sp = NULL;
- bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
- int top_level;
- unsigned direct_access;
- struct kvm_shadow_walk_iterator it;
+ struct kvm_mmu_page *shadow_page;
+ u64 spte, *sptep = NULL;
+ int direct;
+ gfn_t table_gfn;
+ int r;
+ int level;
+ pt_element_t curr_pte;
+ struct kvm_shadow_walk_iterator iterator;
if (!is_present_gpte(gw->ptes[gw->level - 1]))
return NULL;
- direct_access = gw->pt_access & gw->pte_access;
- if (!dirty)
- direct_access &= ~ACC_WRITE_MASK;
-
- top_level = vcpu->arch.mmu.root_level;
- if (top_level == PT32E_ROOT_LEVEL)
- top_level = PT32_ROOT_LEVEL;
- /*
- * Verify that the top-level gpte is still there. Since the page
- * is a root page, it is either write protected (and cannot be
- * changed from now on) or it is invalid (in which case, we don't
- * really care if it changes underneath us after this point).
- */
- if (FNAME(gpte_changed)(vcpu, gw, top_level))
- goto out_gpte_changed;
-
- for (shadow_walk_init(&it, vcpu, addr);
- shadow_walk_okay(&it) && it.level > gw->level;
- shadow_walk_next(&it)) {
- gfn_t table_gfn;
-
- drop_large_spte(vcpu, it.sptep);
-
- sp = NULL;
- if (!is_shadow_present_pte(*it.sptep)) {
- table_gfn = gw->table_gfn[it.level - 2];
- sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
- false, access, it.sptep);
+ for_each_shadow_entry(vcpu, addr, iterator) {
+ level = iterator.level;
+ sptep = iterator.sptep;
+ if (iterator.level == hlevel) {
+ mmu_set_spte(vcpu, sptep, access,
+ gw->pte_access & access,
+ user_fault, write_fault,
+ gw->ptes[gw->level-1] & PT_DIRTY_MASK,
+ ptwrite, level,
+ gw->gfn, pfn, false, true);
+ break;
}
- /*
- * Verify that the gpte in the page we've just write
- * protected is still there.
- */
- if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
- goto out_gpte_changed;
-
- if (sp)
- link_shadow_page(it.sptep, sp);
- }
-
- for (;
- shadow_walk_okay(&it) && it.level > hlevel;
- shadow_walk_next(&it)) {
- gfn_t direct_gfn;
-
- validate_direct_spte(vcpu, it.sptep, direct_access);
-
- drop_large_spte(vcpu, it.sptep);
-
- if (is_shadow_present_pte(*it.sptep))
+ if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
continue;
- direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
-
- sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
- true, direct_access, it.sptep);
- link_shadow_page(it.sptep, sp);
- }
+ if (is_large_pte(*sptep)) {
+ rmap_remove(vcpu->kvm, sptep);
+ __set_spte(sptep, shadow_trap_nonpresent_pte);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ }
- mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
- user_fault, write_fault, dirty, ptwrite, it.level,
- gw->gfn, pfn, prefault, map_writable);
- FNAME(pte_prefetch)(vcpu, gw, it.sptep);
+ if (level <= gw->level) {
+ int delta = level - gw->level + 1;
+ direct = 1;
+ if (!is_dirty_gpte(gw->ptes[level - delta]))
+ access &= ~ACC_WRITE_MASK;
+ table_gfn = gpte_to_gfn(gw->ptes[level - delta]);
+ /* advance table_gfn when emulating 1gb pages with 4k */
+ if (delta == 0)
+ table_gfn += PT_INDEX(addr, level);
+ } else {
+ direct = 0;
+ table_gfn = gw->table_gfn[level - 2];
+ }
+ shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
+ direct, access, sptep);
+ if (!direct) {
+ r = kvm_read_guest_atomic(vcpu->kvm,
+ gw->pte_gpa[level - 2],
+ &curr_pte, sizeof(curr_pte));
+ if (r || curr_pte != gw->ptes[level - 2]) {
+ kvm_mmu_put_page(shadow_page, sptep);
+ kvm_release_pfn_clean(pfn);
+ sptep = NULL;
+ break;
+ }
+ }
- return it.sptep;
+ spte = __pa(shadow_page->spt)
+ | PT_PRESENT_MASK | PT_ACCESSED_MASK
+ | PT_WRITABLE_MASK | PT_USER_MASK;
+ *sptep = spte;
+ }
-out_gpte_changed:
- if (sp)
- kvm_mmu_put_page(sp, it.sptep);
- kvm_release_pfn_clean(pfn);
- return NULL;
+ return sptep;
}
/*
@@ -539,22 +377,22 @@ out_gpte_changed:
* Returns: 1 if we need to emulate the instruction, 0 otherwise, or
* a negative value on error.
*/
-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
- bool prefault)
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
+ u32 error_code)
{
int write_fault = error_code & PFERR_WRITE_MASK;
int user_fault = error_code & PFERR_USER_MASK;
+ int fetch_fault = error_code & PFERR_FETCH_MASK;
struct guest_walker walker;
u64 *sptep;
int write_pt = 0;
int r;
pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
- int force_pt_level;
unsigned long mmu_seq;
- bool map_writable;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
+ kvm_mmu_audit(vcpu, "pre page fault");
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -563,52 +401,41 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
/*
* Look up the guest pte for the faulting address.
*/
- r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);
+ r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
+ fetch_fault);
/*
* The page is not mapped by the guest. Let the guest handle it.
*/
if (!r) {
pgprintk("%s: guest page fault\n", __func__);
- if (!prefault) {
- inject_page_fault(vcpu, &walker.fault);
- /* reset fork detector */
- vcpu->arch.last_pt_write_count = 0;
- }
+ inject_page_fault(vcpu, addr, walker.error_code);
+ vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
return 0;
}
- if (walker.level >= PT_DIRECTORY_LEVEL)
- force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
- else
- force_pt_level = 1;
- if (!force_pt_level) {
+ if (walker.level >= PT_DIRECTORY_LEVEL) {
level = min(walker.level, mapping_level(vcpu, walker.gfn));
walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
}
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
-
- if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
- &map_writable))
- return 0;
+ pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
/* mmio */
- if (is_error_pfn(pfn))
- return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);
+ if (is_error_pfn(pfn)) {
+ pgprintk("gfn %lx is mmio\n", walker.gfn);
+ kvm_release_pfn_clean(pfn);
+ return 1;
+ }
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
-
- trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
kvm_mmu_free_some_pages(vcpu);
- if (!force_pt_level)
- transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
- level, &write_pt, pfn, map_writable, prefault);
- (void)sptep;
+ level, &write_pt, pfn);
pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
sptep, *sptep, write_pt);
@@ -616,7 +443,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
++vcpu->stat.pf_fixed;
- trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
+ kvm_mmu_audit(vcpu, "post page fault (fixed)");
spin_unlock(&vcpu->kvm->mmu_lock);
return write_pt;
@@ -630,8 +457,6 @@ out_unlock:
static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
{
struct kvm_shadow_walk_iterator iterator;
- struct kvm_mmu_page *sp;
- gpa_t pte_gpa = -1;
int level;
u64 *sptep;
int need_flush = 0;
@@ -642,83 +467,46 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
level = iterator.level;
sptep = iterator.sptep;
- sp = page_header(__pa(sptep));
- if (is_last_spte(*sptep, level)) {
- int offset, shift;
-
- if (!sp->unsync)
- break;
-
- shift = PAGE_SHIFT -
- (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
- offset = sp->role.quadrant << shift;
-
- pte_gpa = (sp->gfn << PAGE_SHIFT) + offset;
- pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
+ if (level == PT_PAGE_TABLE_LEVEL ||
+ ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
+ ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
if (is_shadow_present_pte(*sptep)) {
+ rmap_remove(vcpu->kvm, sptep);
if (is_large_pte(*sptep))
--vcpu->kvm->stat.lpages;
- drop_spte(vcpu->kvm, sptep,
- shadow_trap_nonpresent_pte);
need_flush = 1;
- } else
- __set_spte(sptep, shadow_trap_nonpresent_pte);
+ }
+ __set_spte(sptep, shadow_trap_nonpresent_pte);
break;
}
- if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
+ if (!is_shadow_present_pte(*sptep))
break;
}
if (need_flush)
kvm_flush_remote_tlbs(vcpu->kvm);
-
- atomic_inc(&vcpu->kvm->arch.invlpg_counter);
-
spin_unlock(&vcpu->kvm->mmu_lock);
-
- if (pte_gpa == -1)
- return;
-
- if (mmu_topup_memory_caches(vcpu))
- return;
- kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0);
}
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
- struct x86_exception *exception)
+ u32 *error)
{
struct guest_walker walker;
gpa_t gpa = UNMAPPED_GVA;
int r;
- r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);
+ r = FNAME(walk_addr)(&walker, vcpu, vaddr,
+ !!(access & PFERR_WRITE_MASK),
+ !!(access & PFERR_USER_MASK),
+ !!(access & PFERR_FETCH_MASK));
if (r) {
gpa = gfn_to_gpa(walker.gfn);
gpa |= vaddr & ~PAGE_MASK;
- } else if (exception)
- *exception = walker.fault;
-
- return gpa;
-}
-
-static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
- u32 access,
- struct x86_exception *exception)
-{
- struct guest_walker walker;
- gpa_t gpa = UNMAPPED_GVA;
- int r;
-
- r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);
-
- if (r) {
- gpa = gfn_to_gpa(walker.gfn);
- gpa |= vaddr & ~PAGE_MASK;
- } else if (exception)
- *exception = walker.fault;
+ } else if (error)
+ *error = walker.error_code;
return gpa;
}
@@ -757,68 +545,59 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
* Using the cached information from sp->gfns is safe because:
* - The spte has a reference to the struct page, so the pfn for a given gfn
* can't change unless all sptes pointing to it are nuked first.
- *
- * Note:
- * We should flush all tlbs if spte is dropped even though guest is
- * responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page
- * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
- * used by guest then tlbs are not flushed, so guest is allowed to access the
- * freed pages.
- * And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
+ * - Alias changes zap the entire shadow cache.
*/
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
int i, offset, nr_present;
- bool host_writable;
- gpa_t first_pte_gpa;
+ bool reset_host_protection;
offset = nr_present = 0;
- /* direct kvm_mmu_page can not be unsync. */
- BUG_ON(sp->role.direct);
-
if (PTTYPE == 32)
offset = sp->role.quadrant << PT64_LEVEL_BITS;
- first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
-
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
unsigned pte_access;
pt_element_t gpte;
gpa_t pte_gpa;
- gfn_t gfn;
+ gfn_t gfn = sp->gfns[i];
if (!is_shadow_present_pte(sp->spt[i]))
continue;
- pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
+ pte_gpa = gfn_to_gpa(sp->gfn);
+ pte_gpa += (i+offset) * sizeof(pt_element_t);
if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
sizeof(pt_element_t)))
return -EINVAL;
- gfn = gpte_to_gfn(gpte);
-
- if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
- vcpu->kvm->tlbs_dirty++;
- continue;
- }
+ if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) ||
+ !(gpte & PT_ACCESSED_MASK)) {
+ u64 nonpresent;
- if (gfn != sp->gfns[i]) {
- drop_spte(vcpu->kvm, &sp->spt[i],
- shadow_trap_nonpresent_pte);
- vcpu->kvm->tlbs_dirty++;
+ rmap_remove(vcpu->kvm, &sp->spt[i]);
+ if (is_present_gpte(gpte))
+ nonpresent = shadow_trap_nonpresent_pte;
+ else
+ nonpresent = shadow_notrap_nonpresent_pte;
+ __set_spte(&sp->spt[i], nonpresent);
continue;
}
nr_present++;
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
- host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
-
+ if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) {
+ pte_access &= ~ACC_WRITE_MASK;
+ reset_host_protection = 0;
+ } else {
+ reset_host_protection = 1;
+ }
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
spte_to_pfn(sp->spt[i]), true, false,
- host_writable);
+ reset_host_protection);
}
return !nr_present;
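
The walker in this file builds the #PF error code by hand on its not_present/access_error paths. As a reference for how those PFERR_* bits combine (they mirror the architectural x86 page-fault error-code layout: present, write, user, reserved, fetch), a small standalone sketch follows; pf_error_code is an illustrative helper name, not part of the patch:

    #include <stdint.h>

    #define PFERR_PRESENT_MASK (1U << 0)
    #define PFERR_WRITE_MASK   (1U << 1)
    #define PFERR_USER_MASK    (1U << 2)
    #define PFERR_RSVD_MASK    (1U << 3)
    #define PFERR_FETCH_MASK   (1U << 4)

    /*
     * Assemble a page-fault error code: the present bit distinguishes a
     * not-present fault from a permission/reserved-bit fault, and the
     * remaining bits echo the kind of access that faulted.
     */
    static uint32_t pf_error_code(int present, int write, int user,
                                  int fetch, int rsvd)
    {
            uint32_t ec = 0;

            if (present)
                    ec |= PFERR_PRESENT_MASK;
            if (write)
                    ec |= PFERR_WRITE_MASK;
            if (user)
                    ec |= PFERR_USER_MASK;
            if (fetch)
                    ec |= PFERR_FETCH_MASK;
            if (rsvd)
                    ec |= PFERR_RSVD_MASK;
            return ec;
    }
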
diff --git a/linux/x86/svm.c b/linux/x86/svm.c
index bd6a099..174aa10 100644
--- a/linux/x86/svm.c
+++ b/linux/x86/svm.c
@@ -44,7 +44,6 @@
* AMD SVM support
*
* Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Yaniv Kamay <yaniv@qumranet.com>
@@ -69,16 +68,14 @@
#include <linux/ftrace_event.h>
#include <linux/slab.h>
-#include <asm/tlbflush.h>
#include <asm/desc.h>
-#include <asm/kvm_para.h>
#include <asm/virtext.h>
#include "trace.h"
#define __ex(x) __kvm_handle_fault_on_reboot(x)
-MODULE_INFO(version, "kvm-kmod-2.6.38-rc7");
+MODULE_INFO(version, "kvm-kmod-2.6.34");
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
@@ -88,15 +85,10 @@ MODULE_LICENSE("GPL");
#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3
-#define SVM_FEATURE_NPT (1 << 0)
-#define SVM_FEATURE_LBRV (1 << 1)
-#define SVM_FEATURE_SVML (1 << 2)
-#define SVM_FEATURE_NRIP (1 << 3)
-#define SVM_FEATURE_TSC_RATE (1 << 4)
-#define SVM_FEATURE_VMCB_CLEAN (1 << 5)
-#define SVM_FEATURE_FLUSH_ASID (1 << 6)
-#define SVM_FEATURE_DECODE_ASSIST (1 << 7)
-#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
+#define SVM_FEATURE_NPT (1 << 0)
+#define SVM_FEATURE_LBRV (1 << 1)
+#define SVM_FEATURE_SVML (1 << 2)
+#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
@@ -104,8 +96,6 @@ MODULE_LICENSE("GPL");
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
-static bool erratum_383_found __read_mostly;
-
static const u32 host_save_user_msrs[] = {
#ifdef CONFIG_X86_64
MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
@@ -121,7 +111,6 @@ struct kvm_vcpu;
struct nested_state {
struct vmcb *hsave;
u64 hsave_msr;
- u64 vm_cr_msr;
u64 vmcb;
/* These are the merged vectors */
@@ -129,32 +118,20 @@ struct nested_state {
/* gpa pointers to the real vectors */
u64 vmcb_msrpm;
- u64 vmcb_iopm;
/* A VMEXIT is required but not yet emulated */
bool exit_required;
- /*
- * If we vmexit during an instruction emulation we need this to restore
- * the l1 guest rip after the emulation
- */
- unsigned long vmexit_rip;
- unsigned long vmexit_rsp;
- unsigned long vmexit_rax;
-
/* cache for intercepts of the guest */
- u32 intercept_cr;
- u32 intercept_dr;
+ u16 intercept_cr_read;
+ u16 intercept_cr_write;
+ u16 intercept_dr_read;
+ u16 intercept_dr_write;
u32 intercept_exceptions;
u64 intercept;
- /* Nested Paging related state */
- u64 nested_cr3;
};
-#define MSRPM_OFFSETS 16
-static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
-
struct vcpu_svm {
struct kvm_vcpu vcpu;
struct vmcb *vmcb;
@@ -167,52 +144,20 @@ struct vcpu_svm {
u64 next_rip;
u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
- struct {
- u16 fs;
- u16 gs;
- u16 ldt;
- u64 gs_base;
- } host;
+ u64 host_gs_base;
u32 *msrpm;
struct nested_state nested;
bool nmi_singlestep;
-
- unsigned int3_injected;
- unsigned long int3_rip;
- u32 apf_reason;
-};
-
-#define MSR_INVALID 0xffffffffU
-
-static struct svm_direct_access_msrs {
- u32 index; /* Index of the MSR */
- bool always; /* True if intercept is always on */
-} direct_access_msrs[] = {
- { .index = MSR_STAR, .always = true },
- { .index = MSR_IA32_SYSENTER_CS, .always = true },
-#ifdef CONFIG_X86_64
- { .index = MSR_GS_BASE, .always = true },
- { .index = MSR_FS_BASE, .always = true },
- { .index = MSR_KERNEL_GS_BASE, .always = true },
- { .index = MSR_LSTAR, .always = true },
- { .index = MSR_CSTAR, .always = true },
- { .index = MSR_SYSCALL_MASK, .always = true },
-#endif
- { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
- { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
- { .index = MSR_IA32_LASTINTFROMIP, .always = false },
- { .index = MSR_IA32_LASTINTTOIP, .always = false },
- { .index = MSR_INVALID, .always = false },
};
/* enable NPT for AMD64 and X86 with PAE */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
static bool npt_enabled = true;
#else
-static bool npt_enabled;
+static bool npt_enabled = false;
#endif
static int npt = 1;
@@ -225,156 +170,18 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu);
static void svm_complete_interrupts(struct vcpu_svm *svm);
static int nested_svm_exit_handled(struct vcpu_svm *svm);
-static int nested_svm_intercept(struct vcpu_svm *svm);
static int nested_svm_vmexit(struct vcpu_svm *svm);
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
-enum {
- VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
- pause filter count */
- VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */
- VMCB_ASID, /* ASID */
- VMCB_INTR, /* int_ctl, int_vector */
- VMCB_NPT, /* npt_en, nCR3, gPAT */
- VMCB_CR, /* CR0, CR3, CR4, EFER */
- VMCB_DR, /* DR6, DR7 */
- VMCB_DT, /* GDT, IDT */
- VMCB_SEG, /* CS, DS, SS, ES, CPL */
- VMCB_CR2, /* CR2 only */
- VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
- VMCB_DIRTY_MAX,
-};
-
-/* TPR and CR2 are always written before VMRUN */
-#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
-
-static inline void mark_all_dirty(struct vmcb *vmcb)
-{
- vmcb->control.clean = 0;
-}
-
-static inline void mark_all_clean(struct vmcb *vmcb)
-{
- vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
- & ~VMCB_ALWAYS_DIRTY_MASK;
-}
-
-static inline void mark_dirty(struct vmcb *vmcb, int bit)
-{
- vmcb->control.clean &= ~(1 << bit);
-}
-
static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
{
return container_of(vcpu, struct vcpu_svm, vcpu);
}
-static void recalc_intercepts(struct vcpu_svm *svm)
-{
- struct vmcb_control_area *c, *h;
- struct nested_state *g;
-
- mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
-
- if (!is_guest_mode(&svm->vcpu))
- return;
-
- c = &svm->vmcb->control;
- h = &svm->nested.hsave->control;
- g = &svm->nested;
-
- c->intercept_cr = h->intercept_cr | g->intercept_cr;
- c->intercept_dr = h->intercept_dr | g->intercept_dr;
- c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
- c->intercept = h->intercept | g->intercept;
-}
-
-static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
-{
- if (is_guest_mode(&svm->vcpu))
- return svm->nested.hsave;
- else
- return svm->vmcb;
-}
-
-static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept_cr |= (1U << bit);
-
- recalc_intercepts(svm);
-}
-
-static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept_cr &= ~(1U << bit);
-
- recalc_intercepts(svm);
-}
-
-static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- return vmcb->control.intercept_cr & (1U << bit);
-}
-
-static inline void set_dr_intercept(struct vcpu_svm *svm, int bit)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept_dr |= (1U << bit);
-
- recalc_intercepts(svm);
-}
-
-static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept_dr &= ~(1U << bit);
-
- recalc_intercepts(svm);
-}
-
-static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept_exceptions |= (1U << bit);
-
- recalc_intercepts(svm);
-}
-
-static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
+static inline bool is_nested(struct vcpu_svm *svm)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept_exceptions &= ~(1U << bit);
-
- recalc_intercepts(svm);
-}
-
-static inline void set_intercept(struct vcpu_svm *svm, int bit)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept |= (1ULL << bit);
-
- recalc_intercepts(svm);
-}
-
-static inline void clr_intercept(struct vcpu_svm *svm, int bit)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept &= ~(1ULL << bit);
-
- recalc_intercepts(svm);
+ return svm->nested.vmcb;
}
static inline void enable_gif(struct vcpu_svm *svm)
@@ -397,8 +204,8 @@ static unsigned long iopm_base;
struct kvm_ldttss_desc {
u16 limit0;
u16 base0;
- unsigned base1:8, type:5, dpl:2, p:1;
- unsigned limit1:4, zero0:3, g:1, base2:8;
+ unsigned base1 : 8, type : 5, dpl : 2, p : 1;
+ unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
u32 base3;
u32 zero1;
} __attribute__((packed));
@@ -428,29 +235,13 @@ static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
#define MSRS_RANGE_SIZE 2048
#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
-static u32 svm_msrpm_offset(u32 msr)
-{
- u32 offset;
- int i;
-
- for (i = 0; i < NUM_MSR_MAPS; i++) {
- if (msr < msrpm_ranges[i] ||
- msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
- continue;
-
- offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
- offset += (i * MSRS_RANGE_SIZE); /* add range offset */
-
- /* Now we have the u8 offset - but need the u32 offset */
- return offset / 4;
- }
+#define MAX_INST_SIZE 15
- /* MSR not in any range */
- return MSR_INVALID;
+static inline u32 svm_has(u32 feat)
+{
+ return svm_features & feat;
}
-#define MAX_INST_SIZE 15
-
static inline void clgi(void)
{
asm volatile (__ex(SVM_CLGI));
@@ -463,26 +254,43 @@ static inline void stgi(void)
static inline void invlpga(unsigned long addr, u32 asid)
{
- asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
+ asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid));
}
-static int get_npt_level(void)
+static inline void force_new_asid(struct kvm_vcpu *vcpu)
{
-#ifdef CONFIG_X86_64
- return PT64_ROOT_LEVEL;
-#else
- return PT32E_ROOT_LEVEL;
-#endif
+ to_svm(vcpu)->asid_generation--;
+}
+
+static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
+{
+ force_new_asid(vcpu);
}
static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
- vcpu->arch.efer = efer;
if (!npt_enabled && !(efer & EFER_LMA))
efer &= ~EFER_LME;
to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
- mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
+ vcpu->arch.efer = efer;
+}
+
+static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
+ bool has_error_code, u32 error_code)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /* If we are within a nested VM we'd better #VMEXIT and let the
+ guest handle the exception */
+ if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
+ return;
+
+ svm->vmcb->control.event_inj = nr
+ | SVM_EVTINJ_VALID
+ | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
+ | SVM_EVTINJ_TYPE_EXEPT;
+ svm->vmcb->control.event_inj_err = error_code;
}
static int is_external_interrupt(u32 info)
@@ -497,7 +305,7 @@ static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
u32 ret = 0;
if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
- ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
+ ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS;
return ret & mask;
}
@@ -516,11 +324,8 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (svm->vmcb->control.next_rip != 0)
- svm->next_rip = svm->vmcb->control.next_rip;
-
if (!svm->next_rip) {
- if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
+ if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
EMULATE_DONE)
printk(KERN_DEBUG "%s: NOP\n", __func__);
return;
@@ -533,67 +338,6 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
svm_set_interrupt_shadow(vcpu, 0);
}
-static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
- bool has_error_code, u32 error_code,
- bool reinject)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- /*
- * If we are within a nested VM we'd better #VMEXIT and let the guest
- * handle the exception
- */
- if (!reinject &&
- nested_svm_check_exception(svm, nr, has_error_code, error_code))
- return;
-
- if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
- unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
-
- /*
- * For guest debugging where we have to reinject #BP if some
- * INT3 is guest-owned:
- * Emulate nRIP by moving RIP forward. Will fail if injection
- * raises a fault that is not intercepted. Still better than
- * failing in all cases.
- */
- skip_emulated_instruction(&svm->vcpu);
- rip = kvm_rip_read(&svm->vcpu);
- svm->int3_rip = rip + svm->vmcb->save.cs.base;
- svm->int3_injected = rip - old_rip;
- }
-
- svm->vmcb->control.event_inj = nr
- | SVM_EVTINJ_VALID
- | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
- | SVM_EVTINJ_TYPE_EXEPT;
- svm->vmcb->control.event_inj_err = error_code;
-}
-
-static void svm_init_erratum_383(void)
-{
- u32 low, high;
- int err;
- u64 val;
-
- if (!kvm_cpu_has_amd_erratum(kvm_amd_erratum_383))
- return;
-
- /* Use _safe variants to not break nested virtualization */
- val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
- if (err)
- return;
-
- val |= (1ULL << 47);
-
- low = lower_32_bits(val);
- high = upper_32_bits(val);
-
- kvm_native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
-
- erratum_383_found = true;
-}
-
static int has_svm(void)
{
const char *msg;
@@ -616,7 +360,7 @@ static int svm_hardware_enable(void *garbage)
struct svm_cpu_data *sd;
uint64_t efer;
- struct kvm_desc_ptr gdt_descr;
+ struct descriptor_table gdt_descr;
struct kvm_desc_struct *gdt;
int me = raw_smp_processor_id();
@@ -641,16 +385,14 @@ static int svm_hardware_enable(void *garbage)
sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
sd->next_asid = sd->max_asid + 1;
- kvm_native_store_gdt(&gdt_descr);
- gdt = (struct kvm_desc_struct *)gdt_descr.address;
+ kvm_get_gdt(&gdt_descr);
+ gdt = (struct kvm_desc_struct *)gdt_descr.base;
sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
wrmsrl(MSR_EFER, efer | EFER_SVME);
wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
- svm_init_erratum_383();
-
return 0;
}
@@ -690,98 +432,42 @@ err_1:
}
-static bool valid_msr_intercept(u32 index)
-{
- int i;
-
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
- if (direct_access_msrs[i].index == index)
- return true;
-
- return false;
-}
-
static void set_msr_interception(u32 *msrpm, unsigned msr,
int read, int write)
{
- u8 bit_read, bit_write;
- unsigned long tmp;
- u32 offset;
-
- /*
- * If this warning triggers extend the direct_access_msrs list at the
- * beginning of the file
- */
- WARN_ON(!valid_msr_intercept(msr));
-
- offset = svm_msrpm_offset(msr);
- bit_read = 2 * (msr & 0x0f);
- bit_write = 2 * (msr & 0x0f) + 1;
- tmp = msrpm[offset];
-
- BUG_ON(offset == MSR_INVALID);
-
- read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
- write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
-
- msrpm[offset] = tmp;
-}
-
-static void svm_vcpu_init_msrpm(u32 *msrpm)
-{
int i;
- memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
-
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
- if (!direct_access_msrs[i].always)
- continue;
-
- set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
- }
-}
-
-static void add_msr_offset(u32 offset)
-{
- int i;
-
- for (i = 0; i < MSRPM_OFFSETS; ++i) {
-
- /* Offset already in list? */
- if (msrpm_offsets[i] == offset)
+ for (i = 0; i < NUM_MSR_MAPS; i++) {
+ if (msr >= msrpm_ranges[i] &&
+ msr < msrpm_ranges[i] + MSRS_IN_RANGE) {
+ u32 msr_offset = (i * MSRS_IN_RANGE + msr -
+ msrpm_ranges[i]) * 2;
+
+ u32 *base = msrpm + (msr_offset / 32);
+ u32 msr_shift = msr_offset % 32;
+ u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1);
+ *base = (*base & ~(0x3 << msr_shift)) |
+ (mask << msr_shift);
return;
-
- /* Slot used by another offset? */
- if (msrpm_offsets[i] != MSR_INVALID)
- continue;
-
- /* Add offset to list */
- msrpm_offsets[i] = offset;
-
- return;
+ }
}
-
- /*
- * If this BUG triggers the msrpm_offsets table has an overflow. Just
- * increase MSRPM_OFFSETS in this case.
- */
BUG();
}
-static void init_msrpm_offsets(void)
+static void svm_vcpu_init_msrpm(u32 *msrpm)
{
- int i;
-
- memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
-
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
- u32 offset;
-
- offset = svm_msrpm_offset(direct_access_msrs[i].index);
- BUG_ON(offset == MSR_INVALID);
+ memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
- add_msr_offset(offset);
- }
+#ifdef CONFIG_X86_64
+ set_msr_interception(msrpm, MSR_GS_BASE, 1, 1);
+ set_msr_interception(msrpm, MSR_FS_BASE, 1, 1);
+ set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1);
+ set_msr_interception(msrpm, MSR_LSTAR, 1, 1);
+ set_msr_interception(msrpm, MSR_CSTAR, 1, 1);
+ set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1);
+#endif
+ set_msr_interception(msrpm, MSR_K6_STAR, 1, 1);
+ set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1);
}
static void svm_enable_lbrv(struct vcpu_svm *svm)
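
set_msr_interception() in the hunk above packs two intercept bits per MSR into the permission bitmap, indexed through the three msrpm_ranges. A standalone sketch of that bit arithmetic (userspace types; set_msr_perm and the allow_* parameter names are illustrative) is:

    #include <stdint.h>

    #define MSRS_RANGE_SIZE 2048
    #define MSRS_IN_RANGE   (MSRS_RANGE_SIZE * 8 / 2)   /* 2 bits per MSR */

    static const uint32_t msrpm_ranges[] = { 0, 0xc0000000, 0xc0010000 };

    /*
     * Each MSR owns two adjacent bits in the map: the low bit of the pair
     * intercepts reads, the high bit intercepts writes.  Passing 1 for
     * allow_read/allow_write clears the corresponding intercept bit.
     */
    static void set_msr_perm(uint32_t *msrpm, uint32_t msr,
                             int allow_read, int allow_write)
    {
            unsigned int i;

            for (i = 0; i < 3; i++) {
                    uint32_t off, *word, shift, mask;

                    if (msr < msrpm_ranges[i] ||
                        msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
                            continue;

                    off   = (i * MSRS_IN_RANGE + msr - msrpm_ranges[i]) * 2;
                    word  = msrpm + off / 32;
                    shift = off % 32;
                    mask  = (allow_write ? 0 : 2) | (allow_read ? 0 : 1);
                    *word = (*word & ~(3u << shift)) | (mask << shift);
                    return;
            }
    }
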
@@ -822,8 +508,6 @@ static __init int svm_hardware_setup(void)
memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
- init_msrpm_offsets();
-
if (boot_cpu_has(X86_FEATURE_NX))
kvm_enable_efer_bits(EFER_NX);
@@ -832,7 +516,7 @@ static __init int svm_hardware_setup(void)
if (nested) {
printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
- kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
+ kvm_enable_efer_bits(EFER_SVME);
}
for_each_possible_cpu(cpu) {
@@ -843,7 +527,7 @@ static __init int svm_hardware_setup(void)
svm_features = cpuid_edx(SVM_CPUID_FUNC);
- if (!boot_cpu_has(X86_FEATURE_NPT))
+ if (!svm_has(SVM_FEATURE_NPT))
npt_enabled = false;
if (npt_enabled && !npt) {
@@ -880,7 +564,7 @@ static void init_seg(struct vmcb_seg *seg)
{
seg->selector = 0;
seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
- SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
+ SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
seg->limit = 0xffff;
seg->base = 0;
}
@@ -893,97 +577,72 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
seg->base = 0;
}
-static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u64 g_tsc_offset = 0;
-
- if (is_guest_mode(vcpu)) {
- g_tsc_offset = svm->vmcb->control.tsc_offset -
- svm->nested.hsave->control.tsc_offset;
- svm->nested.hsave->control.tsc_offset = offset;
- }
-
- svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
-
- mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
-}
-
-static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->control.tsc_offset += adjustment;
- if (is_guest_mode(vcpu))
- svm->nested.hsave->control.tsc_offset += adjustment;
- mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
-}
-
static void init_vmcb(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
svm->vcpu.fpu_active = 1;
- svm->vcpu.arch.hflags = 0;
- set_cr_intercept(svm, INTERCEPT_CR0_READ);
- set_cr_intercept(svm, INTERCEPT_CR3_READ);
- set_cr_intercept(svm, INTERCEPT_CR4_READ);
- set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
- set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
- set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
- set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
-
- set_dr_intercept(svm, INTERCEPT_DR0_READ);
- set_dr_intercept(svm, INTERCEPT_DR1_READ);
- set_dr_intercept(svm, INTERCEPT_DR2_READ);
- set_dr_intercept(svm, INTERCEPT_DR3_READ);
- set_dr_intercept(svm, INTERCEPT_DR4_READ);
- set_dr_intercept(svm, INTERCEPT_DR5_READ);
- set_dr_intercept(svm, INTERCEPT_DR6_READ);
- set_dr_intercept(svm, INTERCEPT_DR7_READ);
-
- set_dr_intercept(svm, INTERCEPT_DR0_WRITE);
- set_dr_intercept(svm, INTERCEPT_DR1_WRITE);
- set_dr_intercept(svm, INTERCEPT_DR2_WRITE);
- set_dr_intercept(svm, INTERCEPT_DR3_WRITE);
- set_dr_intercept(svm, INTERCEPT_DR4_WRITE);
- set_dr_intercept(svm, INTERCEPT_DR5_WRITE);
- set_dr_intercept(svm, INTERCEPT_DR6_WRITE);
- set_dr_intercept(svm, INTERCEPT_DR7_WRITE);
-
- set_exception_intercept(svm, PF_VECTOR);
- set_exception_intercept(svm, UD_VECTOR);
- set_exception_intercept(svm, MC_VECTOR);
-
- set_intercept(svm, INTERCEPT_INTR);
- set_intercept(svm, INTERCEPT_NMI);
- set_intercept(svm, INTERCEPT_SMI);
- set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
- set_intercept(svm, INTERCEPT_CPUID);
- set_intercept(svm, INTERCEPT_INVD);
- set_intercept(svm, INTERCEPT_HLT);
- set_intercept(svm, INTERCEPT_INVLPG);
- set_intercept(svm, INTERCEPT_INVLPGA);
- set_intercept(svm, INTERCEPT_IOIO_PROT);
- set_intercept(svm, INTERCEPT_MSR_PROT);
- set_intercept(svm, INTERCEPT_TASK_SWITCH);
- set_intercept(svm, INTERCEPT_SHUTDOWN);
- set_intercept(svm, INTERCEPT_VMRUN);
- set_intercept(svm, INTERCEPT_VMMCALL);
- set_intercept(svm, INTERCEPT_VMLOAD);
- set_intercept(svm, INTERCEPT_VMSAVE);
- set_intercept(svm, INTERCEPT_STGI);
- set_intercept(svm, INTERCEPT_CLGI);
- set_intercept(svm, INTERCEPT_SKINIT);
- set_intercept(svm, INTERCEPT_WBINVD);
- set_intercept(svm, INTERCEPT_MONITOR);
- set_intercept(svm, INTERCEPT_MWAIT);
- set_intercept(svm, INTERCEPT_XSETBV);
+ control->intercept_cr_read = INTERCEPT_CR0_MASK |
+ INTERCEPT_CR3_MASK |
+ INTERCEPT_CR4_MASK;
+
+ control->intercept_cr_write = INTERCEPT_CR0_MASK |
+ INTERCEPT_CR3_MASK |
+ INTERCEPT_CR4_MASK |
+ INTERCEPT_CR8_MASK;
+
+ control->intercept_dr_read = INTERCEPT_DR0_MASK |
+ INTERCEPT_DR1_MASK |
+ INTERCEPT_DR2_MASK |
+ INTERCEPT_DR3_MASK |
+ INTERCEPT_DR4_MASK |
+ INTERCEPT_DR5_MASK |
+ INTERCEPT_DR6_MASK |
+ INTERCEPT_DR7_MASK;
+
+ control->intercept_dr_write = INTERCEPT_DR0_MASK |
+ INTERCEPT_DR1_MASK |
+ INTERCEPT_DR2_MASK |
+ INTERCEPT_DR3_MASK |
+ INTERCEPT_DR4_MASK |
+ INTERCEPT_DR5_MASK |
+ INTERCEPT_DR6_MASK |
+ INTERCEPT_DR7_MASK;
+
+ control->intercept_exceptions = (1 << PF_VECTOR) |
+ (1 << UD_VECTOR) |
+ (1 << MC_VECTOR);
+
+
+ control->intercept = (1ULL << INTERCEPT_INTR) |
+ (1ULL << INTERCEPT_NMI) |
+ (1ULL << INTERCEPT_SMI) |
+ (1ULL << INTERCEPT_SELECTIVE_CR0) |
+ (1ULL << INTERCEPT_CPUID) |
+ (1ULL << INTERCEPT_INVD) |
+ (1ULL << INTERCEPT_HLT) |
+ (1ULL << INTERCEPT_INVLPG) |
+ (1ULL << INTERCEPT_INVLPGA) |
+ (1ULL << INTERCEPT_IOIO_PROT) |
+ (1ULL << INTERCEPT_MSR_PROT) |
+ (1ULL << INTERCEPT_TASK_SWITCH) |
+ (1ULL << INTERCEPT_SHUTDOWN) |
+ (1ULL << INTERCEPT_VMRUN) |
+ (1ULL << INTERCEPT_VMMCALL) |
+ (1ULL << INTERCEPT_VMLOAD) |
+ (1ULL << INTERCEPT_VMSAVE) |
+ (1ULL << INTERCEPT_STGI) |
+ (1ULL << INTERCEPT_CLGI) |
+ (1ULL << INTERCEPT_SKINIT) |
+ (1ULL << INTERCEPT_WBINVD) |
+ (1ULL << INTERCEPT_MONITOR) |
+ (1ULL << INTERCEPT_MWAIT);
control->iopm_base_pa = iopm_base;
control->msrpm_base_pa = __pa(svm->msrpm);
+ control->tsc_offset = 0;
control->int_ctl = V_INTR_MASKING_MASK;
init_seg(&save->es);
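Note: the reverted init_vmcb() hunk above replaces the per-register set_*_intercept() helpers with direct ORs into the intercept_cr_*/intercept_dr_* words. A small sketch of the equivalent mask construction; CR_MASK() stands in for the INTERCEPT_CRn_MASK constants in svm.h and is an assumption, not the real definition:

#include <stdint.h>

/* Illustrative stand-in for the INTERCEPT_CRn_MASK constants. */
#define CR_MASK(n)  (1u << (n))

struct vmcb_ctl {
	uint32_t intercept_cr_read;
	uint32_t intercept_cr_write;
};

/* Equivalent of the open-coded ORs above: trap reads of CR0/CR3/CR4 and
 * writes of CR0/CR3/CR4/CR8. */
static void init_cr_intercepts(struct vmcb_ctl *c)
{
	c->intercept_cr_read  = CR_MASK(0) | CR_MASK(3) | CR_MASK(4);
	c->intercept_cr_write = CR_MASK(0) | CR_MASK(3) | CR_MASK(4) | CR_MASK(8);
}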
@@ -1011,19 +670,18 @@ static void init_vmcb(struct vcpu_svm *svm)
init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
- svm_set_efer(&svm->vcpu, 0);
+ save->efer = EFER_SVME;
save->dr6 = 0xffff0ff0;
save->dr7 = 0x400;
save->rflags = 2;
save->rip = 0x0000fff0;
svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
- /*
- * This is the guest-visible cr0 value.
+ /* This is the guest-visible cr0 value.
* svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
*/
- svm->vcpu.arch.cr0 = 0;
- (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
+ svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
+ kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
save->cr4 = X86_CR4_PAE;
/* rdx = ?? */
@@ -1031,27 +689,25 @@ static void init_vmcb(struct vcpu_svm *svm)
if (npt_enabled) {
/* Setup VMCB for Nested Paging */
control->nested_ctl = 1;
- clr_intercept(svm, INTERCEPT_TASK_SWITCH);
- clr_intercept(svm, INTERCEPT_INVLPG);
- clr_exception_intercept(svm, PF_VECTOR);
- clr_cr_intercept(svm, INTERCEPT_CR3_READ);
- clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
+ control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
+ (1ULL << INTERCEPT_INVLPG));
+ control->intercept_exceptions &= ~(1 << PF_VECTOR);
+ control->intercept_cr_read &= ~INTERCEPT_CR3_MASK;
+ control->intercept_cr_write &= ~INTERCEPT_CR3_MASK;
save->g_pat = 0x0007040600070406ULL;
save->cr3 = 0;
save->cr4 = 0;
}
- svm->asid_generation = 0;
+ force_new_asid(&svm->vcpu);
svm->nested.vmcb = 0;
svm->vcpu.arch.hflags = 0;
- if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
+ if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
control->pause_filter_count = 3000;
- set_intercept(svm, INTERCEPT_PAUSE);
+ control->intercept |= (1ULL << INTERCEPT_PAUSE);
}
- mark_all_dirty(svm->vmcb);
-
enable_gif(svm);
}
@@ -1114,27 +770,20 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
svm_vcpu_init_msrpm(svm->msrpm);
svm->nested.msrpm = page_address(nested_msrpm_pages);
- svm_vcpu_init_msrpm(svm->nested.msrpm);
svm->vmcb = page_address(page);
clear_page(svm->vmcb);
svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
svm->asid_generation = 0;
init_vmcb(svm);
- kvm_write_tsc(&svm->vcpu, 0);
-
- err = fx_init(&svm->vcpu);
- if (err)
- goto free_page4;
+ fx_init(&svm->vcpu);
svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
if (kvm_vcpu_is_bsp(&svm->vcpu))
svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
return &svm->vcpu;
-free_page4:
- __free_page(hsave_page);
free_page3:
__free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
free_page2:
@@ -1167,17 +816,23 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
int i;
if (unlikely(cpu != vcpu->cpu)) {
+ u64 delta;
+
+ if (kvm_check_tsc_unstable()) {
+ /*
+ * Make sure that the guest sees a monotonically
+ * increasing TSC.
+ */
+ delta = vcpu->arch.host_tsc - kvm_native_read_tsc();
+ svm->vmcb->control.tsc_offset += delta;
+ if (is_nested(svm))
+ svm->nested.hsave->control.tsc_offset += delta;
+ }
+ vcpu->cpu = cpu;
+ kvm_migrate_timers(vcpu);
svm->asid_generation = 0;
- mark_all_dirty(svm->vmcb);
}
-#ifdef CONFIG_X86_64
- rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
-#endif
- savesegment(fs, svm->host.fs);
- savesegment(gs, svm->host.gs);
- svm->host.ldt = kvm_read_ldt();
-
for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
}
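Note: the reinstated svm_vcpu_load() above compensates for an unstable host TSC on CPU migration: delta = host_tsc_at_put - host_tsc_now is added to the VMCB offset, so the guest resumes at exactly the TSC value it observed when it was descheduled and never sees time go backwards. Sketch of the adjustment (the helper is hypothetical):

#include <stdint.h>

/* Guest-visible TSC = host TSC + offset.  Adding the difference between
 * the TSC saved at vcpu_put and the new CPU's current TSC makes
 * (new_host_tsc + new_offset) equal the guest TSC at the moment of
 * descheduling, keeping the guest's view monotonic. */
static uint64_t migrate_tsc_offset(uint64_t offset,
				   uint64_t saved_host_tsc,
				   uint64_t new_host_tsc)
{
	uint64_t delta = saved_host_tsc - new_host_tsc;

	return offset + delta;
}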
@@ -1188,16 +843,10 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
int i;
++vcpu->stat.host_state_reload;
- kvm_load_ldt(svm->host.ldt);
-#ifdef CONFIG_X86_64
- loadsegment(fs, svm->host.fs);
- wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
- load_gs_index(svm->host.gs);
-#else
- loadsegment(gs, svm->host.gs);
-#endif
for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+
+ vcpu->arch.host_tsc = kvm_native_read_tsc();
}
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -1215,7 +864,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
switch (reg) {
case VCPU_EXREG_PDPTR:
BUG_ON(!npt_enabled);
- load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
+ load_pdptrs(vcpu, vcpu->arch.cr3);
break;
default:
BUG();
@@ -1224,12 +873,12 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
static void svm_set_vintr(struct vcpu_svm *svm)
{
- set_intercept(svm, INTERCEPT_VINTR);
+ svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR;
}
static void svm_clear_vintr(struct vcpu_svm *svm)
{
- clr_intercept(svm, INTERCEPT_VINTR);
+ svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
}
static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
@@ -1274,8 +923,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
- /*
- * AMD's VMCB does not have an explicit unusable field, so emulate it
+ /* AMD's VMCB does not have an explicit unusable field, so emulate it
* for cross vendor migration purposes by "not present"
*/
var->unusable = !var->present || (var->type == 0);
@@ -1311,8 +959,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
var->type |= 0x1;
break;
case VCPU_SREG_SS:
- /*
- * On AMD CPUs sometimes the DB bit in the segment
+ /* On AMD CPUs sometimes the DB bit in the segment
* descriptor is left as 1, although the whole segment has
* been made unusable. Clear it here to pass an Intel VMX
* entry check when cross vendor migrating.
@@ -1330,48 +977,42 @@ static int svm_get_cpl(struct kvm_vcpu *vcpu)
return save->cpl;
}
-static void svm_get_idt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt)
+static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);
- dt->size = svm->vmcb->save.idtr.limit;
- dt->address = svm->vmcb->save.idtr.base;
+ dt->limit = svm->vmcb->save.idtr.limit;
+ dt->base = svm->vmcb->save.idtr.base;
}
-static void svm_set_idt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt)
+static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->save.idtr.limit = dt->size;
- svm->vmcb->save.idtr.base = dt->address ;
- mark_dirty(svm->vmcb, VMCB_DT);
+ svm->vmcb->save.idtr.limit = dt->limit;
+ svm->vmcb->save.idtr.base = dt->base ;
}
-static void svm_get_gdt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt)
+static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);
- dt->size = svm->vmcb->save.gdtr.limit;
- dt->address = svm->vmcb->save.gdtr.base;
+ dt->limit = svm->vmcb->save.gdtr.limit;
+ dt->base = svm->vmcb->save.gdtr.base;
}
-static void svm_set_gdt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt)
+static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->save.gdtr.limit = dt->size;
- svm->vmcb->save.gdtr.base = dt->address ;
- mark_dirty(svm->vmcb, VMCB_DT);
+ svm->vmcb->save.gdtr.limit = dt->limit;
+ svm->vmcb->save.gdtr.base = dt->base ;
}
static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
{
}
-static void svm_decache_cr3(struct kvm_vcpu *vcpu)
-{
-}
-
static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
}
@@ -1387,14 +1028,13 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
*hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
| (gcr0 & SVM_CR0_SELECTIVE_MASK);
- mark_dirty(svm->vmcb, VMCB_CR);
if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
- clr_cr_intercept(svm, INTERCEPT_CR0_READ);
- clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
+ svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
+ svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
} else {
- set_cr_intercept(svm, INTERCEPT_CR0_READ);
- set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
+ svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
+ svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
}
}
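Note: update_cr0_intercept() above folds the guest's shadowed CR0 bits into the hardware CR0 and only leaves the CR0 intercepts armed while the two values still disagree (or while the FPU is inactive). A compact sketch of that test, assuming SVM_CR0_SELECTIVE_MASK covers CR0.TS and CR0.MP; the literal below is my assumption, not the kernel's definition:

#include <stdint.h>

/* Assumed shadow mask: CR0.TS (bit 3) | CR0.MP (bit 1). */
#define SELECTIVE_MASK  ((1u << 3) | (1u << 1))

static int need_cr0_intercept(uint32_t guest_cr0, uint32_t hw_cr0,
			      int fpu_active)
{
	/* hardware CR0 with the guest's shadowed bits merged in */
	uint32_t shadowed = (hw_cr0 & ~SELECTIVE_MASK) |
			    (guest_cr0 & SELECTIVE_MASK);

	/* intercept only while the views differ or the FPU is lazy-disabled */
	return !(guest_cr0 == shadowed && fpu_active);
}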
@@ -1402,31 +1042,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (is_guest_mode(vcpu)) {
- /*
- * We are here because we run in nested mode, the host kvm
- * intercepts cr0 writes but the l1 hypervisor does not.
- * But the L1 hypervisor may intercept selective cr0 writes.
- * This needs to be checked here.
- */
- unsigned long old, new;
-
- /* Remove bits that would trigger a real cr0 write intercept */
- old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK;
- new = cr0 & SVM_CR0_SELECTIVE_MASK;
-
- if (old == new) {
- /* cr0 write with ts and mp unchanged */
- svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
- if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) {
- svm->nested.vmexit_rip = kvm_rip_read(vcpu);
- svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
- svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
- return;
- }
- }
- }
-
#ifdef CONFIG_X86_64
if (vcpu->arch.efer & EFER_LME) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
@@ -1454,7 +1069,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
*/
cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
svm->vmcb->save.cr0 = cr0;
- mark_dirty(svm->vmcb, VMCB_CR);
update_cr0_intercept(svm);
}
@@ -1464,14 +1078,13 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
- svm_flush_tlb(vcpu);
+ force_new_asid(vcpu);
vcpu->arch.cr4 = cr4;
if (!npt_enabled)
cr4 |= X86_CR4_PAE;
cr4 |= host_cr4_mce;
to_svm(vcpu)->vmcb->save.cr4 = cr4;
- mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
}
static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -1500,25 +1113,26 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
= (svm->vmcb->save.cs.attrib
>> SVM_SELECTOR_DPL_SHIFT) & 3;
- mark_dirty(svm->vmcb, VMCB_SEG);
}
static void update_db_intercept(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- clr_exception_intercept(svm, DB_VECTOR);
- clr_exception_intercept(svm, BP_VECTOR);
+ svm->vmcb->control.intercept_exceptions &=
+ ~((1 << DB_VECTOR) | (1 << BP_VECTOR));
if (svm->nmi_singlestep)
- set_exception_intercept(svm, DB_VECTOR);
+ svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
if (vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
- set_exception_intercept(svm, DB_VECTOR);
+ svm->vmcb->control.intercept_exceptions |=
+ 1 << DB_VECTOR;
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
- set_exception_intercept(svm, BP_VECTOR);
+ svm->vmcb->control.intercept_exceptions |=
+ 1 << BP_VECTOR;
} else
vcpu->guest_debug = 0;
}
@@ -1532,11 +1146,23 @@ static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
else
svm->vmcb->save.dr7 = vcpu->arch.dr7;
- mark_dirty(svm->vmcb, VMCB_DR);
-
update_db_intercept(vcpu);
}
+static void load_host_msrs(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
+#endif
+}
+
+static void save_host_msrs(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
+#endif
+}
+
static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
{
if (sd->next_asid > sd->max_asid) {
@@ -1547,49 +1173,86 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
svm->asid_generation = sd->asid_generation;
svm->vmcb->control.asid = sd->next_asid++;
-
- mark_dirty(svm->vmcb, VMCB_ASID);
}
-static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
+static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->save.dr7 = value;
- mark_dirty(svm->vmcb, VMCB_DR);
+ switch (dr) {
+ case 0 ... 3:
+ *dest = vcpu->arch.db[dr];
+ break;
+ case 4:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return EMULATE_FAIL; /* will re-inject UD */
+ /* fall through */
+ case 6:
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ *dest = vcpu->arch.dr6;
+ else
+ *dest = svm->vmcb->save.dr6;
+ break;
+ case 5:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return EMULATE_FAIL; /* will re-inject UD */
+ /* fall through */
+ case 7:
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ *dest = vcpu->arch.dr7;
+ else
+ *dest = svm->vmcb->save.dr7;
+ break;
+ }
+
+ return EMULATE_DONE;
}
-static int pf_interception(struct vcpu_svm *svm)
+static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value)
{
- u64 fault_address = svm->vmcb->control.exit_info_2;
- u32 error_code;
- int r = 1;
+ struct vcpu_svm *svm = to_svm(vcpu);
- switch (svm->apf_reason) {
- default:
- error_code = svm->vmcb->control.exit_info_1;
-
- trace_kvm_page_fault(fault_address, error_code);
- if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
- kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
- r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
- svm->vmcb->control.insn_bytes,
- svm->vmcb->control.insn_len);
+ switch (dr) {
+ case 0 ... 3:
+ vcpu->arch.db[dr] = value;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+ vcpu->arch.eff_db[dr] = value;
break;
- case KVM_PV_REASON_PAGE_NOT_PRESENT:
- svm->apf_reason = 0;
- local_irq_disable();
- kvm_async_pf_task_wait(fault_address);
- local_irq_enable();
+ case 4:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return EMULATE_FAIL; /* will re-inject UD */
+ /* fall through */
+ case 6:
+ vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
break;
- case KVM_PV_REASON_PAGE_READY:
- svm->apf_reason = 0;
- local_irq_disable();
- kvm_async_pf_task_wake(fault_address);
- local_irq_enable();
+ case 5:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return EMULATE_FAIL; /* will re-inject UD */
+ /* fall through */
+ case 7:
+ vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
+ svm->vmcb->save.dr7 = vcpu->arch.dr7;
+ vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
+ }
break;
}
- return r;
+
+ return EMULATE_DONE;
+}
+
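Note: svm_get_dr()/svm_set_dr() above implement the architectural DR4/DR5 aliasing: with CR4.DE clear they behave as DR6/DR7, with CR4.DE set the access must raise #UD instead. A minimal sketch of that rule (enum and helper are illustrative):

#include <stdint.h>

enum dr_access { DR_OK, DR_FAULT_UD };

/* Map a debug-register number to the register actually accessed.  With
 * CR4.DE clear, DR4/DR5 are legacy aliases of DR6/DR7; with CR4.DE set
 * the caller must re-inject #UD, mirroring the EMULATE_FAIL paths above. */
static enum dr_access canonical_dr(int dr, int cr4_de, int *out)
{
	if (dr == 4 || dr == 5) {
		if (cr4_de)
			return DR_FAULT_UD;
		dr += 2;        /* DR4 -> DR6, DR5 -> DR7 */
	}
	*out = dr;
	return DR_OK;
}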
+static int pf_interception(struct vcpu_svm *svm)
+{
+ u64 fault_address;
+ u32 error_code;
+
+ fault_address = svm->vmcb->control.exit_info_2;
+ error_code = svm->vmcb->control.exit_info_1;
+
+ trace_kvm_page_fault(fault_address, error_code);
+ if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
+ kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
+ return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
}
static int db_interception(struct vcpu_svm *svm)
@@ -1612,7 +1275,7 @@ static int db_interception(struct vcpu_svm *svm)
}
if (svm->vcpu.guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){
kvm_run->exit_reason = KVM_EXIT_DEBUG;
kvm_run->debug.arch.pc =
svm->vmcb->save.cs.base + svm->vmcb->save.rip;
@@ -1637,7 +1300,7 @@ static int ud_interception(struct vcpu_svm *svm)
{
int er;
- er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
+ er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD);
if (er != EMULATE_DONE)
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
@@ -1646,9 +1309,7 @@ static int ud_interception(struct vcpu_svm *svm)
static void svm_fpu_activate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
-
- clr_exception_intercept(svm, NM_VECTOR);
-
+ svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
svm->vcpu.fpu_active = 1;
update_cr0_intercept(svm);
}
@@ -1659,59 +1320,8 @@ static int nm_interception(struct vcpu_svm *svm)
return 1;
}
-static bool is_erratum_383(void)
-{
- int err, i;
- u64 value;
-
- if (!erratum_383_found)
- return false;
-
- value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
- if (err)
- return false;
-
- /* Bit 62 may or may not be set for this mce */
- value &= ~(1ULL << 62);
-
- if (value != 0xb600000000010015ULL)
- return false;
-
- /* Clear MCi_STATUS registers */
- for (i = 0; i < 6; ++i)
- kvm_native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
-
- value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
- if (!err) {
- u32 low, high;
-
- value &= ~(1ULL << 2);
- low = lower_32_bits(value);
- high = upper_32_bits(value);
-
- kvm_native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
- }
-
- /* Flush tlb to evict multi-match entries */
- __flush_tlb_all();
-
- return true;
-}
-
-static void svm_handle_mce(struct vcpu_svm *svm)
+static int mc_interception(struct vcpu_svm *svm)
{
- if (is_erratum_383()) {
- /*
- * Erratum 383 triggered. Guest state is corrupt so kill the
- * guest.
- */
- pr_err("KVM: Guest triggered AMD Erratum 383\n");
-
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
-
- return;
- }
-
/*
* On an #MC intercept the MCE handler is not called automatically in
* the host. So do it by hand here.
@@ -1720,11 +1330,6 @@ static void svm_handle_mce(struct vcpu_svm *svm)
"int $0x12\n");
/* not sure if we ever come back to this point */
- return;
-}
-
-static int mc_interception(struct vcpu_svm *svm)
-{
return 1;
}
@@ -1745,23 +1350,29 @@ static int shutdown_interception(struct vcpu_svm *svm)
static int io_interception(struct vcpu_svm *svm)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
int size, in, string;
unsigned port;
++svm->vcpu.stat.io_exits;
+
+ svm->next_rip = svm->vmcb->control.exit_info_2;
+
string = (io_info & SVM_IOIO_STR_MASK) != 0;
- in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
- if (string || in)
- return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+ if (string) {
+ if (emulate_instruction(&svm->vcpu,
+ 0, 0, 0) == EMULATE_DO_MMIO)
+ return 0;
+ return 1;
+ }
+
+ in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
port = io_info >> 16;
size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
- svm->next_rip = svm->vmcb->control.exit_info_2;
- skip_emulated_instruction(&svm->vcpu);
- return kvm_fast_pio_out(vcpu, size, port);
+ skip_emulated_instruction(&svm->vcpu);
+ return kvm_emulate_pio(&svm->vcpu, in, size, port);
}
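Note: io_interception() above unpacks EXITINFO1 into direction, string flag, operand size and port. The sketch below shows that decode with the bit positions as I read them from the AMD SVM IOIO intercept layout; the literal masks stand in for the SVM_IOIO_* macros and should be treated as assumptions:

#include <stdint.h>

struct io_exit {
	int in;          /* 1 = IN, 0 = OUT */
	int string;      /* INS/OUTS */
	unsigned size;   /* operand size in bytes (one-hot 1/2/4) */
	unsigned port;   /* I/O port number */
};

static struct io_exit decode_io_info(uint32_t io_info)
{
	struct io_exit e;

	e.in     = (io_info & 0x1) != 0;   /* direction bit (TYPE) */
	e.string = (io_info & 0x4) != 0;   /* string-instruction bit (STR) */
	e.size   = (io_info >> 4) & 0x7;   /* SZ8/SZ16/SZ32 field */
	e.port   = io_info >> 16;          /* port in the upper 16 bits */
	return e;
}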
static int nmi_interception(struct vcpu_svm *svm)
@@ -1795,56 +1406,6 @@ static int vmmcall_interception(struct vcpu_svm *svm)
return 1;
}
-static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- return svm->nested.nested_cr3;
-}
-
-static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
- unsigned long root)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->control.nested_cr3 = root;
- mark_dirty(svm->vmcb, VMCB_NPT);
- svm_flush_tlb(vcpu);
-}
-
-static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
- struct x86_exception *fault)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->control.exit_code = SVM_EXIT_NPF;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = fault->error_code;
- svm->vmcb->control.exit_info_2 = fault->address;
-
- nested_svm_vmexit(svm);
-}
-
-static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
-{
- int r;
-
- r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
-
- vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
- vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
- vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
- vcpu->arch.mmu.shadow_root_level = get_npt_level();
- vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
-
- return r;
-}
-
-static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.walk_mmu = &vcpu->arch.mmu;
-}
-
static int nested_svm_check_permissions(struct vcpu_svm *svm)
{
if (!(svm->vcpu.arch.efer & EFER_SVME)
@@ -1864,9 +1425,7 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm)
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code)
{
- int vmexit;
-
- if (!is_guest_mode(&svm->vcpu))
+ if (!is_nested(svm))
return 0;
svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
@@ -1874,36 +1433,21 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
svm->vmcb->control.exit_info_1 = error_code;
svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
- vmexit = nested_svm_intercept(svm);
- if (vmexit == NESTED_EXIT_DONE)
- svm->nested.exit_required = true;
-
- return vmexit;
+ return nested_svm_exit_handled(svm);
}
-/* This function returns true if it is save to enable the irq window */
-static inline bool nested_svm_intr(struct vcpu_svm *svm)
+static inline int nested_svm_intr(struct vcpu_svm *svm)
{
- if (!is_guest_mode(&svm->vcpu))
- return true;
+ if (!is_nested(svm))
+ return 0;
if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
- return true;
+ return 0;
if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
- return false;
-
- /*
- * if vmexit was already requested (by intercepted exception
- * for instance) do not overwrite it with "external interrupt"
- * vmexit.
- */
- if (svm->nested.exit_required)
- return false;
+ return 0;
- svm->vmcb->control.exit_code = SVM_EXIT_INTR;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
+ svm->vmcb->control.exit_code = SVM_EXIT_INTR;
if (svm->nested.intercept & 1ULL) {
/*
@@ -1914,40 +1458,22 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
*/
svm->nested.exit_required = true;
trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
- return false;
+ return 1;
}
- return true;
-}
-
-/* This function returns true if it is save to enable the nmi window */
-static inline bool nested_svm_nmi(struct vcpu_svm *svm)
-{
- if (!is_guest_mode(&svm->vcpu))
- return true;
-
- if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
- return true;
-
- svm->vmcb->control.exit_code = SVM_EXIT_NMI;
- svm->nested.exit_required = true;
-
- return false;
+ return 0;
}
-static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
+static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx, struct page **mapped_page)
{
struct page *page;
- might_sleep();
-
page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
if (is_error_page(page))
goto error;
- *_page = page;
-
- return kmap(page);
+ *mapped_page = page;
+ return kmap_atomic(page, idx);
error:
kvm_release_page_clean(page);
@@ -1956,55 +1482,62 @@ error:
return NULL;
}
-static void nested_svm_unmap(struct page *page)
+static void nested_svm_unmap(void *addr, enum km_type idx, struct page *mapped_page)
{
- kunmap(page);
- kvm_release_page_dirty(page);
-}
-
-static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
-{
- unsigned port;
- u8 val, bit;
- u64 gpa;
-
- if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
- return NESTED_EXIT_HOST;
+ struct page *page;
- port = svm->vmcb->control.exit_info_1 >> 16;
- gpa = svm->nested.vmcb_iopm + (port / 8);
- bit = port % 8;
- val = 0;
+ if (!addr)
+ return;
- if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1))
- val &= (1 << bit);
+ page = mapped_page;
- return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+ kunmap_atomic(addr, idx);
+ kvm_release_page_dirty(page);
}
-static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
+static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
{
- u32 offset, msr, value;
- int write, mask;
+ u32 param = svm->vmcb->control.exit_info_1 & 1;
+ u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+ bool ret = false;
+ u32 t0, t1;
+ u8 *msrpm;
if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
- return NESTED_EXIT_HOST;
+ return false;
- msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
- offset = svm_msrpm_offset(msr);
- write = svm->vmcb->control.exit_info_1 & 1;
- mask = 1 << ((2 * (msr & 0xf)) + write);
+ { struct page *mapped_page;
+ msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0, &mapped_page);
- if (offset == MSR_INVALID)
- return NESTED_EXIT_DONE;
+ if (!msrpm)
+ goto out;
- /* Offset is in 32 bit units but need in 8 bit units */
- offset *= 4;
+ switch (msr) {
+ case 0 ... 0x1fff:
+ t0 = (msr * 2) % 8;
+ t1 = msr / 8;
+ break;
+ case 0xc0000000 ... 0xc0001fff:
+ t0 = (8192 + msr - 0xc0000000) * 2;
+ t1 = (t0 / 8);
+ t0 %= 8;
+ break;
+ case 0xc0010000 ... 0xc0011fff:
+ t0 = (16384 + msr - 0xc0010000) * 2;
+ t1 = (t0 / 8);
+ t0 %= 8;
+ break;
+ default:
+ ret = true;
+ goto out;
+ }
+
+ ret = msrpm[t1] & ((1 << param) << t0);
- if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4))
- return NESTED_EXIT_DONE;
+out:
+ nested_svm_unmap(msrpm, KM_USER0, mapped_page); }
- return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+ return ret;
}
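Note: the reinstated nested_svm_exit_handled_msr() above open-codes a lookup into the L1 guest's MSR permission bitmap. For reference, the sketch below follows the architectural layout as I read it from the AMD SVM spec: two bits per MSR (read bit, then write bit) in three 2-KB regions covering 0x0-0x1fff, 0xc0000000-0xc0001fff and 0xc0010000-0xc0011fff. It is a reference sketch of that layout, not a restatement of the helper above.

#include <stdint.h>

/* Bit offset of an MSR's read-intercept bit inside the 8-KB MSRPM,
 * or -1 if the MSR is outside the covered ranges (always intercepted). */
static int msrpm_bit_offset(uint32_t msr)
{
	if (msr <= 0x1fff)
		return msr * 2;
	if (msr >= 0xc0000000 && msr <= 0xc0001fff)
		return 0x800 * 8 + (msr - 0xc0000000) * 2;
	if (msr >= 0xc0010000 && msr <= 0xc0011fff)
		return 0x1000 * 8 + (msr - 0xc0010000) * 2;
	return -1;
}

/* write = 0 tests the read-intercept bit, write = 1 the write bit. */
static int msr_intercepted(const uint8_t *msrpm, uint32_t msr, int write)
{
	int bit = msrpm_bit_offset(msr);

	if (bit < 0)
		return 1;
	bit += write;
	return (msrpm[bit / 8] >> (bit % 8)) & 1;
}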
static int nested_svm_exit_special(struct vcpu_svm *svm)
@@ -2014,21 +1547,17 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
switch (exit_code) {
case SVM_EXIT_INTR:
case SVM_EXIT_NMI:
- case SVM_EXIT_EXCP_BASE + MC_VECTOR:
return NESTED_EXIT_HOST;
- case SVM_EXIT_NPF:
/* For now we are always handling NPFs when using them */
+ case SVM_EXIT_NPF:
if (npt_enabled)
return NESTED_EXIT_HOST;
break;
+ /* When we're shadowing, trap PFs */
case SVM_EXIT_EXCP_BASE + PF_VECTOR:
- /* When we're shadowing, trap PFs, but not async PF */
- if (!npt_enabled && svm->apf_reason == 0)
+ if (!npt_enabled)
return NESTED_EXIT_HOST;
break;
- case SVM_EXIT_EXCP_BASE + NM_VECTOR:
- nm_interception(svm);
- break;
default:
break;
}
@@ -2039,7 +1568,7 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
/*
* If this function returns true, this #vmexit was already handled
*/
-static int nested_svm_intercept(struct vcpu_svm *svm)
+static int nested_svm_exit_handled(struct vcpu_svm *svm)
{
u32 exit_code = svm->vmcb->control.exit_code;
int vmexit = NESTED_EXIT_HOST;
@@ -2048,18 +1577,27 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
case SVM_EXIT_MSR:
vmexit = nested_svm_exit_handled_msr(svm);
break;
- case SVM_EXIT_IOIO:
- vmexit = nested_svm_intercept_ioio(svm);
+ case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
+ u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
+ if (svm->nested.intercept_cr_read & cr_bits)
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: {
+ u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0);
+ if (svm->nested.intercept_cr_write & cr_bits)
+ vmexit = NESTED_EXIT_DONE;
break;
- case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
- u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
- if (svm->nested.intercept_cr & bit)
+ }
+ case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
+ u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
+ if (svm->nested.intercept_dr_read & dr_bits)
vmexit = NESTED_EXIT_DONE;
break;
}
- case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
- u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
- if (svm->nested.intercept_dr & bit)
+ case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
+ u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0);
+ if (svm->nested.intercept_dr_write & dr_bits)
vmexit = NESTED_EXIT_DONE;
break;
}
@@ -2067,14 +1605,6 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
if (svm->nested.intercept_exceptions & excp_bits)
vmexit = NESTED_EXIT_DONE;
- /* async page fault always cause vmexit */
- else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
- svm->apf_reason != 0)
- vmexit = NESTED_EXIT_DONE;
- break;
- }
- case SVM_EXIT_ERR: {
- vmexit = NESTED_EXIT_DONE;
break;
}
default: {
@@ -2084,17 +1614,9 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
}
}
- return vmexit;
-}
-
-static int nested_svm_exit_handled(struct vcpu_svm *svm)
-{
- int vmexit;
-
- vmexit = nested_svm_intercept(svm);
-
- if (vmexit == NESTED_EXIT_DONE)
+ if (vmexit == NESTED_EXIT_DONE) {
nested_svm_vmexit(svm);
+ }
return vmexit;
}
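Note: the CR/DR/exception cases above rely on the corresponding SVM exit codes being contiguous, so the intercept bit to test is simply 1 << (exit_code - base_code) against the value cached from the nested VMCB at VMRUN time. Sketch of that mapping (the helper is hypothetical):

#include <stdint.h>

/* Returns non-zero when the L1 hypervisor asked to intercept this exit.
 * base_code is the first exit code of the contiguous group (e.g. the
 * READ_CR0 or READ_DR0 code) and cached_intercepts is the matching word
 * copied from the nested VMCB. */
static int nested_wants_exit(uint32_t exit_code, uint32_t base_code,
			     uint32_t cached_intercepts)
{
	uint32_t bit = 1u << (exit_code - base_code);

	return (cached_intercepts & bit) != 0;
}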
@@ -2104,8 +1626,10 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr
struct vmcb_control_area *dst = &dst_vmcb->control;
struct vmcb_control_area *from = &from_vmcb->control;
- dst->intercept_cr = from->intercept_cr;
- dst->intercept_dr = from->intercept_dr;
+ dst->intercept_cr_read = from->intercept_cr_read;
+ dst->intercept_cr_write = from->intercept_cr_write;
+ dst->intercept_dr_read = from->intercept_dr_read;
+ dst->intercept_dr_write = from->intercept_dr_write;
dst->intercept_exceptions = from->intercept_exceptions;
dst->intercept = from->intercept;
dst->iopm_base_pa = from->iopm_base_pa;
@@ -2134,7 +1658,6 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
struct vmcb *nested_vmcb;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
- struct page *page;
trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
vmcb->control.exit_info_1,
@@ -2142,14 +1665,11 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb->control.exit_int_info,
vmcb->control.exit_int_info_err);
- nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
+ { struct page *mapped_page;
+ nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0, &mapped_page);
if (!nested_vmcb)
return 1;
- /* Exit Guest-Mode */
- leave_guest_mode(&svm->vcpu);
- svm->nested.vmcb = 0;
-
/* Give the current vmcb to the guest */
disable_gif(svm);
@@ -2159,11 +1679,9 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
nested_vmcb->save.ds = vmcb->save.ds;
nested_vmcb->save.gdtr = vmcb->save.gdtr;
nested_vmcb->save.idtr = vmcb->save.idtr;
- nested_vmcb->save.efer = svm->vcpu.arch.efer;
- nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
- nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
+ if (npt_enabled)
+ nested_vmcb->save.cr3 = vmcb->save.cr3;
nested_vmcb->save.cr2 = vmcb->save.cr2;
- nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
nested_vmcb->save.rflags = vmcb->save.rflags;
nested_vmcb->save.rip = vmcb->save.rip;
nested_vmcb->save.rsp = vmcb->save.rsp;
@@ -2181,7 +1699,6 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
- nested_vmcb->control.next_rip = vmcb->control.next_rip;
/*
* If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
@@ -2212,8 +1729,6 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
kvm_clear_exception_queue(&svm->vcpu);
kvm_clear_interrupt_queue(&svm->vcpu);
- svm->nested.nested_cr3 = 0;
-
/* Restore selected save entries */
svm->vmcb->save.es = hsave->save.es;
svm->vmcb->save.cs = hsave->save.cs;
@@ -2229,7 +1744,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
svm->vmcb->save.cr3 = hsave->save.cr3;
svm->vcpu.arch.cr3 = hsave->save.cr3;
} else {
- (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
+ kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
}
kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
@@ -2238,11 +1753,11 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
svm->vmcb->save.cpl = 0;
svm->vmcb->control.exit_int_info = 0;
- mark_all_dirty(svm->vmcb);
+ /* Exit nested SVM mode */
+ svm->nested.vmcb = 0;
- nested_svm_unmap(page);
+ nested_svm_unmap(nested_vmcb, KM_USER0, mapped_page); }
- nested_svm_uninit_mmu_context(&svm->vcpu);
kvm_mmu_reset_context(&svm->vcpu);
kvm_mmu_load(&svm->vcpu);
@@ -2251,47 +1766,20 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
- /*
- * This function merges the msr permission bitmaps of kvm and the
- * nested vmcb. It is omptimized in that it only merges the parts where
- * the kvm msr permission bitmap may contain zero bits
- */
+ u32 *nested_msrpm;
int i;
- if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
- return true;
-
- for (i = 0; i < MSRPM_OFFSETS; i++) {
- u32 value, p;
- u64 offset;
-
- if (msrpm_offsets[i] == 0xffffffff)
- break;
-
- p = msrpm_offsets[i];
- offset = svm->nested.vmcb_msrpm + (p * 4);
-
- if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
- return false;
+ { struct page *mapped_page;
+ nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0, &mapped_page);
+ if (!nested_msrpm)
+ return false;
- svm->nested.msrpm[p] = svm->msrpm[p] | value;
- }
+ for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++)
+ svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i];
svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
- return true;
-}
-
-static bool nested_vmcb_checks(struct vmcb *vmcb)
-{
- if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
- return false;
-
- if (vmcb->control.asid == 0)
- return false;
-
- if (vmcb->control.nested_ctl && !npt_enabled)
- return false;
+ nested_svm_unmap(nested_msrpm, KM_USER0, mapped_page); }
return true;
}
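Note: nested_svm_vmrun_msrpm() above ORs KVM's own MSR permission bitmap with the one supplied by the L1 hypervisor. Since a set bit means "intercept", an MSR access reaches hardware directly only when both layers permit it. Minimal sketch of the merge (names are illustrative):

#include <stdint.h>
#include <stddef.h>

/* Combine two MSR permission bitmaps; the result intercepts everything
 * that either the host map or the nested guest's map intercepts. */
static void merge_msrpm(uint32_t *merged, const uint32_t *host,
			const uint32_t *nested, size_t words)
{
	for (size_t i = 0; i < words; i++)
		merged[i] = host[i] | nested[i];
}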
@@ -2301,45 +1789,27 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
struct vmcb *nested_vmcb;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
- struct page *page;
- u64 vmcb_gpa;
- vmcb_gpa = svm->vmcb->save.rax;
-
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
+ { struct page *mapped_page;
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0, &mapped_page);
if (!nested_vmcb)
return false;
- if (!nested_vmcb_checks(nested_vmcb)) {
- nested_vmcb->control.exit_code = SVM_EXIT_ERR;
- nested_vmcb->control.exit_code_hi = 0;
- nested_vmcb->control.exit_info_1 = 0;
- nested_vmcb->control.exit_info_2 = 0;
-
- nested_svm_unmap(page);
+ /* nested_vmcb is our indicator if nested SVM is activated */
+ svm->nested.vmcb = svm->vmcb->save.rax;
- return false;
- }
-
- trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
+ trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb,
nested_vmcb->save.rip,
nested_vmcb->control.int_ctl,
nested_vmcb->control.event_inj,
nested_vmcb->control.nested_ctl);
- trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
- nested_vmcb->control.intercept_cr >> 16,
- nested_vmcb->control.intercept_exceptions,
- nested_vmcb->control.intercept);
-
/* Clear internal status */
kvm_clear_exception_queue(&svm->vcpu);
kvm_clear_interrupt_queue(&svm->vcpu);
- /*
- * Save the old vmcb, so we don't need to pick what we save, but can
- * restore everything when a VMEXIT occurs
- */
+ /* Save the old vmcb, so we don't need to pick what we save, but
+ can restore everything when a VMEXIT occurs */
hsave->save.es = vmcb->save.es;
hsave->save.cs = vmcb->save.cs;
hsave->save.ss = vmcb->save.ss;
@@ -2350,13 +1820,13 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
hsave->save.cr4 = svm->vcpu.arch.cr4;
hsave->save.rflags = vmcb->save.rflags;
- hsave->save.rip = kvm_rip_read(&svm->vcpu);
+ hsave->save.rip = svm->next_rip;
hsave->save.rsp = vmcb->save.rsp;
hsave->save.rax = vmcb->save.rax;
if (npt_enabled)
hsave->save.cr3 = vmcb->save.cr3;
else
- hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
+ hsave->save.cr3 = svm->vcpu.arch.cr3;
copy_vmcb_control_area(hsave, vmcb);
@@ -2365,12 +1835,6 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
else
svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
- if (nested_vmcb->control.nested_ctl) {
- kvm_mmu_unload(&svm->vcpu);
- svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
- nested_svm_init_mmu_context(&svm->vcpu);
- }
-
/* Load the nested guest state */
svm->vmcb->save.es = nested_vmcb->save.es;
svm->vmcb->save.cs = nested_vmcb->save.cs;
@@ -2385,17 +1849,14 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
if (npt_enabled) {
svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
- } else
- (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
-
- /* Guest paging mode is active - reset mmu */
- kvm_mmu_reset_context(&svm->vcpu);
-
+ } else {
+ kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
+ kvm_mmu_reset_context(&svm->vcpu);
+ }
svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
-
/* In case we don't even reach vcpu_run, the fields are not updated */
svm->vmcb->save.rax = nested_vmcb->save.rax;
svm->vmcb->save.rsp = nested_vmcb->save.rsp;
@@ -2404,55 +1865,48 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
svm->vmcb->save.cpl = nested_vmcb->save.cpl;
- svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
- svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
+ /* We don't want a nested guest to be more powerful than the guest,
+ so all intercepts are ORed */
+ svm->vmcb->control.intercept_cr_read |=
+ nested_vmcb->control.intercept_cr_read;
+ svm->vmcb->control.intercept_cr_write |=
+ nested_vmcb->control.intercept_cr_write;
+ svm->vmcb->control.intercept_dr_read |=
+ nested_vmcb->control.intercept_dr_read;
+ svm->vmcb->control.intercept_dr_write |=
+ nested_vmcb->control.intercept_dr_write;
+ svm->vmcb->control.intercept_exceptions |=
+ nested_vmcb->control.intercept_exceptions;
+
+ svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
+
+ svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
/* cache intercepts */
- svm->nested.intercept_cr = nested_vmcb->control.intercept_cr;
- svm->nested.intercept_dr = nested_vmcb->control.intercept_dr;
+ svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read;
+ svm->nested.intercept_cr_write = nested_vmcb->control.intercept_cr_write;
+ svm->nested.intercept_dr_read = nested_vmcb->control.intercept_dr_read;
+ svm->nested.intercept_dr_write = nested_vmcb->control.intercept_dr_write;
svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
svm->nested.intercept = nested_vmcb->control.intercept;
- svm_flush_tlb(&svm->vcpu);
+ force_new_asid(&svm->vcpu);
svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
svm->vcpu.arch.hflags |= HF_VINTR_MASK;
else
svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
- if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
- /* We only want the cr8 intercept bits of the guest */
- clr_cr_intercept(svm, INTERCEPT_CR8_READ);
- clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
- }
-
- /* We don't want to see VMMCALLs from a nested guest */
- clr_intercept(svm, INTERCEPT_VMMCALL);
-
- svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
svm->vmcb->control.int_state = nested_vmcb->control.int_state;
svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
- nested_svm_unmap(page);
-
- /* Enter Guest-Mode */
- enter_guest_mode(&svm->vcpu);
-
- /*
- * Merge guest and host intercepts - must be called with vcpu in
- * guest-mode to take affect here
- */
- recalc_intercepts(svm);
-
- svm->nested.vmcb = vmcb_gpa;
+ nested_svm_unmap(nested_vmcb, KM_USER0, mapped_page); }
enable_gif(svm);
- mark_all_dirty(svm->vmcb);
-
return true;
}
@@ -2475,7 +1929,6 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
static int vmload_interception(struct vcpu_svm *svm)
{
struct vmcb *nested_vmcb;
- struct page *page;
if (nested_svm_check_permissions(svm))
return 1;
@@ -2483,12 +1936,13 @@ static int vmload_interception(struct vcpu_svm *svm)
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
+ { struct page *mapped_page;
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0, &mapped_page);
if (!nested_vmcb)
return 1;
nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
- nested_svm_unmap(page);
+ nested_svm_unmap(nested_vmcb, KM_USER0, mapped_page); }
return 1;
}
@@ -2496,7 +1950,6 @@ static int vmload_interception(struct vcpu_svm *svm)
static int vmsave_interception(struct vcpu_svm *svm)
{
struct vmcb *nested_vmcb;
- struct page *page;
if (nested_svm_check_permissions(svm))
return 1;
@@ -2504,12 +1957,13 @@ static int vmsave_interception(struct vcpu_svm *svm)
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
+ { struct page *mapped_page;
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0, &mapped_page);
if (!nested_vmcb)
return 1;
nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
- nested_svm_unmap(page);
+ nested_svm_unmap(nested_vmcb, KM_USER0, mapped_page); }
return 1;
}
@@ -2519,8 +1973,8 @@ static int vmrun_interception(struct vcpu_svm *svm)
if (nested_svm_check_permissions(svm))
return 1;
- /* Save rip after vmrun instruction */
- kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
if (!nested_svm_vmrun(svm))
return 1;
@@ -2549,7 +2003,6 @@ static int stgi_interception(struct vcpu_svm *svm)
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
- kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
enable_gif(svm);
@@ -2570,8 +2023,6 @@ static int clgi_interception(struct vcpu_svm *svm)
svm_clear_vintr(svm);
svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
- mark_dirty(svm->vmcb, VMCB_INTR);
-
return 1;
}
@@ -2598,19 +2049,6 @@ static int skinit_interception(struct vcpu_svm *svm)
return 1;
}
-static int xsetbv_interception(struct vcpu_svm *svm)
-{
- u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
- u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
-
- if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
- }
-
- return 1;
-}
-
static int invalid_op_interception(struct vcpu_svm *svm)
{
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
@@ -2628,8 +2066,6 @@ static int task_switch_interception(struct vcpu_svm *svm)
svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
uint32_t idt_v =
svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
- bool has_error_code = false;
- u32 error_code = 0;
tss_selector = (u16)svm->vmcb->control.exit_info_1;
@@ -2650,12 +2086,6 @@ static int task_switch_interception(struct vcpu_svm *svm)
svm->vcpu.arch.nmi_injected = false;
break;
case SVM_EXITINTINFO_TYPE_EXEPT:
- if (svm->vmcb->control.exit_info_2 &
- (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
- has_error_code = true;
- error_code =
- (u32)svm->vmcb->control.exit_info_2;
- }
kvm_clear_exception_queue(&svm->vcpu);
break;
case SVM_EXITINTINFO_TYPE_INTR:
@@ -2672,14 +2102,7 @@ static int task_switch_interception(struct vcpu_svm *svm)
(int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
skip_emulated_instruction(&svm->vcpu);
- if (kvm_task_switch(&svm->vcpu, tss_selector, reason,
- has_error_code, error_code) == EMULATE_FAIL) {
- svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- svm->vcpu.run->internal.ndata = 0;
- return 0;
- }
- return 1;
+ return kvm_task_switch(&svm->vcpu, tss_selector, reason);
}
static int cpuid_interception(struct vcpu_svm *svm)
@@ -2692,151 +2115,38 @@ static int cpuid_interception(struct vcpu_svm *svm)
static int iret_interception(struct vcpu_svm *svm)
{
++svm->vcpu.stat.nmi_window_exits;
- clr_intercept(svm, INTERCEPT_IRET);
+ svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
svm->vcpu.arch.hflags |= HF_IRET_MASK;
return 1;
}
static int invlpg_interception(struct vcpu_svm *svm)
{
- if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
- return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
-
- kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
- skip_emulated_instruction(&svm->vcpu);
+ if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
+ pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
return 1;
}
static int emulate_on_interception(struct vcpu_svm *svm)
{
- return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
-}
-
-#define CR_VALID (1ULL << 63)
-
-static int cr_interception(struct vcpu_svm *svm)
-{
- int reg, cr;
- unsigned long val;
- int err;
-
- if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
- return emulate_on_interception(svm);
-
- if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
- return emulate_on_interception(svm);
-
- reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
- cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
-
- err = 0;
- if (cr >= 16) { /* mov to cr */
- cr -= 16;
- val = kvm_register_read(&svm->vcpu, reg);
- switch (cr) {
- case 0:
- err = kvm_set_cr0(&svm->vcpu, val);
- break;
- case 3:
- err = kvm_set_cr3(&svm->vcpu, val);
- break;
- case 4:
- err = kvm_set_cr4(&svm->vcpu, val);
- break;
- case 8:
- err = kvm_set_cr8(&svm->vcpu, val);
- break;
- default:
- WARN(1, "unhandled write to CR%d", cr);
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
- return 1;
- }
- } else { /* mov from cr */
- switch (cr) {
- case 0:
- val = kvm_read_cr0(&svm->vcpu);
- break;
- case 2:
- val = svm->vcpu.arch.cr2;
- break;
- case 3:
- val = kvm_read_cr3(&svm->vcpu);
- break;
- case 4:
- val = kvm_read_cr4(&svm->vcpu);
- break;
- case 8:
- val = kvm_get_cr8(&svm->vcpu);
- break;
- default:
- WARN(1, "unhandled read from CR%d", cr);
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
- return 1;
- }
- kvm_register_write(&svm->vcpu, reg, val);
- }
- kvm_complete_insn_gp(&svm->vcpu, err);
-
- return 1;
-}
-
-static int cr0_write_interception(struct vcpu_svm *svm)
-{
- struct kvm_vcpu *vcpu = &svm->vcpu;
- int r;
-
- r = cr_interception(svm);
-
- if (svm->nested.vmexit_rip) {
- kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
- kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp);
- kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax);
- svm->nested.vmexit_rip = 0;
- }
-
- return r;
-}
-
-static int dr_interception(struct vcpu_svm *svm)
-{
- int reg, dr;
- unsigned long val;
- int err;
-
- if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
- return emulate_on_interception(svm);
-
- reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
- dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
-
- if (dr >= 16) { /* mov to DRn */
- val = kvm_register_read(&svm->vcpu, reg);
- kvm_set_dr(&svm->vcpu, dr - 16, val);
- } else {
- err = kvm_get_dr(&svm->vcpu, dr, &val);
- if (!err)
- kvm_register_write(&svm->vcpu, reg, val);
- }
-
- skip_emulated_instruction(&svm->vcpu);
-
+ if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
+ pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
return 1;
}
static int cr8_write_interception(struct vcpu_svm *svm)
{
struct kvm_run *kvm_run = svm->vcpu.run;
- int r;
u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
/* instruction emulation calls kvm_set_cr8() */
- r = cr_interception(svm);
+ emulate_instruction(&svm->vcpu, 0, 0, 0);
if (irqchip_in_kernel(svm->vcpu.kvm)) {
- clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
- return r;
+ svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
+ return 1;
}
if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
- return r;
+ return 1;
kvm_run->exit_reason = KVM_EXIT_SET_TPR;
return 0;
}
@@ -2847,12 +2157,17 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
switch (ecx) {
case MSR_IA32_TSC: {
- struct vmcb *vmcb = get_host_vmcb(svm);
+ u64 tsc_offset;
- *data = vmcb->control.tsc_offset + kvm_native_read_tsc();
+ if (is_nested(svm))
+ tsc_offset = svm->nested.hsave->control.tsc_offset;
+ else
+ tsc_offset = svm->vmcb->control.tsc_offset;
+
+ *data = tsc_offset + kvm_native_read_tsc();
break;
}
- case MSR_STAR:
+ case MSR_K6_STAR:
*data = svm->vmcb->save.star;
break;
#ifdef CONFIG_X86_64
@@ -2878,11 +2193,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
case MSR_IA32_SYSENTER_ESP:
*data = svm->sysenter_esp;
break;
- /*
- * Nobody will change the following 5 values in the VMCB so we can
- * safely return them on rdmsr. They will always be 0 until LBRV is
- * implemented.
- */
+ /* Nobody will change the following 5 values in the VMCB so
+ we can safely return them on rdmsr. They will always be 0
+ until LBRV is implemented. */
case MSR_IA32_DEBUGCTLMSR:
*data = svm->vmcb->save.dbgctl;
break;
@@ -2902,7 +2215,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
*data = svm->nested.hsave_msr;
break;
case MSR_VM_CR:
- *data = svm->nested.vm_cr_msr;
+ *data = 0;
break;
case MSR_IA32_UCODE_REV:
*data = 0x01000065;
@@ -2932,40 +2245,26 @@ static int rdmsr_interception(struct vcpu_svm *svm)
return 1;
}
-static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
+static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
{
struct vcpu_svm *svm = to_svm(vcpu);
- int svm_dis, chg_mask;
-
- if (data & ~SVM_VM_CR_VALID_MASK)
- return 1;
-
- chg_mask = SVM_VM_CR_VALID_MASK;
- if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
- chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
-
- svm->nested.vm_cr_msr &= ~chg_mask;
- svm->nested.vm_cr_msr |= (data & chg_mask);
-
- svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
-
- /* check for svm_disable while efer.svme is set */
- if (svm_dis && (vcpu->arch.efer & EFER_SVME))
- return 1;
+ switch (ecx) {
+ case MSR_IA32_TSC: {
+ u64 tsc_offset = data - kvm_native_read_tsc();
+ u64 g_tsc_offset = 0;
- return 0;
-}
+ if (is_nested(svm)) {
+ g_tsc_offset = svm->vmcb->control.tsc_offset -
+ svm->nested.hsave->control.tsc_offset;
+ svm->nested.hsave->control.tsc_offset = tsc_offset;
+ }
-static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
+ svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset;
- switch (ecx) {
- case MSR_IA32_TSC:
- kvm_write_tsc(vcpu, data);
break;
- case MSR_STAR:
+ }
+ case MSR_K6_STAR:
svm->vmcb->save.star = data;
break;
#ifdef CONFIG_X86_64
@@ -2994,7 +2293,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
svm->vmcb->save.sysenter_esp = data;
break;
case MSR_IA32_DEBUGCTLMSR:
- if (!boot_cpu_has(X86_FEATURE_LBRV)) {
+ if (!svm_has(SVM_FEATURE_LBRV)) {
pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
__func__, data);
break;
@@ -3003,7 +2302,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
return 1;
svm->vmcb->save.dbgctl = data;
- mark_dirty(svm->vmcb, VMCB_LBR);
if (data & (1ULL<<0))
svm_enable_lbrv(svm);
else
@@ -3013,7 +2311,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
svm->nested.hsave_msr = data;
break;
case MSR_VM_CR:
- return svm_set_vm_cr(vcpu, data);
case MSR_VM_IGNNE:
pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
break;
@@ -3053,10 +2350,8 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
{
struct kvm_run *kvm_run = svm->vcpu.run;
- kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
svm_clear_vintr(svm);
svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
- mark_dirty(svm->vmcb, VMCB_INTR);
/*
* If the user space waits to inject interrupts, exit as soon as
* possible
@@ -3079,42 +2374,43 @@ static int pause_interception(struct vcpu_svm *svm)
}
static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
- [SVM_EXIT_READ_CR0] = cr_interception,
- [SVM_EXIT_READ_CR3] = cr_interception,
- [SVM_EXIT_READ_CR4] = cr_interception,
- [SVM_EXIT_READ_CR8] = cr_interception,
+ [SVM_EXIT_READ_CR0] = emulate_on_interception,
+ [SVM_EXIT_READ_CR3] = emulate_on_interception,
+ [SVM_EXIT_READ_CR4] = emulate_on_interception,
+ [SVM_EXIT_READ_CR8] = emulate_on_interception,
[SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR0] = cr0_write_interception,
- [SVM_EXIT_WRITE_CR3] = cr_interception,
- [SVM_EXIT_WRITE_CR4] = cr_interception,
- [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
- [SVM_EXIT_READ_DR0] = dr_interception,
- [SVM_EXIT_READ_DR1] = dr_interception,
- [SVM_EXIT_READ_DR2] = dr_interception,
- [SVM_EXIT_READ_DR3] = dr_interception,
- [SVM_EXIT_READ_DR4] = dr_interception,
- [SVM_EXIT_READ_DR5] = dr_interception,
- [SVM_EXIT_READ_DR6] = dr_interception,
- [SVM_EXIT_READ_DR7] = dr_interception,
- [SVM_EXIT_WRITE_DR0] = dr_interception,
- [SVM_EXIT_WRITE_DR1] = dr_interception,
- [SVM_EXIT_WRITE_DR2] = dr_interception,
- [SVM_EXIT_WRITE_DR3] = dr_interception,
- [SVM_EXIT_WRITE_DR4] = dr_interception,
- [SVM_EXIT_WRITE_DR5] = dr_interception,
- [SVM_EXIT_WRITE_DR6] = dr_interception,
- [SVM_EXIT_WRITE_DR7] = dr_interception,
+ [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
+ [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
+ [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
+ [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
+ [SVM_EXIT_READ_DR0] = emulate_on_interception,
+ [SVM_EXIT_READ_DR1] = emulate_on_interception,
+ [SVM_EXIT_READ_DR2] = emulate_on_interception,
+ [SVM_EXIT_READ_DR3] = emulate_on_interception,
+ [SVM_EXIT_READ_DR4] = emulate_on_interception,
+ [SVM_EXIT_READ_DR5] = emulate_on_interception,
+ [SVM_EXIT_READ_DR6] = emulate_on_interception,
+ [SVM_EXIT_READ_DR7] = emulate_on_interception,
+ [SVM_EXIT_WRITE_DR0] = emulate_on_interception,
+ [SVM_EXIT_WRITE_DR1] = emulate_on_interception,
+ [SVM_EXIT_WRITE_DR2] = emulate_on_interception,
+ [SVM_EXIT_WRITE_DR3] = emulate_on_interception,
+ [SVM_EXIT_WRITE_DR4] = emulate_on_interception,
+ [SVM_EXIT_WRITE_DR5] = emulate_on_interception,
+ [SVM_EXIT_WRITE_DR6] = emulate_on_interception,
+ [SVM_EXIT_WRITE_DR7] = emulate_on_interception,
[SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
[SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
[SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
- [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
- [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
- [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
- [SVM_EXIT_INTR] = intr_interception,
+ [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
+ [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
+ [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
+ [SVM_EXIT_INTR] = intr_interception,
[SVM_EXIT_NMI] = nmi_interception,
[SVM_EXIT_SMI] = nop_on_interception,
[SVM_EXIT_INIT] = nop_on_interception,
[SVM_EXIT_VINTR] = interrupt_window_interception,
+ /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
[SVM_EXIT_CPUID] = cpuid_interception,
[SVM_EXIT_IRET] = iret_interception,
[SVM_EXIT_INVD] = emulate_on_interception,
@@ -3122,7 +2418,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_HLT] = halt_interception,
[SVM_EXIT_INVLPG] = invlpg_interception,
[SVM_EXIT_INVLPGA] = invlpga_interception,
- [SVM_EXIT_IOIO] = io_interception,
+ [SVM_EXIT_IOIO] = io_interception,
[SVM_EXIT_MSR] = msr_interception,
[SVM_EXIT_TASK_SWITCH] = task_switch_interception,
[SVM_EXIT_SHUTDOWN] = shutdown_interception,
@@ -3136,123 +2432,16 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_WBINVD] = emulate_on_interception,
[SVM_EXIT_MONITOR] = invalid_op_interception,
[SVM_EXIT_MWAIT] = invalid_op_interception,
- [SVM_EXIT_XSETBV] = xsetbv_interception,
[SVM_EXIT_NPF] = pf_interception,
};
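Note: svm_exit_handlers[] is a plain function-pointer table indexed by the VMCB exit code; handle_exit() further below bounds-checks the code and calls the matching entry. A minimal standalone sketch of that dispatch pattern (the exit-code values and handlers here are illustrative, not the real SVM constants):

#include <stdio.h>

typedef int (*exit_handler_t)(void *vcpu_state);

static int handle_cpuid(void *s) { (void)s; puts("cpuid"); return 1; }
static int handle_hlt(void *s)   { (void)s; puts("hlt");   return 1; }

/* Sparse table: entries without a handler stay NULL. */
static exit_handler_t handlers[] = {
	[0x72] = handle_cpuid,   /* illustrative exit-code values */
	[0x78] = handle_hlt,
};

static int dispatch(unsigned exit_code, void *state)
{
	if (exit_code >= sizeof(handlers) / sizeof(handlers[0]) ||
	    !handlers[exit_code])
		return -1;       /* unknown exit: report an error instead */
	return handlers[exit_code](state);
}

The table form keeps dispatch constant-time and makes it easy to see at a glance which exit codes are handled, emulated, or ignored.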
-void dump_vmcb(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb_control_area *control = &svm->vmcb->control;
- struct vmcb_save_area *save = &svm->vmcb->save;
-
- pr_err("VMCB Control Area:\n");
- pr_err("cr_read: %04x\n", control->intercept_cr & 0xffff);
- pr_err("cr_write: %04x\n", control->intercept_cr >> 16);
- pr_err("dr_read: %04x\n", control->intercept_dr & 0xffff);
- pr_err("dr_write: %04x\n", control->intercept_dr >> 16);
- pr_err("exceptions: %08x\n", control->intercept_exceptions);
- pr_err("intercepts: %016llx\n", control->intercept);
- pr_err("pause filter count: %d\n", control->pause_filter_count);
- pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa);
- pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa);
- pr_err("tsc_offset: %016llx\n", control->tsc_offset);
- pr_err("asid: %d\n", control->asid);
- pr_err("tlb_ctl: %d\n", control->tlb_ctl);
- pr_err("int_ctl: %08x\n", control->int_ctl);
- pr_err("int_vector: %08x\n", control->int_vector);
- pr_err("int_state: %08x\n", control->int_state);
- pr_err("exit_code: %08x\n", control->exit_code);
- pr_err("exit_info1: %016llx\n", control->exit_info_1);
- pr_err("exit_info2: %016llx\n", control->exit_info_2);
- pr_err("exit_int_info: %08x\n", control->exit_int_info);
- pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err);
- pr_err("nested_ctl: %lld\n", control->nested_ctl);
- pr_err("nested_cr3: %016llx\n", control->nested_cr3);
- pr_err("event_inj: %08x\n", control->event_inj);
- pr_err("event_inj_err: %08x\n", control->event_inj_err);
- pr_err("lbr_ctl: %lld\n", control->lbr_ctl);
- pr_err("next_rip: %016llx\n", control->next_rip);
- pr_err("VMCB State Save Area:\n");
- pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n",
- save->es.selector, save->es.attrib,
- save->es.limit, save->es.base);
- pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n",
- save->cs.selector, save->cs.attrib,
- save->cs.limit, save->cs.base);
- pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n",
- save->ss.selector, save->ss.attrib,
- save->ss.limit, save->ss.base);
- pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n",
- save->ds.selector, save->ds.attrib,
- save->ds.limit, save->ds.base);
- pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n",
- save->fs.selector, save->fs.attrib,
- save->fs.limit, save->fs.base);
- pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n",
- save->gs.selector, save->gs.attrib,
- save->gs.limit, save->gs.base);
- pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n",
- save->gdtr.selector, save->gdtr.attrib,
- save->gdtr.limit, save->gdtr.base);
- pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n",
- save->ldtr.selector, save->ldtr.attrib,
- save->ldtr.limit, save->ldtr.base);
- pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n",
- save->idtr.selector, save->idtr.attrib,
- save->idtr.limit, save->idtr.base);
- pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n",
- save->tr.selector, save->tr.attrib,
- save->tr.limit, save->tr.base);
- pr_err("cpl: %d efer: %016llx\n",
- save->cpl, save->efer);
- pr_err("cr0: %016llx cr2: %016llx\n",
- save->cr0, save->cr2);
- pr_err("cr3: %016llx cr4: %016llx\n",
- save->cr3, save->cr4);
- pr_err("dr6: %016llx dr7: %016llx\n",
- save->dr6, save->dr7);
- pr_err("rip: %016llx rflags: %016llx\n",
- save->rip, save->rflags);
- pr_err("rsp: %016llx rax: %016llx\n",
- save->rsp, save->rax);
- pr_err("star: %016llx lstar: %016llx\n",
- save->star, save->lstar);
- pr_err("cstar: %016llx sfmask: %016llx\n",
- save->cstar, save->sfmask);
- pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n",
- save->kernel_gs_base, save->sysenter_cs);
- pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n",
- save->sysenter_esp, save->sysenter_eip);
- pr_err("gpat: %016llx dbgctl: %016llx\n",
- save->g_pat, save->dbgctl);
- pr_err("br_from: %016llx br_to: %016llx\n",
- save->br_from, save->br_to);
- pr_err("excp_from: %016llx excp_to: %016llx\n",
- save->last_excp_from, save->last_excp_to);
-
-}
-
-static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
-{
- struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
-
- *info1 = control->exit_info_1;
- *info2 = control->exit_info_2;
-}
-
static int handle_exit(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_run *kvm_run = vcpu->run;
u32 exit_code = svm->vmcb->control.exit_code;
- trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
-
- if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
- vcpu->arch.cr0 = svm->vmcb->save.cr0;
- if (npt_enabled)
- vcpu->arch.cr3 = svm->vmcb->save.cr3;
+ trace_kvm_exit(exit_code, svm->vmcb->save.rip);
if (unlikely(svm->nested.exit_required)) {
nested_svm_vmexit(svm);
@@ -3261,7 +2450,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
return 1;
}
- if (is_guest_mode(vcpu)) {
+ if (is_nested(svm)) {
int vmexit;
trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
@@ -3281,19 +2470,21 @@ static int handle_exit(struct kvm_vcpu *vcpu)
svm_complete_interrupts(svm);
+ if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
+ vcpu->arch.cr0 = svm->vmcb->save.cr0;
+ if (npt_enabled)
+ vcpu->arch.cr3 = svm->vmcb->save.cr3;
+
if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
kvm_run->fail_entry.hardware_entry_failure_reason
= svm->vmcb->control.exit_code;
- pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
- dump_vmcb(vcpu);
return 0;
}
if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
- exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
- exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
+ exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH)
printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
"exit_code 0x%x\n",
__func__, svm->vmcb->control.exit_int_info,
@@ -3324,6 +2515,7 @@ static void pre_svm_run(struct vcpu_svm *svm)
struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
/* FIXME: handle wraparound of asid_generation */
if (svm->asid_generation != sd->asid_generation)
new_asid(svm, sd);
@@ -3335,7 +2527,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
vcpu->arch.hflags |= HF_NMI_MASK;
- set_intercept(svm, INTERCEPT_IRET);
+ svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
++vcpu->stat.nmi_injections;
}
@@ -3343,12 +2535,14 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
{
struct vmcb_control_area *control;
+ trace_kvm_inj_virq(irq);
+
+ ++svm->vcpu.stat.irq_injections;
control = &svm->vmcb->control;
control->int_vector = irq;
control->int_ctl &= ~V_INTR_PRIO_MASK;
control->int_ctl |= V_IRQ_MASK |
((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
- mark_dirty(svm->vmcb, VMCB_INTR);
}
static void svm_set_irq(struct kvm_vcpu *vcpu)
@@ -3357,9 +2551,6 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
BUG_ON(!(gif_set(svm)));
- trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
- ++vcpu->stat.irq_injections;
-
svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
}
@@ -3368,26 +2559,19 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
- return;
-
if (irr == -1)
return;
if (tpr >= irr)
- set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+ svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
}
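
The hunk above keeps the core test of update_cr8_intercept(): CR8 (TPR) writes only need intercepting while the highest pending interrupt is still masked by the task priority. A minimal, compilable sketch of that predicate follows; the function name is hypothetical and only the tpr/irr comparison is taken from the diff.

#include <stdbool.h>
#include <stdio.h>

/* Sketch of the decision shown in update_cr8_intercept(): intercept
 * guest TPR (CR8) writes only while the pending interrupt is masked. */
static bool demo_need_cr8_write_intercept(int tpr, int irr)
{
        if (irr == -1)          /* nothing pending, nothing to watch for */
                return false;
        return tpr >= irr;      /* guest must lower TPR before delivery */
}

int main(void)
{
        printf("%d\n", demo_need_cr8_write_intercept(8, 4)); /* 1: intercept */
        printf("%d\n", demo_need_cr8_write_intercept(2, 4)); /* 0: deliverable */
        return 0;
}
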
static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb;
- int ret;
- ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
- !(svm->vcpu.arch.hflags & HF_NMI_MASK);
- ret = ret && gif_set(svm) && nested_svm_nmi(svm);
-
- return ret;
+ return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
+ !(svm->vcpu.arch.hflags & HF_NMI_MASK);
}
static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -3403,10 +2587,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
if (masked) {
svm->vcpu.arch.hflags |= HF_NMI_MASK;
- set_intercept(svm, INTERCEPT_IRET);
+ svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
} else {
svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
- clr_intercept(svm, INTERCEPT_IRET);
+ svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
}
}
@@ -3422,7 +2606,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
- if (is_guest_mode(vcpu))
+ if (is_nested(svm))
return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
return ret;
@@ -3432,13 +2616,13 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- /*
- * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
- * 1, because that's a separate STGI/VMRUN intercept. The next time we
- * get that intercept, this function will be called again though and
- * we'll get the vintr intercept.
- */
- if (gif_set(svm) && nested_svm_intr(svm)) {
+ nested_svm_intr(svm);
+
+ /* In case GIF=0 we can't rely on the CPU to tell us when
+ * GIF becomes 1, because that's a separate STGI/VMRUN intercept.
+ * The next time we get that intercept, this function will be
+ * called again though and we'll get the vintr intercept. */
+ if (gif_set(svm)) {
svm_set_vintr(svm);
svm_inject_irq(svm, 0x0);
}
@@ -3452,10 +2636,9 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
== HF_NMI_MASK)
return; /* IRET will cause a vm exit */
- /*
- * Something prevents NMI from being injected. Single step over possible
- * problem (IRET or exception injection or interrupt shadow)
- */
+ /* Something prevents NMI from being injected. Single step over
+ possible problem (IRET or exception injection or interrupt
+ shadow) */
svm->nmi_singlestep = true;
svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
update_db_intercept(vcpu);
@@ -3468,12 +2651,7 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
static void svm_flush_tlb(struct kvm_vcpu *vcpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
- svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
- else
- svm->asid_generation--;
+ force_new_asid(vcpu);
}
static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
@@ -3484,10 +2662,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
- return;
-
- if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
+ if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
kvm_set_cr8(vcpu, cr8);
}
@@ -3498,9 +2673,6 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
u64 cr8;
- if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
- return;
-
cr8 = kvm_get_cr8(vcpu);
svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
@@ -3511,14 +2683,9 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
u8 vector;
int type;
u32 exitintinfo = svm->vmcb->control.exit_int_info;
- unsigned int3_injected = svm->int3_injected;
- svm->int3_injected = 0;
-
- if (svm->vcpu.arch.hflags & HF_IRET_MASK) {
+ if (svm->vcpu.arch.hflags & HF_IRET_MASK)
svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
- kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
- }
svm->vcpu.arch.nmi_injected = false;
kvm_clear_exception_queue(&svm->vcpu);
@@ -3527,8 +2694,6 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
if (!(exitintinfo & SVM_EXITINTINFO_VALID))
return;
- kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
-
vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
@@ -3537,25 +2702,18 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
svm->vcpu.arch.nmi_injected = true;
break;
case SVM_EXITINTINFO_TYPE_EXEPT:
- /*
- * In case of software exceptions, do not reinject the vector,
- * but re-execute the instruction instead. Rewind RIP first
- * if we emulated INT3 before.
- */
- if (kvm_exception_is_soft(vector)) {
- if (vector == BP_VECTOR && int3_injected &&
- kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
- kvm_rip_write(&svm->vcpu,
- kvm_rip_read(&svm->vcpu) -
- int3_injected);
+ /* In case of a software exception do not reinject the exception
+ vector, but re-execute the instruction instead */
+ if (is_nested(svm))
+ break;
+ if (kvm_exception_is_soft(vector))
break;
- }
if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
u32 err = svm->vmcb->control.exit_int_info_err;
- kvm_requeue_exception_e(&svm->vcpu, vector, err);
+ kvm_queue_exception_e(&svm->vcpu, vector, err);
} else
- kvm_requeue_exception(&svm->vcpu, vector);
+ kvm_queue_exception(&svm->vcpu, vector);
break;
case SVM_EXITINTINFO_TYPE_INTR:
kvm_queue_interrupt(&svm->vcpu, vector, false);
@@ -3565,17 +2723,6 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
}
}
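
svm_complete_interrupts() above re-queues whatever event was in flight at #VMEXIT time by picking apart the EXITINTINFO word (valid bit, type, vector, optional error code). The stand-alone decoder below is only an illustration; the field masks are assumptions stated in the comments, not values copied from <asm/svm.h>.

#include <stdint.h>
#include <stdio.h>

/* Assumed field layout for illustration; the authoritative masks are
 * the SVM_EXITINTINFO_* definitions in <asm/svm.h>. */
#define DEMO_VEC_MASK    0x000000ffu
#define DEMO_TYPE_MASK   0x00000700u
#define DEMO_VALID       0x80000000u
#define DEMO_VALID_ERR   0x00000800u

static void demo_decode_exitintinfo(uint32_t info, uint32_t err)
{
        if (!(info & DEMO_VALID)) {
                puts("no event was being delivered");
                return;
        }
        printf("vector %u, type %u%s\n",
               info & DEMO_VEC_MASK,
               (info & DEMO_TYPE_MASK) >> 8,
               (info & DEMO_VALID_ERR) ? " (with error code)" : "");
        if (info & DEMO_VALID_ERR)
                printf("error code %#x\n", err);
}

int main(void)
{
        demo_decode_exitintinfo(0x80000b0eu, 0x2); /* e.g. a #PF with error code */
        return 0;
}
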
-static void svm_cancel_injection(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb_control_area *control = &svm->vmcb->control;
-
- control->exit_int_info = control->event_inj;
- control->exit_int_info_err = control->event_inj_err;
- control->event_inj = 0;
- svm_complete_interrupts(svm);
-}
-
#ifdef CONFIG_X86_64
#define R "r"
#else
@@ -3585,10 +2732,9 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
static void svm_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
- svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
- svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+ u16 fs_selector;
+ u16 gs_selector;
+ u16 ldt_selector;
/*
* A vmexit emulation is required before the vcpu can be executed
@@ -3597,11 +2743,22 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
if (unlikely(svm->nested.exit_required))
return;
+ svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+ svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
pre_svm_run(svm);
sync_lapic_to_cr8(vcpu);
+ save_host_msrs(vcpu);
+ fs_selector = kvm_read_fs();
+ gs_selector = kvm_read_gs();
+ ldt_selector = kvm_read_ldt();
svm->vmcb->save.cr2 = vcpu->arch.cr2;
+ /* required for live migration with NPT */
+ if (npt_enabled)
+ svm->vmcb->save.cr3 = vcpu->arch.cr3;
clgi();
@@ -3678,11 +2835,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
#endif
);
-#ifdef CONFIG_X86_64
- wrmsrl(MSR_GS_BASE, svm->host.gs_base);
-#else
- loadsegment(fs, svm->host.fs);
-#endif
+ vcpu->arch.cr2 = svm->vmcb->save.cr2;
+ vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+ vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+ vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
+
+ kvm_load_fs(fs_selector);
+ kvm_load_gs(gs_selector);
+ kvm_load_ldt(ldt_selector);
+ load_host_msrs(vcpu);
reload_tss(vcpu);
@@ -3690,35 +2851,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
stgi();
- vcpu->arch.cr2 = svm->vmcb->save.cr2;
- vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
- vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
- vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
-
sync_cr8_to_lapic(vcpu);
svm->next_rip = 0;
- svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
-
- /* if exit due to PF check for async PF */
- if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
- svm->apf_reason = kvm_read_and_reset_pf_reason();
-
if (npt_enabled) {
vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
}
-
- /*
- * We need to handle MC intercepts here before the vcpu has a chance to
- * change the physical cpu
- */
- if (unlikely(svm->vmcb->control.exit_code ==
- SVM_EXIT_EXCP_BASE + MC_VECTOR))
- svm_handle_mce(svm);
-
- mark_all_clean(svm->vmcb);
}
#undef R
@@ -3727,23 +2867,14 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->save.cr3 = root;
- mark_dirty(svm->vmcb, VMCB_CR);
- svm_flush_tlb(vcpu);
-}
-
-static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->control.nested_cr3 = root;
- mark_dirty(svm->vmcb, VMCB_NPT);
-
- /* Also sync guest cr3 here in case we live migrate */
- svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
- mark_dirty(svm->vmcb, VMCB_CR);
+ if (npt_enabled) {
+ svm->vmcb->control.nested_cr3 = root;
+ force_new_asid(vcpu);
+ return;
+ }
- svm_flush_tlb(vcpu);
+ svm->vmcb->save.cr3 = root;
+ force_new_asid(vcpu);
}
static int is_disabled(void)
@@ -3778,61 +2909,43 @@ static bool svm_cpu_has_accelerated_tpr(void)
return false;
}
-static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+static int get_npt_level(void)
{
- return 0;
+#ifdef CONFIG_X86_64
+ return PT64_ROOT_LEVEL;
+#else
+ return PT32E_ROOT_LEVEL;
+#endif
}
-static void svm_cpuid_update(struct kvm_vcpu *vcpu)
+static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
+ return 0;
}
-static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
+static void svm_cpuid_update(struct kvm_vcpu *vcpu)
{
- switch (func) {
- case 0x80000001:
- if (nested)
- entry->ecx |= (1 << 2); /* Set SVM bit */
- break;
- case 0x8000000A:
- entry->eax = 1; /* SVM revision 1 */
- entry->ebx = 8; /* Let's support 8 ASIDs in case we add proper
- ASID emulation to nested SVM */
- entry->ecx = 0; /* Reserved */
- entry->edx = 0; /* Per default do not support any
- additional features */
-
- /* Support next_rip if host supports it */
- if (boot_cpu_has(X86_FEATURE_NRIPS))
- entry->edx |= SVM_FEATURE_NRIP;
-
- /* Support NPT for the guest if enabled */
- if (npt_enabled)
- entry->edx |= SVM_FEATURE_NPT;
-
- break;
- }
}
static const struct trace_print_flags svm_exit_reasons_str[] = {
- { SVM_EXIT_READ_CR0, "read_cr0" },
- { SVM_EXIT_READ_CR3, "read_cr3" },
- { SVM_EXIT_READ_CR4, "read_cr4" },
- { SVM_EXIT_READ_CR8, "read_cr8" },
- { SVM_EXIT_WRITE_CR0, "write_cr0" },
- { SVM_EXIT_WRITE_CR3, "write_cr3" },
- { SVM_EXIT_WRITE_CR4, "write_cr4" },
- { SVM_EXIT_WRITE_CR8, "write_cr8" },
- { SVM_EXIT_READ_DR0, "read_dr0" },
- { SVM_EXIT_READ_DR1, "read_dr1" },
- { SVM_EXIT_READ_DR2, "read_dr2" },
- { SVM_EXIT_READ_DR3, "read_dr3" },
- { SVM_EXIT_WRITE_DR0, "write_dr0" },
- { SVM_EXIT_WRITE_DR1, "write_dr1" },
- { SVM_EXIT_WRITE_DR2, "write_dr2" },
- { SVM_EXIT_WRITE_DR3, "write_dr3" },
- { SVM_EXIT_WRITE_DR5, "write_dr5" },
- { SVM_EXIT_WRITE_DR7, "write_dr7" },
+ { SVM_EXIT_READ_CR0, "read_cr0" },
+ { SVM_EXIT_READ_CR3, "read_cr3" },
+ { SVM_EXIT_READ_CR4, "read_cr4" },
+ { SVM_EXIT_READ_CR8, "read_cr8" },
+ { SVM_EXIT_WRITE_CR0, "write_cr0" },
+ { SVM_EXIT_WRITE_CR3, "write_cr3" },
+ { SVM_EXIT_WRITE_CR4, "write_cr4" },
+ { SVM_EXIT_WRITE_CR8, "write_cr8" },
+ { SVM_EXIT_READ_DR0, "read_dr0" },
+ { SVM_EXIT_READ_DR1, "read_dr1" },
+ { SVM_EXIT_READ_DR2, "read_dr2" },
+ { SVM_EXIT_READ_DR3, "read_dr3" },
+ { SVM_EXIT_WRITE_DR0, "write_dr0" },
+ { SVM_EXIT_WRITE_DR1, "write_dr1" },
+ { SVM_EXIT_WRITE_DR2, "write_dr2" },
+ { SVM_EXIT_WRITE_DR3, "write_dr3" },
+ { SVM_EXIT_WRITE_DR5, "write_dr5" },
+ { SVM_EXIT_WRITE_DR7, "write_dr7" },
{ SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" },
{ SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" },
{ SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" },
@@ -3863,7 +2976,6 @@ static const struct trace_print_flags svm_exit_reasons_str[] = {
{ SVM_EXIT_WBINVD, "wbinvd" },
{ SVM_EXIT_MONITOR, "monitor" },
{ SVM_EXIT_MWAIT, "mwait" },
- { SVM_EXIT_XSETBV, "xsetbv" },
{ SVM_EXIT_NPF, "npf" },
{ -1, NULL }
};
@@ -3878,17 +2990,12 @@ static bool svm_rdtscp_supported(void)
return false;
}
-static bool svm_has_wbinvd_exit(void)
-{
- return true;
-}
-
static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- set_exception_intercept(svm, NM_VECTOR);
update_cr0_intercept(svm);
+ svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
}
static struct kvm_x86_ops svm_x86_ops = {
@@ -3918,7 +3025,6 @@ static struct kvm_x86_ops svm_x86_ops = {
.get_cpl = svm_get_cpl,
.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
.decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
- .decache_cr3 = svm_decache_cr3,
.decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
.set_cr0 = svm_set_cr0,
.set_cr3 = svm_set_cr3,
@@ -3928,7 +3034,8 @@ static struct kvm_x86_ops svm_x86_ops = {
.set_idt = svm_set_idt,
.get_gdt = svm_get_gdt,
.set_gdt = svm_set_gdt,
- .set_dr7 = svm_set_dr7,
+ .get_dr = svm_get_dr,
+ .set_dr = svm_set_dr,
.cache_reg = svm_cache_reg,
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
@@ -3946,7 +3053,6 @@ static struct kvm_x86_ops svm_x86_ops = {
.set_irq = svm_set_irq,
.set_nmi = svm_inject_nmi,
.queue_exception = svm_queue_exception,
- .cancel_injection = svm_cancel_injection,
.interrupt_allowed = svm_interrupt_allowed,
.nmi_allowed = svm_nmi_allowed,
.get_nmi_mask = svm_get_nmi_mask,
@@ -3959,29 +3065,18 @@ static struct kvm_x86_ops svm_x86_ops = {
.get_tdp_level = get_npt_level,
.get_mt_mask = svm_get_mt_mask,
- .get_exit_info = svm_get_exit_info,
.exit_reasons_str = svm_exit_reasons_str,
-
.get_lpage_level = svm_get_lpage_level,
.cpuid_update = svm_cpuid_update,
.rdtscp_supported = svm_rdtscp_supported,
-
- .set_supported_cpuid = svm_set_supported_cpuid,
-
- .has_wbinvd_exit = svm_has_wbinvd_exit,
-
- .write_tsc_offset = svm_write_tsc_offset,
- .adjust_tsc_offset = svm_adjust_tsc_offset,
-
- .set_tdp_cr3 = set_tdp_cr3,
};
static int __init svm_init(void)
{
return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
- __alignof__(struct vcpu_svm), THIS_MODULE);
+ THIS_MODULE);
}
static void __exit svm_exit(void)
diff --git a/linux/x86/timer.c b/linux/x86/timer.c
index 1fa2aa0..1a67e02 100644
--- a/linux/x86/timer.c
+++ b/linux/x86/timer.c
@@ -38,20 +38,6 @@
#endif
#endif
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * timer support
- *
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- */
-
#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/hrtimer.h>
@@ -66,13 +52,12 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
/*
* There is a race window between reading and incrementing, but we do
* not care about potentially losing timer events in the !reinject
- * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
- * in vcpu_enter_guest.
+ * case anyway.
*/
if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
atomic_inc(&ktimer->pending);
/* FIXME: this code should not know anything about vcpus */
- kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+ set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
}
if (waitqueue_active(q))
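
The restored __kvm_timer_fn() hunk above deliberately tolerates a read-then-increment race on ktimer->pending: with reinjection disabled, an occasionally lost tick is acceptable. Below is a user-space sketch of that coalescing policy, with hypothetical names and C11 atomics standing in for the kernel's atomic_t.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_timer {
        atomic_int pending;   /* expiries not yet delivered to the guest */
        bool reinject;        /* count every tick vs. coalesce extras */
};

/* Mirrors the restored logic: with reinject, every expiry is counted;
 * without it, further expiries are coalesced while one is pending.
 * The load and the increment are not one atomic step, which can lose a
 * tick in the !reinject case; that loss is explicitly tolerated. */
static void demo_timer_fire(struct demo_timer *t)
{
        if (t->reinject || atomic_load(&t->pending) == 0)
                atomic_fetch_add(&t->pending, 1);
}

int main(void)
{
        struct demo_timer t;

        atomic_init(&t.pending, 0);
        t.reinject = false;
        demo_timer_fire(&t);
        demo_timer_fire(&t);            /* coalesced: still 1 pending */
        printf("pending = %d\n", atomic_load(&t.pending));
        return 0;
}
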
diff --git a/linux/x86/trace.h b/linux/x86/trace.h
index 1357d7c..6ad30a2 100644
--- a/linux/x86/trace.h
+++ b/linux/x86/trace.h
@@ -5,6 +5,8 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvm
+#define TRACE_INCLUDE_PATH arch/x86/kvm
+#define TRACE_INCLUDE_FILE trace
/*
* Tracepoint for guest mode entry.
@@ -178,36 +180,27 @@ TRACE_EVENT(kvm_apic,
#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val)
#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val)
-#define KVM_ISA_VMX 1
-#define KVM_ISA_SVM 2
-
/*
* Tracepoint for kvm guest exit:
*/
TRACE_EVENT(kvm_exit,
- TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa),
- TP_ARGS(exit_reason, vcpu, isa),
+ TP_PROTO(unsigned int exit_reason, unsigned long guest_rip),
+ TP_ARGS(exit_reason, guest_rip),
TP_STRUCT__entry(
__field( unsigned int, exit_reason )
__field( unsigned long, guest_rip )
- __field( u32, isa )
- __field( u64, info1 )
- __field( u64, info2 )
),
TP_fast_assign(
__entry->exit_reason = exit_reason;
- __entry->guest_rip = kvm_rip_read(vcpu);
- __entry->isa = isa;
- kvm_x86_ops->get_exit_info(vcpu, &__entry->info1,
- &__entry->info2);
+ __entry->guest_rip = guest_rip;
),
- TP_printk("reason %s rip 0x%lx info %llx %llx",
+ TP_printk("reason %s rip 0x%lx",
ftrace_print_symbols_seq(p, __entry->exit_reason,
kvm_x86_ops->exit_reasons_str),
- __entry->guest_rip, __entry->info1, __entry->info2)
+ __entry->guest_rip)
);
/*
@@ -228,38 +221,6 @@ TRACE_EVENT(kvm_inj_virq,
TP_printk("irq %u", __entry->irq)
);
-#define EXS(x) { x##_VECTOR, "#" #x }
-
-#define kvm_trace_sym_exc \
- EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \
- EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \
- EXS(MF), EXS(MC)
-
-/*
- * Tracepoint for kvm interrupt injection:
- */
-TRACE_EVENT(kvm_inj_exception,
- TP_PROTO(unsigned exception, bool has_error, unsigned error_code),
- TP_ARGS(exception, has_error, error_code),
-
- TP_STRUCT__entry(
- __field( u8, exception )
- __field( u8, has_error )
- __field( u32, error_code )
- ),
-
- TP_fast_assign(
- __entry->exception = exception;
- __entry->has_error = has_error;
- __entry->error_code = error_code;
- ),
-
- TP_printk("%s (0x%x)",
- __print_symbolic(__entry->exception, kvm_trace_sym_exc),
- /* FIXME: don't print error_code if not present */
- __entry->has_error ? __entry->error_code : 0)
-);
-
/*
* Tracepoint for page fault.
*/
@@ -452,34 +413,12 @@ TRACE_EVENT(kvm_nested_vmrun,
),
TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
- "event_inj: 0x%08x npt: %s",
+ "event_inj: 0x%08x npt: %s\n",
__entry->rip, __entry->vmcb, __entry->nested_rip,
__entry->int_ctl, __entry->event_inj,
__entry->npt ? "on" : "off")
);
-TRACE_EVENT(kvm_nested_intercepts,
- TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept),
- TP_ARGS(cr_read, cr_write, exceptions, intercept),
-
- TP_STRUCT__entry(
- __field( __u16, cr_read )
- __field( __u16, cr_write )
- __field( __u32, exceptions )
- __field( __u64, intercept )
- ),
-
- TP_fast_assign(
- __entry->cr_read = cr_read;
- __entry->cr_write = cr_write;
- __entry->exceptions = exceptions;
- __entry->intercept = intercept;
- ),
-
- TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx",
- __entry->cr_read, __entry->cr_write, __entry->exceptions,
- __entry->intercept)
-);
/*
* Tracepoint for #VMEXIT while nested
*/
@@ -508,7 +447,7 @@ TRACE_EVENT(kvm_nested_vmexit,
__entry->exit_int_info_err = exit_int_info_err;
),
TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
- "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
+ "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
__entry->rip,
ftrace_print_symbols_seq(p, __entry->exit_code,
kvm_x86_ops->exit_reasons_str),
@@ -543,7 +482,7 @@ TRACE_EVENT(kvm_nested_vmexit_inject,
),
TP_printk("reason: %s ext_inf1: 0x%016llx "
- "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
+ "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
ftrace_print_symbols_seq(p, __entry->exit_code,
kvm_x86_ops->exit_reasons_str),
__entry->exit_info1, __entry->exit_info2,
@@ -565,7 +504,7 @@ TRACE_EVENT(kvm_nested_intr_vmexit,
__entry->rip = rip
),
- TP_printk("rip: 0x%016llx", __entry->rip)
+ TP_printk("rip: 0x%016llx\n", __entry->rip)
);
/*
@@ -587,7 +526,7 @@ TRACE_EVENT(kvm_invlpga,
__entry->address = address;
),
- TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx",
+ TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n",
__entry->rip, __entry->asid, __entry->address)
);
@@ -608,102 +547,11 @@ TRACE_EVENT(kvm_skinit,
__entry->slb = slb;
),
- TP_printk("rip: 0x%016llx slb: 0x%08x",
+ TP_printk("rip: 0x%016llx slb: 0x%08x\n",
__entry->rip, __entry->slb)
);
-#define __print_insn(insn, ilen) ({ \
- int i; \
- const char *ret = p->buffer + p->len; \
- \
- for (i = 0; i < ilen; ++i) \
- trace_seq_printf(p, " %02x", insn[i]); \
- trace_seq_printf(p, "%c", 0); \
- ret; \
- })
-
-#define KVM_EMUL_INSN_F_CR0_PE (1 << 0)
-#define KVM_EMUL_INSN_F_EFL_VM (1 << 1)
-#define KVM_EMUL_INSN_F_CS_D (1 << 2)
-#define KVM_EMUL_INSN_F_CS_L (1 << 3)
-
-#define kvm_trace_symbol_emul_flags \
- { 0, "real" }, \
- { KVM_EMUL_INSN_F_CR0_PE \
- | KVM_EMUL_INSN_F_EFL_VM, "vm16" }, \
- { KVM_EMUL_INSN_F_CR0_PE, "prot16" }, \
- { KVM_EMUL_INSN_F_CR0_PE \
- | KVM_EMUL_INSN_F_CS_D, "prot32" }, \
- { KVM_EMUL_INSN_F_CR0_PE \
- | KVM_EMUL_INSN_F_CS_L, "prot64" }
-
-#define kei_decode_mode(mode) ({ \
- u8 flags = 0xff; \
- switch (mode) { \
- case X86EMUL_MODE_REAL: \
- flags = 0; \
- break; \
- case X86EMUL_MODE_VM86: \
- flags = KVM_EMUL_INSN_F_EFL_VM; \
- break; \
- case X86EMUL_MODE_PROT16: \
- flags = KVM_EMUL_INSN_F_CR0_PE; \
- break; \
- case X86EMUL_MODE_PROT32: \
- flags = KVM_EMUL_INSN_F_CR0_PE \
- | KVM_EMUL_INSN_F_CS_D; \
- break; \
- case X86EMUL_MODE_PROT64: \
- flags = KVM_EMUL_INSN_F_CR0_PE \
- | KVM_EMUL_INSN_F_CS_L; \
- break; \
- } \
- flags; \
- })
-
-TRACE_EVENT(kvm_emulate_insn,
- TP_PROTO(struct kvm_vcpu *vcpu, __u8 failed),
- TP_ARGS(vcpu, failed),
-
- TP_STRUCT__entry(
- __field( __u64, rip )
- __field( __u32, csbase )
- __field( __u8, len )
- __array( __u8, insn, 15 )
- __field( __u8, flags )
- __field( __u8, failed )
- ),
-
- TP_fast_assign(
- __entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start;
- __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
- __entry->len = vcpu->arch.emulate_ctxt.decode.eip
- - vcpu->arch.emulate_ctxt.decode.fetch.start;
- memcpy(__entry->insn,
- vcpu->arch.emulate_ctxt.decode.fetch.data,
- 15);
- __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode);
- __entry->failed = failed;
- ),
-
- TP_printk("%x:%llx:%s (%s)%s",
- __entry->csbase, __entry->rip,
- __print_insn(__entry->insn, __entry->len),
- __print_symbolic(__entry->flags,
- kvm_trace_symbol_emul_flags),
- __entry->failed ? " failed" : ""
- )
- );
-
-#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
-#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
-
#endif /* _TRACE_KVM_H */
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH arch/x86/kvm
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_FILE trace
-
/* This part must be outside protection */
#include <trace/define_trace.h>
diff --git a/linux/x86/vmx.c b/linux/x86/vmx.c
index ed4cf97..e29837e 100644
--- a/linux/x86/vmx.c
+++ b/linux/x86/vmx.c
@@ -45,7 +45,6 @@
* machines without emulation or binary translation.
*
* Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Avi Kivity <avi@qumranet.com>
@@ -68,7 +67,6 @@
#include <linux/moduleparam.h>
#include <linux/ftrace_event.h>
#include <linux/slab.h>
-#include <linux/tboot.h>
#include "kvm_cache_regs.h"
#include "x86.h"
@@ -77,14 +75,12 @@
#include <asm/vmx.h>
#include <asm/virtext.h>
#include <asm/mce.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
#include "trace.h"
#define __ex(x) __kvm_handle_fault_on_reboot(x)
-MODULE_INFO(version, "kvm-kmod-2.6.38-rc7");
+MODULE_INFO(version, "kvm-kmod-2.6.34");
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
@@ -107,12 +103,6 @@ module_param_named(unrestricted_guest,
static int __read_mostly emulate_invalid_guest_state = 0;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);
-static int __read_mostly vmm_exclusive = 1;
-module_param(vmm_exclusive, bool, S_IRUGO);
-
-static int __read_mostly yield_on_hlt = 1;
-module_param(yield_on_hlt, bool, S_IRUGO);
-
#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
#define KVM_GUEST_CR0_MASK \
@@ -149,8 +139,6 @@ module_param(ple_gap, int, S_IRUGO);
static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, int, S_IRUGO);
-#define NR_AUTOLOAD_MSRS 1
-
struct vmcs {
u32 revision_id;
u32 abort;
@@ -169,7 +157,6 @@ struct vcpu_vmx {
unsigned long host_rsp;
int launched;
u8 fail;
- u32 exit_intr_info;
u32 idt_vectoring_info;
struct shared_msr_entry *guest_msrs;
int nmsrs;
@@ -179,11 +166,6 @@ struct vcpu_vmx {
u64 msr_guest_kernel_gs_base;
#endif
struct vmcs *vmcs;
- struct msr_autoload {
- unsigned nr;
- struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
- struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
- } msr_autoload;
struct {
int loaded;
u16 fs_sel, gs_sel, ldt_sel;
@@ -199,6 +181,11 @@ struct vcpu_vmx {
u32 limit;
u32 ar;
} tr, es, ds, fs, gs;
+ struct {
+ bool pending;
+ u8 vector;
+ unsigned rip;
+ } irq;
} rmode;
int vpid;
bool emulation_required;
@@ -219,22 +206,16 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
static int init_rmode(struct kvm *kvm);
static u64 construct_eptp(unsigned long root_hpa);
-static void kvm_cpu_vmxon(u64 addr);
-static void kvm_cpu_vmxoff(void);
-static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
-static DEFINE_PER_CPU(struct kvm_desc_ptr, host_gdt);
static unsigned long *vmx_io_bitmap_a;
static unsigned long *vmx_io_bitmap_b;
static unsigned long *vmx_msr_bitmap_legacy;
static unsigned long *vmx_msr_bitmap_longmode;
-static bool cpu_has_load_ia32_efer;
-
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);
@@ -283,67 +264,67 @@ static u64 host_efer;
static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
/*
- * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
+ * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
* away by decrementing the array size.
*/
static const u32 vmx_msr_index[] = {
#ifdef CONFIG_X86_64
MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
- MSR_EFER, MSR_TSC_AUX, MSR_STAR,
+ MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR,
};
#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
-static inline bool is_page_fault(u32 intr_info)
+static inline int is_page_fault(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
INTR_INFO_VALID_MASK)) ==
(INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
}
-static inline bool is_no_device(u32 intr_info)
+static inline int is_no_device(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
INTR_INFO_VALID_MASK)) ==
(INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
}
-static inline bool is_invalid_opcode(u32 intr_info)
+static inline int is_invalid_opcode(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
INTR_INFO_VALID_MASK)) ==
(INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
}
-static inline bool is_external_interrupt(u32 intr_info)
+static inline int is_external_interrupt(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
}
-static inline bool is_machine_check(u32 intr_info)
+static inline int is_machine_check(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
INTR_INFO_VALID_MASK)) ==
(INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
}
-static inline bool cpu_has_vmx_msr_bitmap(void)
+static inline int cpu_has_vmx_msr_bitmap(void)
{
return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
}
-static inline bool cpu_has_vmx_tpr_shadow(void)
+static inline int cpu_has_vmx_tpr_shadow(void)
{
return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
}
-static inline bool vm_need_tpr_shadow(struct kvm *kvm)
+static inline int vm_need_tpr_shadow(struct kvm *kvm)
{
return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
}
-static inline bool cpu_has_secondary_exec_ctrls(void)
+static inline int cpu_has_secondary_exec_ctrls(void)
{
return vmcs_config.cpu_based_exec_ctrl &
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -363,105 +344,84 @@ static inline bool cpu_has_vmx_flexpriority(void)
static inline bool cpu_has_vmx_ept_execute_only(void)
{
- return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
+ return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT);
}
static inline bool cpu_has_vmx_eptp_uncacheable(void)
{
- return vmx_capability.ept & VMX_EPTP_UC_BIT;
+ return !!(vmx_capability.ept & VMX_EPTP_UC_BIT);
}
static inline bool cpu_has_vmx_eptp_writeback(void)
{
- return vmx_capability.ept & VMX_EPTP_WB_BIT;
+ return !!(vmx_capability.ept & VMX_EPTP_WB_BIT);
}
static inline bool cpu_has_vmx_ept_2m_page(void)
{
- return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
+ return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
}
static inline bool cpu_has_vmx_ept_1g_page(void)
{
- return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
-}
-
-static inline bool cpu_has_vmx_ept_4levels(void)
-{
- return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
-}
-
-static inline bool cpu_has_vmx_invept_individual_addr(void)
-{
- return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
+ return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT);
}
-static inline bool cpu_has_vmx_invept_context(void)
+static inline int cpu_has_vmx_invept_individual_addr(void)
{
- return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
+ return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
}
-static inline bool cpu_has_vmx_invept_global(void)
+static inline int cpu_has_vmx_invept_context(void)
{
- return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
+ return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT);
}
-static inline bool cpu_has_vmx_invvpid_single(void)
+static inline int cpu_has_vmx_invept_global(void)
{
- return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
+ return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT);
}
-static inline bool cpu_has_vmx_invvpid_global(void)
-{
- return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
-}
-
-static inline bool cpu_has_vmx_ept(void)
+static inline int cpu_has_vmx_ept(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_ENABLE_EPT;
}
-static inline bool cpu_has_vmx_unrestricted_guest(void)
+static inline int cpu_has_vmx_unrestricted_guest(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_UNRESTRICTED_GUEST;
}
-static inline bool cpu_has_vmx_ple(void)
+static inline int cpu_has_vmx_ple(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_PAUSE_LOOP_EXITING;
}
-static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
+static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
{
return flexpriority_enabled && irqchip_in_kernel(kvm);
}
-static inline bool cpu_has_vmx_vpid(void)
+static inline int cpu_has_vmx_vpid(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_ENABLE_VPID;
}
-static inline bool cpu_has_vmx_rdtscp(void)
+static inline int cpu_has_vmx_rdtscp(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_RDTSCP;
}
-static inline bool cpu_has_virtual_nmis(void)
+static inline int cpu_has_virtual_nmis(void)
{
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
}
-static inline bool cpu_has_vmx_wbinvd_exit(void)
-{
- return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_WBINVD_EXITING;
-}
-
static inline bool report_flexpriority(void)
{
return flexpriority_enabled;
@@ -519,26 +479,13 @@ static void vmcs_clear(struct vmcs *vmcs)
u8 error;
asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
- : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
+ : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
: "cc", "memory");
if (error)
printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
vmcs, phys_addr);
}
-static void vmcs_load(struct vmcs *vmcs)
-{
- u64 phys_addr = __pa(vmcs);
- u8 error;
-
- asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
- : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
- : "cc", "memory");
- if (error)
- printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
- vmcs, phys_addr);
-}
-
static void __vcpu_clear(void *arg)
{
struct vcpu_vmx *vmx = arg;
@@ -548,6 +495,7 @@ static void __vcpu_clear(void *arg)
vmcs_clear(vmx->vmcs);
if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
per_cpu(current_vmcs, cpu) = NULL;
+ rdtscll(vmx->vcpu.arch.host_tsc);
list_del(&vmx->local_vcpus_link);
vmx->vcpu.cpu = -1;
vmx->launched = 0;
@@ -560,27 +508,12 @@ static void vcpu_clear(struct vcpu_vmx *vmx)
smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
}
-static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
+static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
{
if (vmx->vpid == 0)
return;
- if (cpu_has_vmx_invvpid_single())
- __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
-}
-
-static inline void vpid_sync_vcpu_global(void)
-{
- if (cpu_has_vmx_invvpid_global())
- __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
-}
-
-static inline void vpid_sync_context(struct vcpu_vmx *vmx)
-{
- if (cpu_has_vmx_invvpid_single())
- vpid_sync_vcpu_single(vmx);
- else
- vpid_sync_vcpu_global();
+ __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
}
static inline void ept_sync_global(void)
@@ -612,10 +545,10 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
static unsigned long vmcs_readl(unsigned long field)
{
- unsigned long value = 0;
+ unsigned long value;
asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
- : "+a"(value) : "d"(field) : "cc");
+ : "=a"(value) : "d"(field) : "cc");
return value;
}
@@ -703,69 +636,16 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
vmcs_write32(EXCEPTION_BITMAP, eb);
}
-static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
-{
- unsigned i;
- struct msr_autoload *m = &vmx->msr_autoload;
-
- if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
- vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
- vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
- return;
- }
-
- for (i = 0; i < m->nr; ++i)
- if (m->guest[i].index == msr)
- break;
-
- if (i == m->nr)
- return;
- --m->nr;
- m->guest[i] = m->guest[m->nr];
- m->host[i] = m->host[m->nr];
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
-}
-
-static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
- u64 guest_val, u64 host_val)
-{
- unsigned i;
- struct msr_autoload *m = &vmx->msr_autoload;
-
- if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
- vmcs_write64(GUEST_IA32_EFER, guest_val);
- vmcs_write64(HOST_IA32_EFER, host_val);
- vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
- vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
- return;
- }
-
- for (i = 0; i < m->nr; ++i)
- if (m->guest[i].index == msr)
- break;
-
- if (i == m->nr) {
- ++m->nr;
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
- }
-
- m->guest[i].index = msr;
- m->guest[i].value = guest_val;
- m->host[i].index = msr;
- m->host[i].value = host_val;
-}
-
static void reload_tss(void)
{
/*
* VT restores TR but not its size. Useless.
*/
- struct kvm_desc_ptr *gdt = &__get_cpu_var(host_gdt);
+ struct descriptor_table gdt;
struct kvm_desc_struct *descs;
- descs = (void *)gdt->address;
+ kvm_get_gdt(&gdt);
+ descs = (void *)gdt.base;
descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
load_TR_desc();
}
@@ -792,56 +672,9 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
guest_efer |= host_efer & ignore_bits;
vmx->guest_msrs[efer_offset].data = guest_efer;
vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
-
- clear_atomic_switch_msr(vmx, MSR_EFER);
- /* On ept, can't emulate nx, and must switch nx atomically */
- if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) {
- guest_efer = vmx->vcpu.arch.efer;
- if (!(guest_efer & EFER_LMA))
- guest_efer &= ~EFER_LME;
- add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
- return false;
- }
-
return true;
}
-static unsigned long segment_base(u16 selector)
-{
- struct kvm_desc_ptr *gdt = &__get_cpu_var(host_gdt);
- struct kvm_desc_struct *d;
- unsigned long table_base;
- unsigned long v;
-
- if (!(selector & ~3))
- return 0;
-
- table_base = gdt->address;
-
- if (selector & 4) { /* from ldt */
- u16 ldt_selector = kvm_read_ldt();
-
- if (!(ldt_selector & ~3))
- return 0;
-
- table_base = segment_base(ldt_selector);
- }
- d = (struct kvm_desc_struct *)(table_base + (selector & ~7));
- v = kvm_get_desc_base(d);
-#ifdef CONFIG_X86_64
- if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
- v |= ((unsigned long)((struct kvm_ldttss_desc64 *)d)->base3) << 32;
-#endif
- return v;
-}
-
-static inline unsigned long kvm_read_tr_base(void)
-{
- u16 tr;
- asm("str %0" : "=g"(tr));
- return segment_base(tr);
-}
-
static void vmx_save_host_state(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -857,7 +690,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
*/
vmx->host_state.ldt_sel = kvm_read_ldt();
vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
- savesegment(fs, vmx->host_state.fs_sel);
+ vmx->host_state.fs_sel = kvm_read_fs();
if (!(vmx->host_state.fs_sel & 7)) {
vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
vmx->host_state.fs_reload_needed = 0;
@@ -865,7 +698,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
vmcs_write16(HOST_FS_SELECTOR, 0);
vmx->host_state.fs_reload_needed = 1;
}
- savesegment(gs, vmx->host_state.gs_sel);
+ vmx->host_state.gs_sel = kvm_read_gs();
if (!(vmx->host_state.gs_sel & 7))
vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
else {
@@ -882,9 +715,10 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
#endif
#ifdef CONFIG_X86_64
- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
- if (is_long_mode(&vmx->vcpu))
+ if (is_long_mode(&vmx->vcpu)) {
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ }
#endif
for (i = 0; i < vmx->save_nmsrs; ++i)
kvm_set_shared_msr(vmx->guest_msrs[i].index,
@@ -894,32 +728,35 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
static void __vmx_load_host_state(struct vcpu_vmx *vmx)
{
+ unsigned long flags;
+
if (!vmx->host_state.loaded)
return;
++vmx->vcpu.stat.host_state_reload;
vmx->host_state.loaded = 0;
-#ifdef CONFIG_X86_64
- if (is_long_mode(&vmx->vcpu))
- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
-#endif
+ if (vmx->host_state.fs_reload_needed)
+ kvm_load_fs(vmx->host_state.fs_sel);
if (vmx->host_state.gs_ldt_reload_needed) {
kvm_load_ldt(vmx->host_state.ldt_sel);
+ /*
+ * If we have to reload gs, we must take care to
+ * preserve our gs base.
+ */
+ local_irq_save(flags);
+ kvm_load_gs(vmx->host_state.gs_sel);
#ifdef CONFIG_X86_64
- load_gs_index(vmx->host_state.gs_sel);
-#else
- loadsegment(gs, vmx->host_state.gs_sel);
+ wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
#endif
+ local_irq_restore(flags);
}
- if (vmx->host_state.fs_reload_needed)
- loadsegment(fs, vmx->host_state.fs_sel);
reload_tss();
#ifdef CONFIG_X86_64
- wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+ if (is_long_mode(&vmx->vcpu)) {
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+ }
#endif
- if (current_thread_info()->status & TS_USEDFPU)
- clts();
- kvm_load_gdt(&__get_cpu_var(host_gdt));
}
static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -936,47 +773,62 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+ u64 phys_addr = __pa(vmx->vmcs);
+ u64 tsc_this, delta, new_offset;
- if (!vmm_exclusive)
- kvm_cpu_vmxon(phys_addr);
- else if (vcpu->cpu != cpu)
+ if (vcpu->cpu != cpu) {
vcpu_clear(vmx);
+ kvm_migrate_timers(vcpu);
+ set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
+ local_irq_disable();
+ list_add(&vmx->local_vcpus_link,
+ &per_cpu(vcpus_on_cpu, cpu));
+ local_irq_enable();
+ }
if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
+ u8 error;
+
per_cpu(current_vmcs, cpu) = vmx->vmcs;
- vmcs_load(vmx->vmcs);
+ asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
+ : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+ : "cc");
+ if (error)
+ printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
+ vmx->vmcs, phys_addr);
}
if (vcpu->cpu != cpu) {
- struct kvm_desc_ptr *gdt = &__get_cpu_var(host_gdt);
+ struct descriptor_table dt;
unsigned long sysenter_esp;
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
- local_irq_disable();
- list_add(&vmx->local_vcpus_link,
- &per_cpu(vcpus_on_cpu, cpu));
- local_irq_enable();
-
+ vcpu->cpu = cpu;
/*
* Linux uses per-cpu TSS and GDT, so set these when switching
* processors.
*/
vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
- vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */
+ kvm_get_gdt(&dt);
+ vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */
rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+
+ /*
+ * Make sure the time stamp counter is monotonic.
+ */
+ rdtscll(tsc_this);
+ if (tsc_this < vcpu->arch.host_tsc) {
+ delta = vcpu->arch.host_tsc - tsc_this;
+ new_offset = vmcs_read64(TSC_OFFSET) + delta;
+ vmcs_write64(TSC_OFFSET, new_offset);
+ }
}
}
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
__vmx_load_host_state(to_vmx(vcpu));
- if (!vmm_exclusive) {
- __vcpu_clear(to_vmx(vcpu));
- kvm_cpu_vmxoff();
- }
}
static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -1035,9 +887,9 @@ static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
int ret = 0;
if (interruptibility & GUEST_INTR_STATE_STI)
- ret |= KVM_X86_SHADOW_INT_STI;
+ ret |= X86_SHADOW_INT_STI;
if (interruptibility & GUEST_INTR_STATE_MOV_SS)
- ret |= KVM_X86_SHADOW_INT_MOV_SS;
+ ret |= X86_SHADOW_INT_MOV_SS;
return ret & mask;
}
@@ -1049,9 +901,9 @@ static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
- if (mask & KVM_X86_SHADOW_INT_MOV_SS)
+ if (mask & X86_SHADOW_INT_MOV_SS)
interruptibility |= GUEST_INTR_STATE_MOV_SS;
- else if (mask & KVM_X86_SHADOW_INT_STI)
+ if (mask & X86_SHADOW_INT_STI)
interruptibility |= GUEST_INTR_STATE_STI;
if ((interruptibility != interruptibility_old))
@@ -1070,20 +922,8 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
vmx_set_interrupt_shadow(vcpu, 0);
}
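
The interrupt-shadow hunks above translate the VMCS guest interruptibility bits (STI and MOV SS blocking) into KVM's generic shadow flags and back. Below is a compilable sketch of the read direction; the bit encodings are illustrative assumptions, not the real VMX or KVM definitions.

#include <stdint.h>
#include <stdio.h>

/* Illustrative encodings only; the real values come from the VMX
 * interruptibility-state field and KVM's shadow flag definitions. */
#define DEMO_GUEST_INTR_STATE_STI    0x1u
#define DEMO_GUEST_INTR_STATE_MOV_SS 0x2u
#define DEMO_SHADOW_INT_STI          0x1u
#define DEMO_SHADOW_INT_MOV_SS       0x2u

/* Mirrors vmx_get_interrupt_shadow(): map interruptibility bits to
 * KVM's ISA-neutral shadow flags, filtered by the caller's mask. */
static uint32_t demo_get_interrupt_shadow(uint32_t interruptibility, uint32_t mask)
{
        uint32_t ret = 0;

        if (interruptibility & DEMO_GUEST_INTR_STATE_STI)
                ret |= DEMO_SHADOW_INT_STI;
        if (interruptibility & DEMO_GUEST_INTR_STATE_MOV_SS)
                ret |= DEMO_SHADOW_INT_MOV_SS;
        return ret & mask;
}

int main(void)
{
        printf("%#x\n", demo_get_interrupt_shadow(DEMO_GUEST_INTR_STATE_STI, ~0u));
        return 0;
}
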
-static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
-{
- /* Ensure that we clear the HLT state in the VMCS. We don't need to
- * explicitly skip the instruction because if the HLT state is set, then
- * the instruction is already executing and RIP has already been
- * advanced. */
- if (!yield_on_hlt &&
- vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
- vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
-}
-
static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
- bool has_error_code, u32 error_code,
- bool reinject)
+ bool has_error_code, u32 error_code)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 intr_info = nr | INTR_INFO_VALID_MASK;
@@ -1094,8 +934,16 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
}
if (vmx->rmode.vm86_active) {
- if (kvm_inject_realmode_interrupt(vcpu, nr) != EMULATE_DONE)
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ vmx->rmode.irq.pending = true;
+ vmx->rmode.irq.vector = nr;
+ vmx->rmode.irq.rip = kvm_rip_read(vcpu);
+ if (kvm_exception_is_soft(nr))
+ vmx->rmode.irq.rip +=
+ vmx->vcpu.arch.event_exit_inst_len;
+ intr_info |= INTR_TYPE_SOFT_INTR;
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+ kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
return;
}
@@ -1107,7 +955,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
intr_info |= INTR_TYPE_HARD_EXCEPTION;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
- vmx_clear_hlt(vcpu);
}
static bool vmx_rdtscp_supported(void)
@@ -1154,10 +1001,10 @@ static void setup_msrs(struct vcpu_vmx *vmx)
if (index >= 0 && vmx->rdtscp_enabled)
move_msr_up(vmx, index, save_nmsrs++);
/*
- * MSR_STAR is only needed on long mode guests, and only
+ * MSR_K6_STAR is only needed on long mode guests, and only
* if efer.sce is enabled.
*/
- index = __find_msr_index(vmx, MSR_STAR);
+ index = __find_msr_index(vmx, MSR_K6_STAR);
if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
move_msr_up(vmx, index, save_nmsrs++);
}
@@ -1192,17 +1039,12 @@ static u64 guest_read_tsc(void)
}
/*
- * writes 'offset' into guest's timestamp counter offset register
+ * writes 'guest_tsc' into guest's timestamp counter "register"
+ * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
*/
-static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
-{
- vmcs_write64(TSC_OFFSET, offset);
-}
-
-static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
{
- u64 offset = vmcs_read64(TSC_OFFSET);
- vmcs_write64(TSC_OFFSET, offset + adjustment);
+ vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
}
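
guest_write_tsc() applies the identity spelled out in its comment, guest_tsc = host_tsc + TSC_OFFSET, so programming a guest TSC means writing offset = guest_tsc - host_tsc; the vmx_vcpu_load() hunk further up likewise adds (old_host_tsc - new_host_tsc) to the offset when the destination CPU's TSC is behind, so the guest never sees time go backwards. A small worked example with hypothetical counter values:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* guest_tsc = host_tsc + offset  =>  offset = guest_tsc - host_tsc */
        uint64_t host_tsc  = 1000000;   /* hypothetical RDTSC reading */
        uint64_t guest_tsc = 250000;    /* value the guest should see */
        uint64_t offset    = guest_tsc - host_tsc;    /* wraps mod 2^64 */

        printf("offset    = %" PRIu64 "\n", offset);
        printf("guest_tsc = %" PRIu64 "\n", host_tsc + offset);

        /* Migration to a CPU whose TSC is behind the previous one: add
         * the difference so the guest TSC does not jump backwards. */
        uint64_t old_host_tsc = 1000000, new_host_tsc = 900000;

        if (new_host_tsc < old_host_tsc)
                offset += old_host_tsc - new_host_tsc;
        printf("guest_tsc after migration = %" PRIu64 "\n",
               new_host_tsc + offset);
        return 0;
}
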
/*
@@ -1275,6 +1117,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct shared_msr_entry *msr;
+ u64 host_tsc;
int ret = 0;
switch (msr_index) {
@@ -1304,7 +1147,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
case MSR_IA32_TSC:
- kvm_write_tsc(vcpu, data);
+ rdtscll(host_tsc);
+ guest_write_tsc(data, host_tsc);
break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
@@ -1373,58 +1217,37 @@ static __init int vmx_disabled_by_bios(void)
u64 msr;
rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
- if (msr & FEATURE_CONTROL_LOCKED) {
- if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
- && kvm_tboot_enabled())
- return 1;
- if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
- && !kvm_tboot_enabled()) {
- printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
- " activate TXT before enabling KVM\n");
- return 1;
- }
- }
-
- return 0;
+ return (msr & (FEATURE_CONTROL_LOCKED |
+ FEATURE_CONTROL_VMXON_ENABLED))
+ == FEATURE_CONTROL_LOCKED;
/* locked but not enabled */
}
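
The restored vmx_disabled_by_bios() reports VMX as unusable exactly when IA32_FEATURE_CONTROL is locked with the VMXON-enable bit clear; if the MSR is still unlocked, hardware_enable() below sets and locks the bits itself. A compilable sketch of that predicate, using assumed bit positions rather than values quoted from the headers:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed bit positions for illustration; see the IA32_FEATURE_CONTROL
 * definitions in the SDM / msr headers for the authoritative values. */
#define DEMO_FC_LOCKED        (1ull << 0)
#define DEMO_FC_VMXON_ENABLED (1ull << 2)

/* "locked but not enabled": the BIOS wrote and locked the MSR without
 * allowing VMXON, so VMX can no longer be turned on. */
static bool demo_vmx_disabled_by_bios(uint64_t msr)
{
        return (msr & (DEMO_FC_LOCKED | DEMO_FC_VMXON_ENABLED)) == DEMO_FC_LOCKED;
}

int main(void)
{
        printf("%d\n", demo_vmx_disabled_by_bios(DEMO_FC_LOCKED));  /* 1 */
        printf("%d\n", demo_vmx_disabled_by_bios(0));               /* 0: unlocked */
        printf("%d\n", demo_vmx_disabled_by_bios(DEMO_FC_LOCKED |
                                                 DEMO_FC_VMXON_ENABLED)); /* 0 */
        return 0;
}
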
-static void kvm_cpu_vmxon(u64 addr)
-{
- asm volatile (ASM_VMX_VMXON_RAX
- : : "a"(&addr), "m"(addr)
- : "memory", "cc");
-}
-
static int hardware_enable(void *garbage)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
- u64 old, test_bits;
+ u64 old;
if (read_cr4() & X86_CR4_VMXE)
return -EBUSY;
INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
-
- test_bits = FEATURE_CONTROL_LOCKED;
- test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
- if (kvm_tboot_enabled())
- test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
-
- if ((old & test_bits) != test_bits) {
+ if ((old & (FEATURE_CONTROL_LOCKED |
+ FEATURE_CONTROL_VMXON_ENABLED))
+ != (FEATURE_CONTROL_LOCKED |
+ FEATURE_CONTROL_VMXON_ENABLED))
/* enable and lock */
- wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
- }
+ wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
+ FEATURE_CONTROL_LOCKED |
+ FEATURE_CONTROL_VMXON_ENABLED);
write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
+ asm volatile (ASM_VMX_VMXON_RAX
+ : : "a"(&phys_addr), "m"(phys_addr)
+ : "memory", "cc");
- if (vmm_exclusive) {
- kvm_cpu_vmxon(phys_addr);
- ept_sync_global();
- }
-
- kvm_store_gdt(&__get_cpu_var(host_gdt));
+ ept_sync_global();
return 0;
}
@@ -1446,15 +1269,13 @@ static void vmclear_local_vcpus(void)
static void kvm_cpu_vmxoff(void)
{
asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
+ write_cr4(read_cr4() & ~X86_CR4_VMXE);
}
static void hardware_disable(void *garbage)
{
- if (vmm_exclusive) {
- vmclear_local_vcpus();
- kvm_cpu_vmxoff();
- }
- write_cr4(read_cr4() & ~X86_CR4_VMXE);
+ vmclear_local_vcpus();
+ kvm_cpu_vmxoff();
}
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
@@ -1476,14 +1297,6 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
return 0;
}
-static __init bool allow_1_setting(u32 msr, u32 ctl)
-{
- u32 vmx_msr_low, vmx_msr_high;
-
- rdmsr(msr, vmx_msr_low, vmx_msr_high);
- return vmx_msr_high & ctl;
-}
-
static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
{
u32 vmx_msr_low, vmx_msr_high;
@@ -1500,7 +1313,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
&_pin_based_exec_control) < 0)
return -EIO;
- min =
+ min = CPU_BASED_HLT_EXITING |
#ifdef CONFIG_X86_64
CPU_BASED_CR8_LOAD_EXITING |
CPU_BASED_CR8_STORE_EXITING |
@@ -1513,10 +1326,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
CPU_BASED_MWAIT_EXITING |
CPU_BASED_MONITOR_EXITING |
CPU_BASED_INVLPG_EXITING;
-
- if (yield_on_hlt)
- min |= CPU_BASED_HLT_EXITING;
-
opt = CPU_BASED_TPR_SHADOW |
CPU_BASED_USE_MSR_BITMAPS |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -1598,12 +1407,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
vmcs_conf->vmexit_ctrl = _vmexit_control;
vmcs_conf->vmentry_ctrl = _vmentry_control;
- cpu_has_load_ia32_efer =
- allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
- VM_ENTRY_LOAD_IA32_EFER)
- && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
- VM_EXIT_LOAD_IA32_EFER);
-
return 0;
}
@@ -1671,8 +1474,7 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_vpid())
enable_vpid = 0;
- if (!cpu_has_vmx_ept() ||
- !cpu_has_vmx_ept_4levels()) {
+ if (!cpu_has_vmx_ept()) {
enable_ept = 0;
enable_unrestricted_guest = 0;
}
@@ -1760,8 +1562,8 @@ static gva_t rmode_tss_base(struct kvm *kvm)
struct kvm_memslots *slots;
gfn_t base_gfn;
- slots = kvm_memslots(kvm);
- base_gfn = slots->memslots[0].base_gfn +
+ slots = rcu_dereference(kvm->memslots);
+ base_gfn = kvm->memslots->memslots[0].base_gfn +
kvm->memslots->memslots[0].npages - 3;
return base_gfn << PAGE_SHIFT;
}
@@ -1777,13 +1579,9 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
save->limit = vmcs_read32(sf->limit);
save->ar = vmcs_read32(sf->ar_bytes);
vmcs_write16(sf->selector, save->base >> 4);
- vmcs_write32(sf->base, save->base & 0xffff0);
+ vmcs_write32(sf->base, save->base & 0xfffff);
vmcs_write32(sf->limit, 0xffff);
vmcs_write32(sf->ar_bytes, 0xf3);
- if (save->base & 0xf)
- printk_once(KERN_WARNING "kvm: segment base is not paragraph"
- " aligned when entering protected mode (seg=%d)",
- seg);
}
static void enter_rmode(struct kvm_vcpu *vcpu)
@@ -1881,27 +1679,26 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
(guest_tr_ar & ~AR_TYPE_MASK)
| AR_TYPE_BUSY_64_TSS);
}
- vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
+ vcpu->arch.efer |= EFER_LMA;
+ vmx_set_efer(vcpu, vcpu->arch.efer);
}
static void exit_lmode(struct kvm_vcpu *vcpu)
{
+ vcpu->arch.efer &= ~EFER_LMA;
+
vmcs_write32(VM_ENTRY_CONTROLS,
vmcs_read32(VM_ENTRY_CONTROLS)
& ~VM_ENTRY_IA32E_MODE);
- vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
}
#endif
static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
{
- vpid_sync_context(to_vmx(vcpu));
- if (enable_ept) {
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return;
+ vpid_sync_vcpu_all(to_vmx(vcpu));
+ if (enable_ept)
ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
- }
}
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
@@ -1912,13 +1709,6 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
}
-static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
-{
- if (enable_ept && is_paging(vcpu))
- vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
-}
-
static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
@@ -1934,20 +1724,20 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
return;
if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
- vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]);
- vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]);
- vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]);
- vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]);
+ vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]);
+ vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]);
+ vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]);
+ vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]);
}
}
static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
{
if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
- vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
- vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
- vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
- vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
+ vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
+ vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
+ vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
+ vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
}
__set_bit(VCPU_EXREG_PDPTR,
@@ -1962,7 +1752,6 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
unsigned long cr0,
struct kvm_vcpu *vcpu)
{
- vmx_decache_cr3(vcpu);
if (!(cr0 & X86_CR0_PG)) {
/* From paging/starting to nonpaging */
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -2043,7 +1832,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
if (enable_ept) {
eptp = construct_eptp(cr3);
vmcs_write64(EPT_POINTER, eptp);
- guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) :
+ guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
vcpu->kvm->arch.ept_identity_map_addr;
ept_load_pdptrs(vcpu);
}
@@ -2186,28 +1975,28 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
*l = (ar >> 13) & 1;
}
-static void vmx_get_idt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt)
+static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
{
- dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
- dt->address = vmcs_readl(GUEST_IDTR_BASE);
+ dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
+ dt->base = vmcs_readl(GUEST_IDTR_BASE);
}
-static void vmx_set_idt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt)
+static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
{
- vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
- vmcs_writel(GUEST_IDTR_BASE, dt->address);
+ vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
+ vmcs_writel(GUEST_IDTR_BASE, dt->base);
}
-static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt)
+static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
{
- dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
- dt->address = vmcs_readl(GUEST_GDTR_BASE);
+ dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
+ dt->base = vmcs_readl(GUEST_GDTR_BASE);
}
-static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt)
+static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
{
- vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
- vmcs_writel(GUEST_GDTR_BASE, dt->address);
+ vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
+ vmcs_writel(GUEST_GDTR_BASE, dt->base);
}
static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
@@ -2548,16 +2337,6 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
spin_unlock(&vmx_vpid_lock);
}
-static void free_vpid(struct vcpu_vmx *vmx)
-{
- if (!enable_vpid)
- return;
- spin_lock(&vmx_vpid_lock);
- if (vmx->vpid != 0)
- __clear_bit(vmx->vpid, vmx_vpid_bitmap);
- spin_unlock(&vmx_vpid_lock);
-}
-
static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
{
int f = sizeof(unsigned long);
@@ -2594,9 +2373,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
{
u32 host_sysenter_cs, msr_low, msr_high;
u32 junk;
- u64 host_pat;
+ u64 host_pat, tsc_this, tsc_base;
unsigned long a;
- struct kvm_desc_ptr dt;
+ struct descriptor_table dt;
int i;
unsigned long kvm_vmx_return;
u32 exec_control;
@@ -2655,15 +2434,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
- vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */
+ vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
- vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
- vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
+ vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */
+ vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */
vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
#ifdef CONFIG_X86_64
rdmsrl(MSR_FS_BASE, a);
@@ -2677,16 +2456,14 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
- kvm_native_store_idt(&dt);
- vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
+ kvm_get_idt(&dt);
+ vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
- vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
- vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
@@ -2735,34 +2512,33 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
- kvm_write_tsc(&vmx->vcpu, 0);
+ tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc;
+ rdtscll(tsc_this);
+ if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc)
+ tsc_base = tsc_this;
+
+ guest_write_tsc(0, tsc_base);
return 0;
}
static int init_rmode(struct kvm *kvm)
{
- int idx, ret = 0;
-
- idx = srcu_read_lock(&kvm->srcu);
if (!init_rmode_tss(kvm))
- goto exit;
+ return 0;
if (!init_rmode_identity_map(kvm))
- goto exit;
-
- ret = 1;
-exit:
- srcu_read_unlock(&kvm->srcu, idx);
- return ret;
+ return 0;
+ return 1;
}
static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 msr;
- int ret;
+ int ret, idx;
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
+ idx = kvm_srcu_read_lock(&vcpu->kvm->srcu);
if (!init_rmode(vmx->vcpu.kvm)) {
ret = -ENOMEM;
goto out;
@@ -2779,9 +2555,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
msr |= MSR_IA32_APICBASE_BSP;
kvm_set_apic_base(&vmx->vcpu, msr);
- ret = fx_init(&vmx->vcpu);
- if (ret != 0)
- goto out;
+ fx_init(&vmx->vcpu);
seg_setup(VCPU_SREG_CS);
/*
@@ -2831,7 +2605,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_IDTR_BASE, 0);
vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
- vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+ vmcs_write32(GUEST_ACTIVITY_STATE, 0);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
@@ -2864,7 +2638,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx_fpu_activate(&vmx->vcpu);
update_exception_bitmap(&vmx->vcpu);
- vpid_sync_context(vmx);
+ vpid_sync_vcpu_all(vmx);
ret = 0;
@@ -2872,6 +2646,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx->emulation_required = 0;
out:
+ kvm_srcu_read_unlock(&vcpu->kvm->srcu, idx);
return ret;
}
@@ -2893,10 +2668,6 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
return;
}
- if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
- enable_irq_window(vcpu);
- return;
- }
cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
@@ -2912,8 +2683,16 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
++vcpu->stat.irq_injections;
if (vmx->rmode.vm86_active) {
- if (kvm_inject_realmode_interrupt(vcpu, irq) != EMULATE_DONE)
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ vmx->rmode.irq.pending = true;
+ vmx->rmode.irq.vector = irq;
+ vmx->rmode.irq.rip = kvm_rip_read(vcpu);
+ if (vcpu->arch.interrupt.soft)
+ vmx->rmode.irq.rip +=
+ vmx->vcpu.arch.event_exit_inst_len;
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+ kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
return;
}
intr = irq | INTR_INFO_VALID_MASK;
@@ -2924,7 +2703,6 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
} else
intr |= INTR_TYPE_EXT_INTR;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
- vmx_clear_hlt(vcpu);
}
static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -2946,13 +2724,18 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
++vcpu->stat.nmi_injections;
if (vmx->rmode.vm86_active) {
- if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE)
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ vmx->rmode.irq.pending = true;
+ vmx->rmode.irq.vector = NMI_VECTOR;
+ vmx->rmode.irq.rip = kvm_rip_read(vcpu);
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ NMI_VECTOR | INTR_TYPE_SOFT_INTR |
+ INTR_INFO_VALID_MASK);
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+ kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
return;
}
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
- vmx_clear_hlt(vcpu);
}
static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -2961,15 +2744,16 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
return 0;
return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
- | GUEST_INTR_STATE_NMI));
+ (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI));
}
static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
{
if (!cpu_has_virtual_nmis())
return to_vmx(vcpu)->soft_vnmi_blocked;
- return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
+ else
+ return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ GUEST_INTR_STATE_NMI);
}
static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
@@ -3023,7 +2807,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
* Cause the #SS fault with 0 error code in VM86 mode.
*/
if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
- if (emulate_instruction(vcpu, 0) == EMULATE_DONE)
+ if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
return 1;
/*
* Forward all other exceptions that are valid in real mode.
@@ -3120,7 +2904,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
}
if (is_invalid_opcode(intr_info)) {
- er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
+ er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD);
if (er != EMULATE_DONE)
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
@@ -3139,7 +2923,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
if (kvm_event_needs_reinjection(vcpu))
kvm_mmu_unprotect_page_virt(vcpu, cr2);
- return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
+ return kvm_mmu_page_fault(vcpu, cr2, error_code);
}
if (vmx->rmode.vm86_active &&
@@ -3204,20 +2988,22 @@ static int handle_io(struct kvm_vcpu *vcpu)
int size, in, string;
unsigned port;
+ ++vcpu->stat.io_exits;
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
string = (exit_qualification & 16) != 0;
- in = (exit_qualification & 8) != 0;
- ++vcpu->stat.io_exits;
-
- if (string || in)
- return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+ if (string) {
+ if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
+ return 0;
+ return 1;
+ }
- port = exit_qualification >> 16;
size = (exit_qualification & 7) + 1;
- skip_emulated_instruction(vcpu);
+ in = (exit_qualification & 8) != 0;
+ port = exit_qualification >> 16;
- return kvm_fast_pio_out(vcpu, size, port);
+ skip_emulated_instruction(vcpu);
+ return kvm_emulate_pio(vcpu, in, size, port);
}
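The reverted handle_io() above decodes the VM-exit qualification by hand; a standalone sketch of that decode, using exactly the field positions the diff itself relies on (size = (q & 7) + 1, direction = bit 3, string = bit 4, port = bits 31:16):

        #include <stdint.h>
        #include <stdio.h>

        struct io_exit {
                unsigned size;     /* access width in bytes: 1, 2 or 4 */
                int in;            /* non-zero for IN, zero for OUT */
                int string;        /* non-zero for INS/OUTS */
                unsigned port;     /* I/O port number */
        };

        static struct io_exit decode_io_qualification(uint64_t q)
        {
                struct io_exit e = {
                        .size   = (unsigned)(q & 7) + 1,
                        .in     = (q & 8) != 0,
                        .string = (q & 16) != 0,
                        .port   = (unsigned)(q >> 16) & 0xffff,
                };
                return e;
        }

        int main(void)
        {
                /* e.g. "out %al, $0x80": a 1-byte OUT to port 0x80 */
                struct io_exit e = decode_io_qualification(0x00800000ULL);
                printf("port 0x%x size %u in=%d string=%d\n",
                       e.port, e.size, e.in, e.string);
                return 0;
        }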
static void
@@ -3236,7 +3022,6 @@ static int handle_cr(struct kvm_vcpu *vcpu)
unsigned long exit_qualification, val;
int cr;
int reg;
- int err;
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
cr = exit_qualification & 15;
@@ -3247,22 +3032,22 @@ static int handle_cr(struct kvm_vcpu *vcpu)
trace_kvm_cr_write(cr, val);
switch (cr) {
case 0:
- err = kvm_set_cr0(vcpu, val);
- kvm_complete_insn_gp(vcpu, err);
+ kvm_set_cr0(vcpu, val);
+ skip_emulated_instruction(vcpu);
return 1;
case 3:
- err = kvm_set_cr3(vcpu, val);
- kvm_complete_insn_gp(vcpu, err);
+ kvm_set_cr3(vcpu, val);
+ skip_emulated_instruction(vcpu);
return 1;
case 4:
- err = kvm_set_cr4(vcpu, val);
- kvm_complete_insn_gp(vcpu, err);
+ kvm_set_cr4(vcpu, val);
+ skip_emulated_instruction(vcpu);
return 1;
case 8: {
u8 cr8_prev = kvm_get_cr8(vcpu);
u8 cr8 = kvm_register_read(vcpu, reg);
- err = kvm_set_cr8(vcpu, cr8);
- kvm_complete_insn_gp(vcpu, err);
+ kvm_set_cr8(vcpu, cr8);
+ skip_emulated_instruction(vcpu);
if (irqchip_in_kernel(vcpu->kvm))
return 1;
if (cr8_prev <= cr8)
@@ -3281,9 +3066,8 @@ static int handle_cr(struct kvm_vcpu *vcpu)
case 1: /*mov from cr*/
switch (cr) {
case 3:
- val = kvm_read_cr3(vcpu);
- kvm_register_write(vcpu, reg, val);
- trace_kvm_cr_read(cr, val);
+ kvm_register_write(vcpu, reg, vcpu->arch.cr3);
+ trace_kvm_cr_read(cr, vcpu->arch.cr3);
skip_emulated_instruction(vcpu);
return 1;
case 8:
@@ -3310,9 +3094,19 @@ static int handle_cr(struct kvm_vcpu *vcpu)
return 0;
}
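A sketch of the CR-access exit-qualification decode that handle_cr() above is built around. The CR number (bits 3:0) and access type (bits 5:4: 0 = MOV to CR, 1 = MOV from CR, 2 = CLTS, 3 = LMSW) follow the usual layout visible in the switch statements; the GPR field in bits 11:8 is an assumption here, since the diff does not show that part of the function.

        #include <stdint.h>
        #include <stdio.h>

        static void decode_cr_qualification(uint64_t q)
        {
                unsigned cr   = q & 0xf;          /* which control register */
                unsigned type = (q >> 4) & 0x3;   /* access type */
                unsigned gpr  = (q >> 8) & 0xf;   /* assumed GPR field */

                static const char *types[] = {
                        "mov to cr", "mov from cr", "clts", "lmsw"
                };
                printf("cr%u, %s, gpr %u\n", cr, types[type], gpr);
        }

        int main(void)
        {
                decode_cr_qualification(0x314); /* mov from cr4 into gpr 3 */
                return 0;
        }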
+static int check_dr_alias(struct kvm_vcpu *vcpu)
+{
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return -1;
+ }
+ return 0;
+}
+
static int handle_dr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
+ unsigned long val;
int dr, reg;
/* Do not handle if the CPL > 0, will trigger GP on re-entry */
@@ -3347,20 +3141,67 @@ static int handle_dr(struct kvm_vcpu *vcpu)
dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
reg = DEBUG_REG_ACCESS_REG(exit_qualification);
if (exit_qualification & TYPE_MOV_FROM_DR) {
- unsigned long val;
- if (!kvm_get_dr(vcpu, dr, &val))
- kvm_register_write(vcpu, reg, val);
- } else
- kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]);
+ switch (dr) {
+ case 0 ... 3:
+ val = vcpu->arch.db[dr];
+ break;
+ case 4:
+ if (check_dr_alias(vcpu) < 0)
+ return 1;
+ /* fall through */
+ case 6:
+ val = vcpu->arch.dr6;
+ break;
+ case 5:
+ if (check_dr_alias(vcpu) < 0)
+ return 1;
+ /* fall through */
+ default: /* 7 */
+ val = vcpu->arch.dr7;
+ break;
+ }
+ kvm_register_write(vcpu, reg, val);
+ } else {
+ val = vcpu->arch.regs[reg];
+ switch (dr) {
+ case 0 ... 3:
+ vcpu->arch.db[dr] = val;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+ vcpu->arch.eff_db[dr] = val;
+ break;
+ case 4:
+ if (check_dr_alias(vcpu) < 0)
+ return 1;
+ /* fall through */
+ case 6:
+ if (val & 0xffffffff00000000ULL) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
+ break;
+ case 5:
+ if (check_dr_alias(vcpu) < 0)
+ return 1;
+ /* fall through */
+ default: /* 7 */
+ if (val & 0xffffffff00000000ULL) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
+ vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
+ vcpu->arch.switch_db_regs =
+ (val & DR7_BP_EN_MASK);
+ }
+ break;
+ }
+ }
skip_emulated_instruction(vcpu);
return 1;
}
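check_dr_alias() and the switch above encode the DR4/DR5 aliasing rule: with CR4.DE clear they behave as DR6/DR7, with CR4.DE set the access raises #UD. A small sketch of just that mapping, independent of the vcpu structures:

        #include <stdbool.h>
        #include <stdio.h>

        /* Returns the effective debug register, or -1 when #UD should be raised. */
        static int effective_dr(int dr, bool cr4_de)
        {
                if (dr == 4 || dr == 5) {
                        if (cr4_de)
                                return -1;     /* #UD */
                        return dr + 2;         /* alias: DR4 -> DR6, DR5 -> DR7 */
                }
                return dr;
        }

        int main(void)
        {
                printf("%d %d %d\n",
                       effective_dr(4, false),   /* 6  */
                       effective_dr(5, false),   /* 7  */
                       effective_dr(5, true));   /* -1 */
                return 0;
        }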
-static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
-{
- vmcs_writel(GUEST_DR7, val);
-}
-
static int handle_cpuid(struct kvm_vcpu *vcpu)
{
kvm_emulate_cpuid(vcpu);
@@ -3406,7 +3247,6 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
{
- kvm_make_request(KVM_REQ_EVENT, vcpu);
return 1;
}
@@ -3419,8 +3259,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-
++vcpu->stat.irq_window_exits;
/*
@@ -3455,11 +3293,6 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu)
return 1;
}
-static int handle_invd(struct kvm_vcpu *vcpu)
-{
- return emulate_instruction(vcpu, 0) == EMULATE_DONE;
-}
-
static int handle_invlpg(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3472,31 +3305,34 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
static int handle_wbinvd(struct kvm_vcpu *vcpu)
{
skip_emulated_instruction(vcpu);
- kvm_emulate_wbinvd(vcpu);
+ /* TODO: Add support for VT-d/pass-through device */
return 1;
}
-static int handle_xsetbv(struct kvm_vcpu *vcpu)
+static int handle_apic_access(struct kvm_vcpu *vcpu)
{
- u64 new_bv = kvm_read_edx_eax(vcpu);
- u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ unsigned long exit_qualification;
+ enum emulation_result er;
+ unsigned long offset;
- if (kvm_set_xcr(vcpu, index, new_bv) == 0)
- skip_emulated_instruction(vcpu);
- return 1;
-}
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ offset = exit_qualification & 0xffful;
-static int handle_apic_access(struct kvm_vcpu *vcpu)
-{
- return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+ er = emulate_instruction(vcpu, 0, 0, 0);
+
+ if (er != EMULATE_DONE) {
+ printk(KERN_ERR
+ "Fail to handle apic access vmexit! Offset is 0x%lx\n",
+ offset);
+ return -ENOEXEC;
+ }
+ return 1;
}
static int handle_task_switch(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long exit_qualification;
- bool has_error_code = false;
- u32 error_code = 0;
u16 tss_selector;
int reason, type, idt_v;
@@ -3519,13 +3355,6 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
kvm_clear_interrupt_queue(vcpu);
break;
case INTR_TYPE_HARD_EXCEPTION:
- if (vmx->idt_vectoring_info &
- VECTORING_INFO_DELIVER_CODE_MASK) {
- has_error_code = true;
- error_code =
- vmcs_read32(IDT_VECTORING_ERROR_CODE);
- }
- /* fall through */
case INTR_TYPE_SOFT_EXCEPTION:
kvm_clear_exception_queue(vcpu);
break;
@@ -3540,13 +3369,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
type != INTR_TYPE_NMI_INTR))
skip_emulated_instruction(vcpu);
- if (kvm_task_switch(vcpu, tss_selector, reason,
- has_error_code, error_code) == EMULATE_FAIL) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ if (!kvm_task_switch(vcpu, tss_selector, reason))
return 0;
- }
/* clear all local breakpoint enable flags */
vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
@@ -3587,7 +3411,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
trace_kvm_page_fault(gpa, exit_qualification);
- return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
+ return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
}
static u64 ept_rsvd_mask(u64 spte, int level)
@@ -3682,7 +3506,6 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
++vcpu->stat.nmi_window_exits;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
return 1;
}
@@ -3692,26 +3515,22 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
enum emulation_result err = EMULATE_DONE;
int ret = 1;
- u32 cpu_exec_ctrl;
- bool intr_window_requested;
-
- cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
while (!guest_state_valid(vcpu)) {
- if (intr_window_requested
- && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
- return handle_interrupt_window(&vmx->vcpu);
-
- err = emulate_instruction(vcpu, 0);
+ err = emulate_instruction(vcpu, 0, 0, 0);
if (err == EMULATE_DO_MMIO) {
ret = 0;
goto out;
}
- if (err != EMULATE_DONE)
- return 0;
+ if (err != EMULATE_DONE) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ ret = 0;
+ goto out;
+ }
if (signal_pending(current))
goto out;
@@ -3760,7 +3579,6 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_MSR_WRITE] = handle_wrmsr,
[EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
[EXIT_REASON_HLT] = handle_halt,
- [EXIT_REASON_INVD] = handle_invd,
[EXIT_REASON_INVLPG] = handle_invlpg,
[EXIT_REASON_VMCALL] = handle_vmcall,
[EXIT_REASON_VMCLEAR] = handle_vmx_insn,
@@ -3775,7 +3593,6 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
[EXIT_REASON_WBINVD] = handle_wbinvd,
- [EXIT_REASON_XSETBV] = handle_xsetbv,
[EXIT_REASON_TASK_SWITCH] = handle_task_switch,
[EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
@@ -3788,12 +3605,6 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
static const int kvm_vmx_max_exit_handlers =
ARRAY_SIZE(kvm_vmx_exit_handlers);
-static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
-{
- *info1 = vmcs_readl(EXIT_QUALIFICATION);
- *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
-}
-
/*
* The guest has exited. See if we can fix it or if we need userspace
* assistance.
@@ -3804,18 +3615,16 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
u32 exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
- trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
+ trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
/* If guest state is invalid, start emulating */
if (vmx->emulation_required && emulate_invalid_guest_state)
return handle_invalid_guest_state(vcpu);
- if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
- vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
- vcpu->run->fail_entry.hardware_entry_failure_reason
- = exit_reason;
- return 0;
- }
+ /* Accesses to CR3 don't cause a VM exit in paging mode, so we need
+ * to sync with the guest's real CR3. */

+ if (enable_ept && is_paging(vcpu))
+ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
if (unlikely(vmx->fail)) {
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -3870,9 +3679,18 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
vmcs_write32(TPR_THRESHOLD, irr);
}
-static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
+static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
{
- u32 exit_intr_info = vmx->exit_intr_info;
+ u32 exit_intr_info;
+ u32 idt_vectoring_info = vmx->idt_vectoring_info;
+ bool unblock_nmi;
+ u8 vector;
+ int type;
+ bool idtv_info_valid;
+
+ exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
/* Handle machine checks before interrupts are enabled */
if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
@@ -3882,21 +3700,10 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
/* We need to handle NMIs before interrupts are enabled */
if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
- (exit_intr_info & INTR_INFO_VALID_MASK)) {
- kvm_before_handle_nmi(&vmx->vcpu);
+ (exit_intr_info & INTR_INFO_VALID_MASK))
asm("int $2");
- kvm_after_handle_nmi(&vmx->vcpu);
- }
-}
-
-static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
-{
- u32 exit_intr_info = vmx->exit_intr_info;
- bool unblock_nmi;
- u8 vector;
- bool idtv_info_valid;
- idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+ idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
if (cpu_has_virtual_nmis()) {
unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
@@ -3918,18 +3725,6 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
} else if (unlikely(vmx->soft_vnmi_blocked))
vmx->vnmi_blocked_time +=
ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
-}
-
-static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
- u32 idt_vectoring_info,
- int instr_len_field,
- int error_code_field)
-{
- u8 vector;
- int type;
- bool idtv_info_valid;
-
- idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
vmx->vcpu.arch.nmi_injected = false;
kvm_clear_exception_queue(&vmx->vcpu);
@@ -3938,8 +3733,6 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
if (!idtv_info_valid)
return;
- kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
-
vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
@@ -3956,18 +3749,18 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
break;
case INTR_TYPE_SOFT_EXCEPTION:
vmx->vcpu.arch.event_exit_inst_len =
- vmcs_read32(instr_len_field);
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
/* fall through */
case INTR_TYPE_HARD_EXCEPTION:
if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
- u32 err = vmcs_read32(error_code_field);
+ u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE);
kvm_queue_exception_e(&vmx->vcpu, vector, err);
} else
kvm_queue_exception(&vmx->vcpu, vector);
break;
case INTR_TYPE_SOFT_INTR:
vmx->vcpu.arch.event_exit_inst_len =
- vmcs_read32(instr_len_field);
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
/* fall through */
case INTR_TYPE_EXT_INTR:
kvm_queue_interrupt(&vmx->vcpu, vector,
@@ -3978,21 +3771,27 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
}
}
-static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
-{
- __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
- VM_EXIT_INSTRUCTION_LEN,
- IDT_VECTORING_ERROR_CODE);
-}
-
-static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
+/*
+ * Failure to inject an interrupt should give us the information
+ * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs
+ * when fetching the interrupt redirection bitmap in the real-mode
+ * tss, this doesn't happen. So we do it ourselves.
+ */
+static void fixup_rmode_irq(struct vcpu_vmx *vmx)
{
- __vmx_complete_interrupts(to_vmx(vcpu),
- vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
- VM_ENTRY_INSTRUCTION_LEN,
- VM_ENTRY_EXCEPTION_ERROR_CODE);
-
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
+ vmx->rmode.irq.pending = 0;
+ if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip)
+ return;
+ kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip);
+ if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
+ vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
+ vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
+ return;
+ }
+ vmx->idt_vectoring_info =
+ VECTORING_INFO_VALID_MASK
+ | INTR_TYPE_EXT_INTR
+ | vmx->rmode.irq.vector;
}
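fixup_rmode_irq() above rebuilds IDT_VECTORING_INFO_FIELD by hand from the pending real-mode vector. A sketch of that field packing; the bit positions (vector in bits 7:0, event type in bits 10:8, valid in bit 31) are assumptions from the customary VMX interruption-information layout rather than values taken from this tree's vmx.h.

        #include <stdint.h>
        #include <stdio.h>

        #define INTR_INFO_VALID    (1u << 31)   /* assumed bit position */
        #define INTR_TYPE_EXT_INTR (0u << 8)    /* external interrupt */
        #define INTR_TYPE_NMI      (2u << 8)

        static uint32_t pack_intr_info(uint8_t vector, uint32_t type)
        {
                return INTR_INFO_VALID | type | vector;
        }

        int main(void)
        {
                /* A pending external interrupt on vector 0x20 (IRQ0 after remap). */
                printf("0x%08x\n", pack_intr_info(0x20, INTR_TYPE_EXT_INTR));
                return 0;
        }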
#ifdef CONFIG_X86_64
@@ -4029,6 +3828,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
+ /*
+ * Loading guest fpu may have cleared host cr0.ts
+ */
+ vmcs_writel(HOST_CR0, read_cr0());
+
asm(
/* Store host registers */
"push %%"R"dx; push %%"R"bp;"
@@ -4119,27 +3923,23 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
#endif
[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
: "cc", "memory"
- , R"ax", R"bx", R"di", R"si"
+ , R"bx", R"di", R"si"
#ifdef CONFIG_X86_64
, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
#endif
);
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
- | (1 << VCPU_EXREG_PDPTR)
- | (1 << VCPU_EXREG_CR3));
+ | (1 << VCPU_EXREG_PDPTR));
vcpu->arch.regs_dirty = 0;
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+ if (vmx->rmode.irq.pending)
+ fixup_rmode_irq(vmx);
asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
vmx->launched = 1;
- vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
- vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
- vmx_complete_atomic_exit(vmx);
- vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
}
@@ -4161,26 +3961,16 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- free_vpid(vmx);
+ spin_lock(&vmx_vpid_lock);
+ if (vmx->vpid != 0)
+ __clear_bit(vmx->vpid, vmx_vpid_bitmap);
+ spin_unlock(&vmx_vpid_lock);
vmx_free_vmcs(vcpu);
kfree(vmx->guest_msrs);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, vmx);
}
-static inline void vmcs_init(struct vmcs *vmcs)
-{
- u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
-
- if (!vmm_exclusive)
- kvm_cpu_vmxon(phys_addr);
-
- vmcs_clear(vmcs);
-
- if (!vmm_exclusive)
- kvm_cpu_vmxoff();
-}
-
static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
{
int err;
@@ -4206,11 +3996,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (!vmx->vmcs)
goto free_msrs;
- vmcs_init(vmx->vmcs);
+ vmcs_clear(vmx->vmcs);
cpu = get_cpu();
vmx_vcpu_load(&vmx->vcpu, cpu);
- vmx->vcpu.cpu = cpu;
err = vmx_vcpu_setup(vmx);
vmx_vcpu_put(&vmx->vcpu);
put_cpu();
@@ -4237,7 +4026,6 @@ free_msrs:
uninit_vcpu:
kvm_vcpu_uninit(&vmx->vcpu);
free_vcpu:
- free_vpid(vmx);
kmem_cache_free(kvm_vcpu_cache, vmx);
return ERR_PTR(err);
}
@@ -4341,6 +4129,11 @@ static int vmx_get_lpage_level(void)
return PT_PDPE_LEVEL;
}
+static inline u32 bit(int bitno)
+{
+ return 1 << (bitno & 31);
+}
+
static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
@@ -4363,10 +4156,6 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
}
}
-static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
-{
-}
-
static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -4394,7 +4183,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
.get_cpl = vmx_get_cpl,
.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
.decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
- .decache_cr3 = vmx_decache_cr3,
.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
.set_cr0 = vmx_set_cr0,
.set_cr3 = vmx_set_cr3,
@@ -4404,7 +4192,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
.set_idt = vmx_set_idt,
.get_gdt = vmx_get_gdt,
.set_gdt = vmx_set_gdt,
- .set_dr7 = vmx_set_dr7,
.cache_reg = vmx_cache_reg,
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
@@ -4422,7 +4209,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
.set_irq = vmx_inject_irq,
.set_nmi = vmx_inject_nmi,
.queue_exception = vmx_queue_exception,
- .cancel_injection = vmx_cancel_injection,
.interrupt_allowed = vmx_interrupt_allowed,
.nmi_allowed = vmx_nmi_allowed,
.get_nmi_mask = vmx_get_nmi_mask,
@@ -4435,23 +4221,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
.get_tdp_level = get_ept_level,
.get_mt_mask = vmx_get_mt_mask,
- .get_exit_info = vmx_get_exit_info,
.exit_reasons_str = vmx_exit_reasons_str,
-
.get_lpage_level = vmx_get_lpage_level,
.cpuid_update = vmx_cpuid_update,
.rdtscp_supported = vmx_rdtscp_supported,
-
- .set_supported_cpuid = vmx_set_supported_cpuid,
-
- .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
-
- .write_tsc_offset = vmx_write_tsc_offset,
- .adjust_tsc_offset = vmx_adjust_tsc_offset,
-
- .set_tdp_cr3 = vmx_set_cr3,
};
static int __init vmx_init(void)
@@ -4499,8 +4274,7 @@ static int __init vmx_init(void)
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
- r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
- __alignof__(struct vcpu_vmx), THIS_MODULE);
+ r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
if (r)
goto out3;
@@ -4513,6 +4287,8 @@ static int __init vmx_init(void)
if (enable_ept) {
bypass_guest_pf = 0;
+ kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
+ VMX_EPT_WRITABLE_MASK);
kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
VMX_EPT_EXECUTABLE_MASK);
kvm_enable_tdp();
diff --git a/linux/x86/x86.c b/linux/x86/x86.c
index a6233d2..1071014 100644
--- a/linux/x86/x86.c
+++ b/linux/x86/x86.c
@@ -46,7 +46,6 @@
* Copyright (C) 2006 Qumranet, Inc.
* Copyright (C) 2008 Qumranet, Inc.
* Copyright IBM Corporation, 2008
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Avi Kivity <avi@qumranet.com>
@@ -81,23 +80,17 @@
#include <linux/user-return-notifier.h>
#include <linux/srcu.h>
#include <linux/slab.h>
-#include <linux/perf_event.h>
-#include <linux/uaccess.h>
-#include <linux/hash.h>
#include <trace/events/kvm.h>
-
+#undef TRACE_INCLUDE_FILE
#define CREATE_TRACE_POINTS
#include "trace.h"
#include <asm/debugreg.h>
+#include <asm/uaccess.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mtrr.h>
#include <asm/mce.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
-#include <asm/pvclock.h>
-#include <asm/div64.h>
#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS \
@@ -108,13 +101,12 @@
(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
| X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
| X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXSAVE \
| X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
#define KVM_MAX_MCE_BANKS 32
-#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
+#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
/* EFER defaults:
* - enable syscall per default because its emulated by KVM
@@ -172,7 +164,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "halt_exits", VCPU_STAT(halt_exits) },
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
{ "hypercalls", VCPU_STAT(hypercalls) },
- { "request_irq", VCPU_STAT(request_irq_exits) },
+ { "kvm_request_irq", VCPU_STAT(request_irq_exits) },
{ "irq_exits", VCPU_STAT(irq_exits) },
{ "host_state_reload", VCPU_STAT(host_state_reload) },
{ "efer_reload", VCPU_STAT(efer_reload) },
@@ -194,15 +186,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ NULL }
};
-u64 __read_mostly host_xcr0;
-
-static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
-{
- int i;
- for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
- vcpu->arch.apf.gfns[i] = ~0;
-}
-
static void kvm_on_user_return(struct kvm_user_return_notifier *urn)
{
unsigned slot;
@@ -280,6 +263,34 @@ static void drop_user_return_notifiers(void *ignore)
kvm_on_user_return(&smsr->urn);
}
+unsigned long segment_base(u16 selector)
+{
+ struct descriptor_table gdt;
+ struct kvm_desc_struct *d;
+ unsigned long table_base;
+ unsigned long v;
+
+ if (selector == 0)
+ return 0;
+
+ kvm_get_gdt(&gdt);
+ table_base = gdt.base;
+
+ if (selector & 4) { /* from ldt */
+ u16 ldt_selector = kvm_read_ldt();
+
+ table_base = segment_base(ldt_selector);
+ }
+ d = (struct kvm_desc_struct *)(table_base + (selector & ~7));
+ v = kvm_get_desc_base(d);
+#ifdef CONFIG_X86_64
+ if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
+ v |= ((unsigned long)((struct kvm_ldttss_desc64 *)d)->base3) << 32;
+#endif
+ return v;
+}
+EXPORT_SYMBOL_GPL(segment_base);
+
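segment_base() above walks the GDT/LDT and pulls the base address out of a descriptor via kvm_get_desc_base(). A standalone sketch of that extraction from the raw 8-byte descriptor, using the standard x86 layout (base[15:0] in bits 31:16, base[23:16] in bits 39:32, base[31:24] in bits 63:56) rather than anything specific to this tree:

        #include <stdint.h>
        #include <stdio.h>

        static uint32_t desc_base(uint64_t raw)
        {
                return (uint32_t)((raw >> 16) & 0x00ffffff) |
                       (uint32_t)((raw >> 56) & 0xff) << 24;
        }

        int main(void)
        {
                /* Build a descriptor with base 0x12345678, other fields zero. */
                uint64_t raw = 0;
                raw |= (uint64_t)(0x12345678u & 0xffffffu) << 16;   /* base 23:0  */
                raw |= (uint64_t)(0x12345678u >> 24) << 56;          /* base 31:24 */
                printf("0x%08x\n", desc_base(raw));                  /* 0x12345678 */
                return 0;
        }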
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
if (irqchip_in_kernel(vcpu->kvm))
@@ -321,21 +332,17 @@ static int exception_class(int vector)
}
static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
- unsigned nr, bool has_error, u32 error_code,
- bool reinject)
+ unsigned nr, bool has_error, u32 error_code)
{
u32 prev_nr;
int class1, class2;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-
if (!vcpu->arch.exception.pending) {
queue:
vcpu->arch.exception.pending = true;
vcpu->arch.exception.has_error_code = has_error;
vcpu->arch.exception.nr = nr;
vcpu->arch.exception.error_code = error_code;
- vcpu->arch.exception.reinject = reinject;
return;
}
@@ -343,7 +350,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
prev_nr = vcpu->arch.exception.nr;
if (prev_nr == DF_VECTOR) {
/* triple fault -> shutdown */
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
return;
}
class1 = exception_class(prev_nr);
@@ -364,59 +371,30 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
- kvm_multiple_exception(vcpu, nr, false, 0, false);
+ kvm_multiple_exception(vcpu, nr, false, 0);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);
-void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
-{
- kvm_multiple_exception(vcpu, nr, false, 0, true);
-}
-EXPORT_SYMBOL_GPL(kvm_requeue_exception);
-
-void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
-{
- if (err)
- kvm_inject_gp(vcpu, 0);
- else
- kvm_x86_ops->skip_emulated_instruction(vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
-
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
+ u32 error_code)
{
++vcpu->stat.pf_guest;
- vcpu->arch.cr2 = fault->address;
- kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
-}
-
-void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
-{
- if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
- vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
- else
- vcpu->arch.mmu.inject_page_fault(vcpu, fault);
+ vcpu->arch.cr2 = addr;
+ kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
}
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
- kvm_make_request(KVM_REQ_EVENT, vcpu);
vcpu->arch.nmi_pending = 1;
}
EXPORT_SYMBOL_GPL(kvm_inject_nmi);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
- kvm_multiple_exception(vcpu, nr, true, error_code, false);
+ kvm_multiple_exception(vcpu, nr, true, error_code);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
-void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
-{
- kvm_multiple_exception(vcpu, nr, true, error_code, true);
-}
-EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
-
/*
* Checks if cpl <= required_cpl; if true, return true. Otherwise queue
* a #GP and return false.
@@ -431,49 +409,18 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
EXPORT_SYMBOL_GPL(kvm_require_cpl);
/*
- * This function will be used to read from the physical memory of the currently
- * running guest. The difference to kvm_read_guest_page is that this function
- * can read from guest physical or from the guest's guest physical memory.
- */
-int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gfn_t ngfn, void *data, int offset, int len,
- u32 access)
-{
- gfn_t real_gfn;
- gpa_t ngpa;
-
- ngpa = gfn_to_gpa(ngfn);
- real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
- if (real_gfn == UNMAPPED_GVA)
- return -EFAULT;
-
- real_gfn = gpa_to_gfn(real_gfn);
-
- return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
-}
-EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
-
-int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
- void *data, int offset, int len, u32 access)
-{
- return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
- data, offset, len, access);
-}
-
-/*
* Load the pae pdptrs. Return true if they are all valid.
*/
-int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
{
gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
int i;
int ret;
- u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
+ u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
- ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
- offset * sizeof(u64), sizeof(pdpte),
- PFERR_USER_MASK|PFERR_WRITE_MASK);
+ ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
+ offset * sizeof(u64), sizeof(pdpte));
if (ret < 0) {
ret = 0;
goto out;
@@ -487,7 +434,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
}
ret = 1;
- memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
+ memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
__set_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_avail);
__set_bit(VCPU_EXREG_PDPTR,
@@ -500,10 +447,8 @@ EXPORT_SYMBOL_GPL(load_pdptrs);
static bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
- u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
+ u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
bool changed = true;
- int offset;
- gfn_t gfn;
int r;
if (is_long_mode(vcpu) || !is_pae(vcpu))
@@ -513,181 +458,132 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
(unsigned long *)&vcpu->arch.regs_avail))
return true;
- gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
- offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
- r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
- PFERR_USER_MASK | PFERR_WRITE_MASK);
+ r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
if (r < 0)
goto out;
- changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
+ changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
out:
return changed;
}
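A worked example of the PDPTE offset arithmetic used by load_pdptrs() above: in PAE mode CR3 points at a 32-byte-aligned table of four 8-byte entries, and the expression ((cr3 & (PAGE_SIZE-1)) >> 5) << 2 turns the byte offset within CR3's page into an index in u64 units (which the read then scales back by sizeof(u64)).

        #include <stdint.h>
        #include <stdio.h>

        #define PAGE_SIZE 4096u

        int main(void)
        {
                uint32_t cr3 = 0x12345fa0;                    /* 32-byte aligned */
                uint32_t byte_off = cr3 & (PAGE_SIZE - 1);    /* 0xfa0 */
                uint32_t u64_index = (byte_off >> 5) << 2;    /* 0xfa0 / 8 = 500 */

                printf("byte offset 0x%x -> u64 index %u -> byte offset %zu\n",
                       byte_off, u64_index, (size_t)u64_index * sizeof(uint64_t));
                return 0;
        }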
-int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
- unsigned long old_cr0 = kvm_read_cr0(vcpu);
- unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
- X86_CR0_CD | X86_CR0_NW;
-
cr0 |= X86_CR0_ET;
#ifdef CONFIG_X86_64
- if (cr0 & 0xffffffff00000000UL)
- return 1;
+ if (cr0 & 0xffffffff00000000UL) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
#endif
cr0 &= ~CR0_RESERVED_BITS;
- if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
- return 1;
+ if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
- if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
- return 1;
+ if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
if ((vcpu->arch.efer & EFER_LME)) {
int cs_db, cs_l;
- if (!is_pae(vcpu))
- return 1;
+ if (!is_pae(vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
- if (cs_l)
- return 1;
+ if (cs_l) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+
+ }
} else
#endif
- if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
- kvm_read_cr3(vcpu)))
- return 1;
+ if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+
}
kvm_x86_ops->set_cr0(vcpu, cr0);
+ vcpu->arch.cr0 = cr0;
- if ((cr0 ^ old_cr0) & X86_CR0_PG)
- kvm_clear_async_pf_completion_queue(vcpu);
-
- if ((cr0 ^ old_cr0) & update_bits)
- kvm_mmu_reset_context(vcpu);
- return 0;
+ kvm_mmu_reset_context(vcpu);
+ return;
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);
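The reverted kvm_set_cr0() injects #GP inline for each invalid combination instead of returning an error code. The architectural checks themselves reduce to a small predicate; here is a sketch of just that predicate (no bits above 31, NW requires CD, PG requires PE), leaving out the long-mode and PAE/PDPTR parts that need vcpu state.

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        #define X86_CR0_PE (1ul << 0)
        #define X86_CR0_NW (1ul << 29)
        #define X86_CR0_CD (1ul << 30)
        #define X86_CR0_PG (1ul << 31)

        static bool cr0_basically_valid(uint64_t cr0)
        {
                if (cr0 & 0xffffffff00000000ull)
                        return false;                              /* upper half must be clear */
                if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
                        return false;                              /* NW without CD */
                if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
                        return false;                              /* paging without protection */
                return true;
        }

        int main(void)
        {
                printf("%d %d\n",
                       cr0_basically_valid(X86_CR0_PE | X86_CR0_PG),   /* 1 */
                       cr0_basically_valid(X86_CR0_PG));               /* 0 */
                return 0;
        }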
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
- (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
+ kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(kvm_lmsw);
-int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- u64 xcr0;
-
- /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
- if (index != XCR_XFEATURE_ENABLED_MASK)
- return 1;
- xcr0 = xcr;
- if (kvm_x86_ops->get_cpl(vcpu) != 0)
- return 1;
- if (!(xcr0 & XSTATE_FP))
- return 1;
- if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
- return 1;
- if (xcr0 & ~host_xcr0)
- return 1;
- vcpu->arch.xcr0 = xcr0;
- vcpu->guest_xcr0_loaded = 0;
- return 0;
-}
+ unsigned long old_cr4 = kvm_read_cr4(vcpu);
+ unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
-int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
-{
- if (__kvm_set_xcr(vcpu, index, xcr)) {
+ if (cr4 & CR4_RESERVED_BITS) {
kvm_inject_gp(vcpu, 0);
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_set_xcr);
-
-static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 1, 0);
- return best && (best->ecx & bit(X86_FEATURE_XSAVE));
-}
-
-static void update_cpuid(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 1, 0);
- if (!best)
return;
-
- /* Update OSXSAVE bit */
- if (kvm_cpu_has_xsave && best->function == 0x1) {
- best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
- if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
- best->ecx |= bit(X86_FEATURE_OSXSAVE);
}
-}
-
-int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
- unsigned long old_cr4 = kvm_read_cr4(vcpu);
- unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
-
- if (cr4 & CR4_RESERVED_BITS)
- return 1;
-
- if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
- return 1;
if (is_long_mode(vcpu)) {
- if (!(cr4 & X86_CR4_PAE))
- return 1;
+ if (!(cr4 & X86_CR4_PAE)) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
&& ((cr4 ^ old_cr4) & pdptr_bits)
- && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
- kvm_read_cr3(vcpu)))
- return 1;
-
- if (cr4 & X86_CR4_VMXE)
- return 1;
+ && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+ if (cr4 & X86_CR4_VMXE) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
kvm_x86_ops->set_cr4(vcpu, cr4);
-
- if ((cr4 ^ old_cr4) & pdptr_bits)
- kvm_mmu_reset_context(vcpu);
-
- if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
- update_cpuid(vcpu);
-
- return 0;
+ vcpu->arch.cr4 = cr4;
+ vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
+ kvm_mmu_reset_context(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);
-int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
- if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
+ if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
kvm_mmu_sync_roots(vcpu);
kvm_mmu_flush_tlb(vcpu);
- return 0;
+ return;
}
if (is_long_mode(vcpu)) {
- if (cr3 & CR3_L_MODE_RESERVED_BITS)
- return 1;
+ if (cr3 & CR3_L_MODE_RESERVED_BITS) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
} else {
if (is_pae(vcpu)) {
- if (cr3 & CR3_PAE_RESERVED_BITS)
- return 1;
- if (is_paging(vcpu) &&
- !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
- return 1;
+ if (cr3 & CR3_PAE_RESERVED_BITS) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+ if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
}
/*
* We don't check reserved bits in nonpae mode, because
@@ -705,23 +601,24 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
* to debug) behavior on the guest side.
*/
if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
- return 1;
- vcpu->arch.cr3 = cr3;
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
- vcpu->arch.mmu.new_cr3(vcpu);
- return 0;
+ kvm_inject_gp(vcpu, 0);
+ else {
+ vcpu->arch.cr3 = cr3;
+ vcpu->arch.mmu.new_cr3(vcpu);
+ }
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);
-int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
- if (cr8 & CR8_RESERVED_BITS)
- return 1;
+ if (cr8 & CR8_RESERVED_BITS) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
if (irqchip_in_kernel(vcpu->kvm))
kvm_lapic_set_tpr(vcpu, cr8);
else
vcpu->arch.cr8 = cr8;
- return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr8);
@@ -734,90 +631,11 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);
-static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+static inline u32 bit(int bitno)
{
- switch (dr) {
- case 0 ... 3:
- vcpu->arch.db[dr] = val;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
- vcpu->arch.eff_db[dr] = val;
- break;
- case 4:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return 1; /* #UD */
- /* fall through */
- case 6:
- if (val & 0xffffffff00000000ULL)
- return -1; /* #GP */
- vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
- break;
- case 5:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return 1; /* #UD */
- /* fall through */
- default: /* 7 */
- if (val & 0xffffffff00000000ULL)
- return -1; /* #GP */
- vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
- kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
- vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
- }
- break;
- }
-
- return 0;
+ return 1 << (bitno & 31);
}
-int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
-{
- int res;
-
- res = __kvm_set_dr(vcpu, dr, val);
- if (res > 0)
- kvm_queue_exception(vcpu, UD_VECTOR);
- else if (res < 0)
- kvm_inject_gp(vcpu, 0);
-
- return res;
-}
-EXPORT_SYMBOL_GPL(kvm_set_dr);
-
-static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
-{
- switch (dr) {
- case 0 ... 3:
- *val = vcpu->arch.db[dr];
- break;
- case 4:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return 1;
- /* fall through */
- case 6:
- *val = vcpu->arch.dr6;
- break;
- case 5:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return 1;
- /* fall through */
- default: /* 7 */
- *val = vcpu->arch.dr7;
- break;
- }
-
- return 0;
-}
-
-int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
-{
- if (_kvm_get_dr(vcpu, dr, val)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_get_dr);
-
/*
* List of msr numbers which we expose to userspace through KVM_GET_MSRS
* and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
@@ -827,67 +645,67 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
* kvm-specific. Those are put in the beginning of the list.
*/
-#define KVM_SAVE_MSRS_BEGIN 8
+#define KVM_SAVE_MSRS_BEGIN 5
static u32 msrs_to_save[] = {
MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
- MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
- HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
+ HV_X64_MSR_APIC_ASSIST_PAGE,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
- MSR_STAR,
+ MSR_K6_STAR,
#ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
- MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+ MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
};
static unsigned num_msrs_to_save;
static u32 emulated_msrs[] = {
MSR_IA32_MISC_ENABLE,
- MSR_IA32_MCG_STATUS,
- MSR_IA32_MCG_CTL,
};
-static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
+static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
- u64 old_efer = vcpu->arch.efer;
-
- if (efer & efer_reserved_bits)
- return 1;
+ if (efer & efer_reserved_bits) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
if (is_paging(vcpu)
- && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
- return 1;
+ && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
if (efer & EFER_FFXSR) {
struct kvm_cpuid_entry2 *feat;
feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
- if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
- return 1;
+ if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
}
if (efer & EFER_SVME) {
struct kvm_cpuid_entry2 *feat;
feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
- if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
- return 1;
+ if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
}
+ kvm_x86_ops->set_efer(vcpu, efer);
+
efer &= ~EFER_LMA;
efer |= vcpu->arch.efer & EFER_LMA;
- kvm_x86_ops->set_efer(vcpu, efer);
+ vcpu->arch.efer = efer;
vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
-
- /* Update reserved bits */
- if ((efer ^ old_efer) & EFER_NX)
- kvm_mmu_reset_context(vcpu);
-
- return 0;
+ kvm_mmu_reset_context(vcpu);
}
void kvm_enable_efer_bits(u64 mask)
@@ -917,28 +735,20 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
- int version;
- int r;
+ static int version;
struct pvclock_wall_clock wc;
struct timespec boot;
if (!wall_clock)
return;
- r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
- if (r)
- return;
-
- if (version & 1)
- ++version; /* first time write, random junk */
-
- ++version;
+ version++;
kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
/*
* The guest calculates current wall clock time by adding
- * system time (updated by kvm_guest_time_update below) to the
+ * system time (updated by kvm_write_guest_time below) to the
* wall clock specified here. guest system time equals host
* system time for us, thus we must fill in host boot time here.
*/
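A sketch of the reader side that the wall-clock version field above is meant to support, assuming the usual pvclock convention: the host leaves the version odd while it rewrites the structure, so the guest retries until it reads the same even version before and after the data. The field layout here is an assumption for illustration.

        #include <stdint.h>

        struct wall_clock {
                volatile uint32_t version;
                volatile uint32_t sec;
                volatile uint32_t nsec;
        };

        static void read_wall_clock(const struct wall_clock *wc,
                                    uint32_t *sec, uint32_t *nsec)
        {
                uint32_t v1, v2;

                do {
                        v1 = wc->version;
                        *sec = wc->sec;
                        *nsec = wc->nsec;
                        v2 = wc->version;
                } while ((v1 & 1) || v1 != v2);   /* odd or changed: retry */
        }

        int main(void)
        {
                struct wall_clock wc = { .version = 2, .sec = 100, .nsec = 42 };
                uint32_t s, ns;

                read_wall_clock(&wc, &s, &ns);
                return (int)(s + ns);
        }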
@@ -966,230 +776,64 @@ static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
return quotient;
}
-static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
- s8 *pshift, u32 *pmultiplier)
+static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
{
- uint64_t scaled64;
+ uint64_t nsecs = 1000000000LL;
int32_t shift = 0;
uint64_t tps64;
uint32_t tps32;
- tps64 = base_khz * 1000LL;
- scaled64 = scaled_khz * 1000LL;
- while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
+ tps64 = tsc_khz * 1000LL;
+ while (tps64 > nsecs*2) {
tps64 >>= 1;
shift--;
}
tps32 = (uint32_t)tps64;
- while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
- if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
- scaled64 >>= 1;
- else
- tps32 <<= 1;
+ while (tps32 <= (uint32_t)nsecs) {
+ tps32 <<= 1;
shift++;
}
- *pshift = shift;
- *pmultiplier = div_frac(scaled64, tps32);
+ hv_clock->tsc_shift = shift;
+ hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
- pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
- __func__, base_khz, scaled_khz, shift, *pmultiplier);
-}
-
-static inline u64 get_kernel_ns(void)
-{
- struct timespec ts;
-
- WARN_ON(preemptible());
- ktime_get_ts(&ts);
- kvm_monotonic_to_bootbased(&ts);
- return timespec_to_ns(&ts);
+ pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
+ __func__, tsc_khz, hv_clock->tsc_shift,
+ hv_clock->tsc_to_system_mul);
}
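kvm_set_time_scale() above derives a shift/multiplier pair so the guest can convert TSC deltas to nanoseconds with a 32x32->64 multiply. A standalone version of the same two loops, with div_frac() replaced by an equivalent single 64-bit division so the numbers can be checked: for a 2.4 GHz TSC it yields shift -1 and a multiplier of about 3.58e9 (roughly 2^32 / 1.2, since the guest also halves the delta by the shift before multiplying).

        #include <stdint.h>
        #include <stdio.h>

        static void time_scale(uint32_t tsc_khz, int8_t *pshift, uint32_t *pmul)
        {
                uint64_t nsecs = 1000000000ull;
                int32_t shift = 0;
                uint64_t tps64 = (uint64_t)tsc_khz * 1000;
                uint32_t tps32;

                while (tps64 > nsecs * 2) {        /* halve the rate until it is below 2 GHz */
                        tps64 >>= 1;
                        shift--;
                }
                tps32 = (uint32_t)tps64;
                while (tps32 <= (uint32_t)nsecs) { /* then double it until just above 1 GHz */
                        tps32 <<= 1;
                        shift++;
                }
                *pshift = (int8_t)shift;
                /* div_frac(nsecs, tps32): the 32.32 fixed-point ratio nsecs / tps32 */
                *pmul = (uint32_t)(((uint64_t)(uint32_t)nsecs << 32) / tps32);
        }

        int main(void)
        {
                int8_t shift;
                uint32_t mul;

                time_scale(2400000, &shift, &mul);  /* 2.4 GHz TSC */
                printf("shift %d mul %u\n", shift, mul);
                return 0;
        }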
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
-unsigned long max_tsc_khz;
-
-static inline int kvm_tsc_changes_freq(void)
-{
- int cpu = get_cpu();
- int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
- cpufreq_quick_get(cpu) != 0;
- put_cpu();
- return ret;
-}
-
-static inline u64 nsec_to_cycles(u64 nsec)
-{
- u64 ret;
-
- WARN_ON(preemptible());
- if (kvm_tsc_changes_freq())
- printk_once(KERN_WARNING
- "kvm: unreliable cycle conversion on adjustable rate TSC\n");
- ret = nsec * kvm___this_cpu_read(cpu_tsc_khz);
- do_div(ret, USEC_PER_SEC);
- return ret;
-}
-static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz)
-{
- /* Compute a scale to convert nanoseconds in TSC cycles */
- kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
- &kvm->arch.virtual_tsc_shift,
- &kvm->arch.virtual_tsc_mult);
- kvm->arch.virtual_tsc_khz = this_tsc_khz;
-}
-
-static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
-{
- u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
- vcpu->kvm->arch.virtual_tsc_mult,
- vcpu->kvm->arch.virtual_tsc_shift);
- tsc += vcpu->arch.last_tsc_write;
- return tsc;
-}
-
-void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
-{
- struct kvm *kvm = vcpu->kvm;
- u64 offset, ns, elapsed;
- unsigned long flags;
- s64 sdiff;
-
- spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
- offset = data - kvm_native_read_tsc();
- ns = get_kernel_ns();
- elapsed = ns - kvm->arch.last_tsc_nsec;
- sdiff = data - kvm->arch.last_tsc_write;
- if (sdiff < 0)
- sdiff = -sdiff;
-
- /*
- * Special case: close write to TSC within 5 seconds of
- * another CPU is interpreted as an attempt to synchronize
- * The 5 seconds is to accomodate host load / swapping as
- * well as any reset of TSC during the boot process.
- *
- * In that case, for a reliable TSC, we can match TSC offsets,
- * or make a best guest using elapsed value.
- */
- if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) &&
- elapsed < 5ULL * NSEC_PER_SEC) {
- if (!kvm_check_tsc_unstable()) {
- offset = kvm->arch.last_tsc_offset;
- pr_debug("kvm: matched tsc offset for %llu\n", data);
- } else {
- u64 delta = nsec_to_cycles(elapsed);
- offset += delta;
- pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
- }
- ns = kvm->arch.last_tsc_nsec;
- }
- kvm->arch.last_tsc_nsec = ns;
- kvm->arch.last_tsc_write = data;
- kvm->arch.last_tsc_offset = offset;
- kvm_x86_ops->write_tsc_offset(vcpu, offset);
- spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
-
- /* Reset of TSC must disable overshoot protection below */
- vcpu->arch.hv_clock.tsc_timestamp = 0;
- vcpu->arch.last_tsc_write = data;
- vcpu->arch.last_tsc_nsec = ns;
-}
-EXPORT_SYMBOL_GPL(kvm_write_tsc);
-
-static int kvm_guest_time_update(struct kvm_vcpu *v)
+static void kvm_write_guest_time(struct kvm_vcpu *v)
{
+ struct timespec ts;
unsigned long flags;
struct kvm_vcpu_arch *vcpu = &v->arch;
void *shared_kaddr;
unsigned long this_tsc_khz;
- s64 kernel_ns, max_kernel_ns;
- u64 tsc_timestamp;
- /* Keep irq disabled to prevent changes to the clock */
- local_irq_save(flags);
- kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
- kernel_ns = get_kernel_ns();
- this_tsc_khz = kvm___this_cpu_read(cpu_tsc_khz);
+ if ((!vcpu->time_page))
+ return;
- if (unlikely(this_tsc_khz == 0)) {
- local_irq_restore(flags);
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
- return 1;
- }
-
- /*
- * We may have to catch up the TSC to match elapsed wall clock
- * time for two reasons, even if kvmclock is used.
- * 1) CPU could have been running below the maximum TSC rate
- * 2) Broken TSC compensation resets the base at each VCPU
- * entry to avoid unknown leaps of TSC even when running
- * again on the same CPU. This may cause apparent elapsed
- * time to disappear, and the guest to stand still or run
- * very slowly.
- */
- if (vcpu->tsc_catchup) {
- u64 tsc = compute_guest_tsc(v, kernel_ns);
- if (tsc > tsc_timestamp) {
- kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
- tsc_timestamp = tsc;
- }
+ this_tsc_khz = get_cpu_var(cpu_tsc_khz);
+ if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
+ kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
+ vcpu->hv_clock_tsc_khz = this_tsc_khz;
}
+ put_cpu_var(cpu_tsc_khz);
+ /* Keep irq disabled to prevent changes to the clock */
+ local_irq_save(flags);
+ kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
+ ktime_get_ts(&ts);
+ kvm_monotonic_to_bootbased(&ts);
local_irq_restore(flags);
- if (!vcpu->time_page)
- return 0;
-
- /*
- * Time as measured by the TSC may go backwards when resetting the base
- * tsc_timestamp. The reason for this is that the TSC resolution is
- * higher than the resolution of the other clock scales. Thus, many
- * possible measurements of the TSC correspond to one measurement of any
- * other clock, and so a spread of values is possible. This is not a
- * problem for the computation of the nanosecond clock; with TSC rates
- * around 1GHz, there can only be a few cycles which correspond to one
- * nanosecond value, and any path through this code will inevitably
- * take longer than that. However, with the kernel_ns value itself,
- * the precision may be much lower, down to HZ granularity. If the
- * first sampling of TSC against kernel_ns ends in the low part of the
- * range, and the second in the high end of the range, we can get:
- *
- * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
- *
- * As the sampling errors potentially range in the thousands of cycles,
- * it is possible such a time value has already been observed by the
- * guest. To protect against this, we must compute the system time as
- * observed by the guest and ensure the new system time is greater.
- */
- max_kernel_ns = 0;
- if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
- max_kernel_ns = vcpu->last_guest_tsc -
- vcpu->hv_clock.tsc_timestamp;
- max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
- vcpu->hv_clock.tsc_to_system_mul,
- vcpu->hv_clock.tsc_shift);
- max_kernel_ns += vcpu->last_kernel_ns;
- }
-
- if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
- kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
- &vcpu->hv_clock.tsc_shift,
- &vcpu->hv_clock.tsc_to_system_mul);
- vcpu->hw_tsc_khz = this_tsc_khz;
- }
-
- if (max_kernel_ns > kernel_ns)
- kernel_ns = max_kernel_ns;
-
/* With all the info we got, fill in the values */
- vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
- vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
- vcpu->last_kernel_ns = kernel_ns;
- vcpu->last_guest_tsc = tsc_timestamp;
- vcpu->hv_clock.flags = 0;
+
+ vcpu->hv_clock.system_time = ts.tv_nsec +
+ (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
/*
* The interface expects us to write an even number signaling that the
@@ -1206,7 +850,16 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
kunmap_atomic(shared_kaddr, KM_USER0);
mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
- return 0;
+}
+
+static int kvm_request_guest_time_update(struct kvm_vcpu *v)
+{
+ struct kvm_vcpu_arch *vcpu = &v->arch;
+
+ if (!vcpu->time_page)
+ return 0;
+ set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
+ return 1;
}
static bool msr_mtrr_valid(unsigned msr)
@@ -1469,38 +1122,14 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
return 0;
}
-static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
-{
- gpa_t gpa = data & ~0x3f;
-
- /* Bits 2:5 are reserved, should be zero */
- if (data & 0x3c)
- return 1;
-
- vcpu->arch.apf.msr_val = data;
-
- if (!(data & KVM_ASYNC_PF_ENABLED)) {
- kvm_clear_async_pf_completion_queue(vcpu);
- kvm_async_pf_hash_reset(vcpu);
- return 0;
- }
-
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
- return 1;
-
- vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
- kvm_async_pf_wakeup_all(vcpu);
- return 0;
-}
-
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
switch (msr) {
case MSR_EFER:
- return set_efer(vcpu, data);
+ set_efer(vcpu, data);
+ break;
case MSR_K7_HWCR:
data &= ~(u64)0x40; /* ignore flush filter disable */
- data &= ~(u64)0x100; /* ignore ignne emulation enable */
if (data != 0) {
pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
data);
@@ -1543,12 +1172,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
case MSR_IA32_MISC_ENABLE:
vcpu->arch.ia32_misc_enable_msr = data;
break;
- case MSR_KVM_WALL_CLOCK_NEW:
case MSR_KVM_WALL_CLOCK:
vcpu->kvm->arch.wall_clock = data;
kvm_write_wall_clock(vcpu->kvm, data);
break;
- case MSR_KVM_SYSTEM_TIME_NEW:
case MSR_KVM_SYSTEM_TIME: {
if (vcpu->arch.time_page) {
kvm_release_page_dirty(vcpu->arch.time_page);
@@ -1556,7 +1183,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
}
vcpu->arch.time = data;
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
/* we verify if the enable bit is set... */
if (!(data & 1))
@@ -1572,12 +1198,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
kvm_release_page_clean(vcpu->arch.time_page);
vcpu->arch.time_page = NULL;
}
+
+ kvm_request_guest_time_update(vcpu);
break;
}
- case MSR_KVM_ASYNC_PF_EN:
- if (kvm_pv_enable_async_pf(vcpu, data))
- return 1;
- break;
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1612,16 +1236,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
"0x%x data 0x%llx\n", msr, data);
break;
- case MSR_K7_CLK_CTL:
- /*
- * Ignore all writes to this no longer documented MSR.
- * Writes are only relevant for old K7 processors,
- * all pre-dating SVM, but a recommended workaround from
- * AMD for these chips. It is possible to speicify the
- * affected processor models on the command line, hence
- * the need to ignore the workaround.
- */
- break;
case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
if (kvm_hv_msr_partition_wide(msr)) {
int r;
@@ -1814,20 +1428,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case 0xcd: /* fsb frequency */
data = 3;
break;
- /*
- * MSR_EBC_FREQUENCY_ID
- * Conservative value valid for even the basic CPU models.
- * Models 0,1: 000 in bits 23:21 indicating a bus speed of
- * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
- * and 266MHz for model 3, or 4. Set Core Clock
- * Frequency to System Bus Frequency Ratio to 1 (bits
- * 31:24) even though these are only valid for CPU
- * models > 2, however guests may end up dividing or
- * multiplying by zero otherwise.
- */
- case MSR_EBC_FREQUENCY_ID:
- data = 1 << 24;
- break;
case MSR_IA32_APICBASE:
data = kvm_get_apic_base(vcpu);
break;
@@ -1847,16 +1447,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
data = vcpu->arch.efer;
break;
case MSR_KVM_WALL_CLOCK:
- case MSR_KVM_WALL_CLOCK_NEW:
data = vcpu->kvm->arch.wall_clock;
break;
case MSR_KVM_SYSTEM_TIME:
- case MSR_KVM_SYSTEM_TIME_NEW:
data = vcpu->arch.time;
break;
- case MSR_KVM_ASYNC_PF_EN:
- data = vcpu->arch.apf.msr_val;
- break;
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
case MSR_IA32_MCG_CAP:
@@ -1864,18 +1459,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
return get_msr_mce(vcpu, msr, pdata);
- case MSR_K7_CLK_CTL:
- /*
- * Provide expected ramp-up count for K7. All others
- * are set to zero, indicating minimum divisors for
- * every field.
- *
- * This prevents guest kernels on AMD host with CPU
- * type 6, model 8 and higher from exploding due to
- * the rdmsr failing.
- */
- data = 0x20000000;
- break;
case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
if (kvm_hv_msr_partition_wide(msr)) {
int r;
@@ -1913,11 +1496,15 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
{
int i, idx;
- idx = srcu_read_lock(&vcpu->kvm->srcu);
+ vcpu_load(vcpu);
+
+ idx = kvm_srcu_read_lock(&vcpu->kvm->srcu);
for (i = 0; i < msrs->nmsrs; ++i)
if (do_msr(vcpu, entries[i].index, &entries[i].data))
break;
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ kvm_srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ vcpu_put(vcpu);
return i;
}
@@ -1947,7 +1534,7 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *user_msrs,
r = -ENOMEM;
size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
- entries = kmalloc(size, GFP_KERNEL);
+ entries = vmalloc(size);
if (!entries)
goto out;
@@ -1966,7 +1553,7 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *user_msrs,
r = n;
out_free:
- kfree(entries);
+ vfree(entries);
out:
return r;
}
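A minimal userspace sketch of the KVM_GET_MSRS ioctl that msr_io() serves (not part of the patch), assuming a vcpu fd already obtained via KVM_CREATE_VCPU; error handling is reduced to perror().

/* Illustrative caller of the msr_io() path above. */
#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

static void dump_guest_tsc(int vcpu_fd)
{
    struct {
        struct kvm_msrs hdr;
        struct kvm_msr_entry entries[1];
    } req = {
        .hdr.nmsrs = 1,
        .entries[0].index = 0x10,        /* MSR_IA32_TSC */
    };

    /* KVM_GET_MSRS returns the number of entries actually filled in. */
    if (ioctl(vcpu_fd, KVM_GET_MSRS, &req) < 1)
        perror("KVM_GET_MSRS");
    else
        printf("guest TSC = %llu\n",
               (unsigned long long)req.entries[0].data);
}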
@@ -1988,7 +1575,6 @@ int kvm_dev_ioctl_check_extension(long ext)
#ifdef CONFIG_MMU_NOTIFIER
case KVM_CAP_SYNC_MMU:
#endif
- case KVM_CAP_USER_NMI:
case KVM_CAP_REINJECT_CONTROL:
case KVM_CAP_IRQ_INJECT_STATUS:
case KVM_CAP_ASSIGN_DEV_IRQ:
@@ -2006,12 +1592,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_HYPERV_VAPIC:
case KVM_CAP_HYPERV_SPIN:
case KVM_CAP_PCI_SEGMENT:
- case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
- case KVM_CAP_XSAVE:
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,38)
- case KVM_CAP_ASYNC_PF:
-#endif
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -2035,9 +1616,6 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_MCE:
r = KVM_MAX_MCE_BANKS;
break;
- case KVM_CAP_XCRS:
- r = kvm_cpu_has_xsave;
- break;
default:
r = 0;
break;
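A minimal userspace probe of the capability switch above (not part of the patch), assuming the usual /dev/kvm character device; only capabilities that appear in this function are queried.

/* Illustrative only: query two of the capabilities reported above. */
#include <fcntl.h>
#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
    int kvm_fd = open("/dev/kvm", O_RDWR);

    if (kvm_fd < 0) {
        perror("/dev/kvm");
        return 1;
    }
    /* A zero return means unsupported; some caps return a value instead,
     * e.g. KVM_CAP_MCE reports the number of MCE banks. */
    printf("KVM_CAP_COALESCED_MMIO: %d\n",
           ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_MMIO));
    printf("KVM_CAP_MCE banks:      %d\n",
           ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_MCE));
    close(kvm_fd);
    return 0;
}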
@@ -2114,51 +1692,22 @@ out:
return r;
}
-static void wbinvd_ipi(void *garbage)
-{
- wbinvd();
-}
-
-static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
-{
- return vcpu->kvm->arch.iommu_domain &&
- !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
-}
-
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- /* Address WBINVD may be executed by guest */
- if (need_emulate_wbinvd(vcpu)) {
- if (kvm_x86_ops->has_wbinvd_exit())
- cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
- else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
- smp_call_function_single(vcpu->cpu,
- wbinvd_ipi, NULL, 1);
- }
-
kvm_x86_ops->vcpu_load(vcpu, cpu);
- if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
- /* Make sure TSC doesn't go backwards */
- s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
- kvm_native_read_tsc() - vcpu->arch.last_host_tsc;
- if (tsc_delta < 0)
- mark_tsc_unstable("KVM discovered backwards TSC");
- if (kvm_check_tsc_unstable()) {
- kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
- vcpu->arch.tsc_catchup = 1;
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
- }
- if (vcpu->cpu != cpu)
- kvm_migrate_timers(vcpu);
- vcpu->cpu = cpu;
+ if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
+ unsigned long khz = cpufreq_quick_get(cpu);
+ if (!khz)
+ khz = tsc_khz;
+ per_cpu(cpu_tsc_khz, cpu) = khz;
}
+ kvm_request_guest_time_update(vcpu);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
- kvm_x86_ops->vcpu_put(vcpu);
kvm_put_guest_fpu(vcpu);
- vcpu->arch.last_host_tsc = kvm_native_read_tsc();
+ kvm_x86_ops->vcpu_put(vcpu);
}
static int is_efer_nx(void)
@@ -2207,6 +1756,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
if (copy_from_user(cpuid_entries, entries,
cpuid->nent * sizeof(struct kvm_cpuid_entry)))
goto out_free;
+ vcpu_load(vcpu);
for (i = 0; i < cpuid->nent; i++) {
vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
@@ -2224,7 +1774,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
r = 0;
kvm_apic_set_version(vcpu);
kvm_x86_ops->cpuid_update(vcpu);
- update_cpuid(vcpu);
+ vcpu_put(vcpu);
out_free:
vfree(cpuid_entries);
@@ -2245,10 +1795,11 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
goto out;
+ vcpu_load(vcpu);
vcpu->arch.cpuid_nent = cpuid->nent;
kvm_apic_set_version(vcpu);
kvm_x86_ops->cpuid_update(vcpu);
- update_cpuid(vcpu);
+ vcpu_put(vcpu);
return 0;
out:
@@ -2275,11 +1826,6 @@ out:
return r;
}
-static void cpuid_mask(u32 *word, int wordnum)
-{
- *word &= boot_cpu_data.x86_capability[wordnum];
-}
-
static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
u32 index)
{
@@ -2328,20 +1874,19 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
/* cpuid 1.ecx */
const u32 kvm_supported_word4_x86_features =
- F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
+ F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
0 /* DS-CPL, VMX, SMX, EST */ |
0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
0 /* Reserved, DCA */ | F(XMM4_1) |
F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
- 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
- F(F16C);
+ 0 /* Reserved, XSAVE, OSXSAVE */;
/* cpuid 0x80000001.ecx */
const u32 kvm_supported_word6_x86_features =
- F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
+ F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
- F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
- 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
+ F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
+ 0 /* SKINIT */ | 0 /* WDT */;
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
@@ -2350,13 +1895,11 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
switch (function) {
case 0:
- entry->eax = min(entry->eax, (u32)0xd);
+ entry->eax = min(entry->eax, (u32)0xb);
break;
case 1:
entry->edx &= kvm_supported_word0_x86_features;
- cpuid_mask(&entry->edx, 0);
entry->ecx &= kvm_supported_word4_x86_features;
- cpuid_mask(&entry->ecx, 4);
/* we support x2apic emulation even if host does not support
* it since we emulate x2apic in software */
entry->ecx |= F(X2APIC);
@@ -2410,51 +1953,14 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
}
break;
}
- case 0xd: {
- int i;
-
- entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- for (i = 1; *nent < maxnent; ++i) {
- if (entry[i - 1].eax == 0 && i != 2)
- break;
- do_cpuid_1_ent(&entry[i], function, i);
- entry[i].flags |=
- KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- ++*nent;
- }
- break;
- }
- case KVM_CPUID_SIGNATURE: {
- char signature[12] = "KVMKVMKVM\0\0";
- u32 *sigptr = (u32 *)signature;
- entry->eax = 0;
- entry->ebx = sigptr[0];
- entry->ecx = sigptr[1];
- entry->edx = sigptr[2];
- break;
- }
- case KVM_CPUID_FEATURES:
- entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
- (1 << KVM_FEATURE_NOP_IO_DELAY) |
- (1 << KVM_FEATURE_CLOCKSOURCE2) |
- (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
- entry->ebx = 0;
- entry->ecx = 0;
- entry->edx = 0;
- break;
case 0x80000000:
entry->eax = min(entry->eax, 0x8000001a);
break;
case 0x80000001:
entry->edx &= kvm_supported_word1_x86_features;
- cpuid_mask(&entry->edx, 1);
entry->ecx &= kvm_supported_word6_x86_features;
- cpuid_mask(&entry->ecx, 6);
break;
}
-
- kvm_x86_ops->set_supported_cpuid(function, entry);
-
put_cpu();
}
@@ -2490,23 +1996,6 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
do_cpuid_ent(&cpuid_entries[nent], func, 0,
&nent, cpuid->nent);
-
-
-
- r = -E2BIG;
- if (nent >= cpuid->nent)
- goto out_free;
-
- do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
- cpuid->nent);
-
- r = -E2BIG;
- if (nent >= cpuid->nent)
- goto out_free;
-
- do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent,
- cpuid->nent);
-
r = -E2BIG;
if (nent >= cpuid->nent)
goto out_free;
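A minimal userspace caller of KVM_GET_SUPPORTED_CPUID, the ioctl these do_cpuid_ent() calls feed (not part of the patch); the fixed entry count is a guess, and a real caller would retry with a larger buffer on E2BIG.

/* Illustrative only: list the CPUID leaves KVM reports to userspace. */
#include <linux/kvm.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static void dump_supported_cpuid(int kvm_fd)
{
    unsigned int i, n = 64;              /* guessed capacity */
    struct kvm_cpuid2 *cpuid;

    cpuid = calloc(1, sizeof(*cpuid) + n * sizeof(struct kvm_cpuid_entry2));
    if (!cpuid)
        return;
    cpuid->nent = n;
    if (ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid) == 0) {
        for (i = 0; i < cpuid->nent; i++)
            printf("leaf 0x%08x idx %u: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
                   cpuid->entries[i].function, cpuid->entries[i].index,
                   cpuid->entries[i].eax, cpuid->entries[i].ebx,
                   cpuid->entries[i].ecx, cpuid->entries[i].edx);
    }
    free(cpuid);
}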
@@ -2527,7 +2016,9 @@ out:
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
+ vcpu_load(vcpu);
memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
+ vcpu_put(vcpu);
return 0;
}
@@ -2535,9 +2026,11 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
+ vcpu_load(vcpu);
memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
kvm_apic_post_state_restore(vcpu);
update_cr8_intercept(vcpu);
+ vcpu_put(vcpu);
return 0;
}
@@ -2549,16 +2042,20 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
return -EINVAL;
if (irqchip_in_kernel(vcpu->kvm))
return -ENXIO;
+ vcpu_load(vcpu);
kvm_queue_interrupt(vcpu, irq->irq, false);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ vcpu_put(vcpu);
return 0;
}
static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
{
+ vcpu_load(vcpu);
kvm_inject_nmi(vcpu);
+ vcpu_put(vcpu);
return 0;
}
@@ -2624,7 +2121,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
printk(KERN_DEBUG "kvm: set_mce: "
"injects mce exception while "
"previous one is in progress!\n");
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
return 0;
}
if (banks[1] & MCI_STATUS_VAL)
@@ -2649,43 +2146,38 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
- events->exception.injected =
- vcpu->arch.exception.pending &&
- !kvm_exception_is_soft(vcpu->arch.exception.nr);
+ vcpu_load(vcpu);
+
+ events->exception.injected = vcpu->arch.exception.pending;
events->exception.nr = vcpu->arch.exception.nr;
events->exception.has_error_code = vcpu->arch.exception.has_error_code;
- events->exception.pad = 0;
events->exception.error_code = vcpu->arch.exception.error_code;
- events->interrupt.injected =
- vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
+ events->interrupt.injected = vcpu->arch.interrupt.pending;
events->interrupt.nr = vcpu->arch.interrupt.nr;
- events->interrupt.soft = 0;
- events->interrupt.shadow =
- kvm_x86_ops->get_interrupt_shadow(vcpu,
- KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
+ events->interrupt.soft = vcpu->arch.interrupt.soft;
events->nmi.injected = vcpu->arch.nmi_injected;
events->nmi.pending = vcpu->arch.nmi_pending;
events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
- events->nmi.pad = 0;
events->sipi_vector = vcpu->arch.sipi_vector;
events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
- | KVM_VCPUEVENT_VALID_SIPI_VECTOR
- | KVM_VCPUEVENT_VALID_SHADOW);
- memset(&events->reserved, 0, sizeof(events->reserved));
+ | KVM_VCPUEVENT_VALID_SIPI_VECTOR);
+
+ vcpu_put(vcpu);
}
static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
- | KVM_VCPUEVENT_VALID_SIPI_VECTOR
- | KVM_VCPUEVENT_VALID_SHADOW))
+ | KVM_VCPUEVENT_VALID_SIPI_VECTOR))
return -EINVAL;
+ vcpu_load(vcpu);
+
vcpu->arch.exception.pending = events->exception.injected;
vcpu->arch.exception.nr = events->exception.nr;
vcpu->arch.exception.has_error_code = events->exception.has_error_code;
@@ -2696,9 +2188,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.interrupt.soft = events->interrupt.soft;
if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
kvm_pic_clear_isr_ack(vcpu->kvm);
- if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
- kvm_x86_ops->set_interrupt_shadow(vcpu,
- events->interrupt.shadow);
vcpu->arch.nmi_injected = events->nmi.injected;
if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
@@ -2708,134 +2197,34 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
vcpu->arch.sipi_vector = events->sipi_vector;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-
- return 0;
-}
-
-static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
- struct kvm_debugregs *dbgregs)
-{
- memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
- dbgregs->dr6 = vcpu->arch.dr6;
- dbgregs->dr7 = vcpu->arch.dr7;
- dbgregs->flags = 0;
- memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
-}
-
-static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
- struct kvm_debugregs *dbgregs)
-{
- if (dbgregs->flags)
- return -EINVAL;
-
- memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
- vcpu->arch.dr6 = dbgregs->dr6;
- vcpu->arch.dr7 = dbgregs->dr7;
+ vcpu_put(vcpu);
return 0;
}
-static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
- struct kvm_xsave *guest_xsave)
-{
- if (kvm_cpu_has_xsave)
- memcpy(guest_xsave->region,
- &vcpu->arch.guest_fpu.state->xsave,
- kvm_xstate_size);
- else {
- memcpy(guest_xsave->region,
- &vcpu->arch.guest_fpu.state->fxsave,
- sizeof(struct kvm_i387_fxsave_struct));
- *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
- XSTATE_FPSSE;
- }
-}
-
-static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
- struct kvm_xsave *guest_xsave)
-{
- u64 xstate_bv =
- *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
-
- if (kvm_cpu_has_xsave)
- memcpy(&vcpu->arch.guest_fpu.state->xsave,
- guest_xsave->region, kvm_xstate_size);
- else {
- if (xstate_bv & ~XSTATE_FPSSE)
- return -EINVAL;
- memcpy(&vcpu->arch.guest_fpu.state->fxsave,
- guest_xsave->region, sizeof(struct kvm_i387_fxsave_struct));
- }
- return 0;
-}
-
-static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
- struct kvm_xcrs *guest_xcrs)
-{
- if (!kvm_cpu_has_xsave) {
- guest_xcrs->nr_xcrs = 0;
- return;
- }
-
- guest_xcrs->nr_xcrs = 1;
- guest_xcrs->flags = 0;
- guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
- guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
-}
-
-static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
- struct kvm_xcrs *guest_xcrs)
-{
- int i, r = 0;
-
- if (!kvm_cpu_has_xsave)
- return -EINVAL;
-
- if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
- return -EINVAL;
-
- for (i = 0; i < guest_xcrs->nr_xcrs; i++)
- /* Only support XCR0 currently */
- if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) {
- r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
- guest_xcrs->xcrs[0].value);
- break;
- }
- if (r)
- r = -EINVAL;
- return r;
-}
-
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
struct kvm_vcpu *vcpu = filp->private_data;
void *argp = (void *)arg;
int r;
- union {
- struct kvm_lapic_state *lapic;
- struct kvm_xsave *xsave;
- struct kvm_xcrs *xcrs;
- void *buffer;
- } u;
+ struct kvm_lapic_state *lapic = NULL;
- u.buffer = NULL;
switch (ioctl) {
case KVM_GET_LAPIC: {
r = -EINVAL;
if (!vcpu->arch.apic)
goto out;
- u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+ lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
r = -ENOMEM;
- if (!u.lapic)
+ if (!lapic)
goto out;
- r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
+ r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
if (r)
goto out;
r = -EFAULT;
- if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
+ if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
goto out;
r = 0;
break;
@@ -2844,14 +2233,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EINVAL;
if (!vcpu->arch.apic)
goto out;
- u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+ lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
r = -ENOMEM;
- if (!u.lapic)
+ if (!lapic)
goto out;
r = -EFAULT;
- if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state)))
+ if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
goto out;
- r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
+ r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
if (r)
goto out;
r = 0;
@@ -2991,90 +2380,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
break;
}
- case KVM_GET_DEBUGREGS: {
- struct kvm_debugregs dbgregs;
-
- kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
-
- r = -EFAULT;
- if (copy_to_user(argp, &dbgregs,
- sizeof(struct kvm_debugregs)))
- break;
- r = 0;
- break;
- }
- case KVM_SET_DEBUGREGS: {
- struct kvm_debugregs dbgregs;
-
- r = -EFAULT;
- if (copy_from_user(&dbgregs, argp,
- sizeof(struct kvm_debugregs)))
- break;
-
- r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
- break;
- }
- case KVM_GET_XSAVE: {
- u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
- r = -ENOMEM;
- if (!u.xsave)
- break;
-
- kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
-
- r = -EFAULT;
- if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
- break;
- r = 0;
- break;
- }
- case KVM_SET_XSAVE: {
- u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
- r = -ENOMEM;
- if (!u.xsave)
- break;
-
- r = -EFAULT;
- if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave)))
- break;
-
- r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
- break;
- }
- case KVM_GET_XCRS: {
- u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
- r = -ENOMEM;
- if (!u.xcrs)
- break;
-
- kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
-
- r = -EFAULT;
- if (copy_to_user(argp, u.xcrs,
- sizeof(struct kvm_xcrs)))
- break;
- r = 0;
- break;
- }
- case KVM_SET_XCRS: {
- u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
- r = -ENOMEM;
- if (!u.xcrs)
- break;
-
- r = -EFAULT;
- if (copy_from_user(u.xcrs, argp,
- sizeof(struct kvm_xcrs)))
- break;
-
- r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
- break;
- }
default:
r = -EINVAL;
}
out:
- kfree(u.buffer);
+ kfree(lapic);
return r;
}
@@ -3114,7 +2424,116 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
{
- return kvm->arch.n_max_mmu_pages;
+ return kvm->arch.n_alloc_mmu_pages;
+}
+
+gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
+{
+ int i;
+ struct kvm_mem_alias *alias;
+ struct kvm_mem_aliases *aliases;
+
+ aliases = rcu_dereference(kvm->arch.aliases);
+
+ for (i = 0; i < aliases->naliases; ++i) {
+ alias = &aliases->aliases[i];
+ if (alias->flags & KVM_ALIAS_INVALID)
+ continue;
+ if (gfn >= alias->base_gfn
+ && gfn < alias->base_gfn + alias->npages)
+ return alias->target_gfn + gfn - alias->base_gfn;
+ }
+ return gfn;
+}
+
+gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
+{
+ int i;
+ struct kvm_mem_alias *alias;
+ struct kvm_mem_aliases *aliases;
+
+ aliases = rcu_dereference(kvm->arch.aliases);
+
+ for (i = 0; i < aliases->naliases; ++i) {
+ alias = &aliases->aliases[i];
+ if (gfn >= alias->base_gfn
+ && gfn < alias->base_gfn + alias->npages)
+ return alias->target_gfn + gfn - alias->base_gfn;
+ }
+ return gfn;
+}
+
+/*
+ * Set a new alias region. Aliases map a portion of physical memory into
+ * another portion. This is useful for memory windows, for example the PC
+ * VGA region.
+ */
+static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
+ struct kvm_memory_alias *alias)
+{
+ int r, n;
+ struct kvm_mem_alias *p;
+ struct kvm_mem_aliases *aliases, *old_aliases;
+
+ r = -EINVAL;
+ /* General sanity checks */
+ if (alias->memory_size & (PAGE_SIZE - 1))
+ goto out;
+ if (alias->guest_phys_addr & (PAGE_SIZE - 1))
+ goto out;
+ if (alias->slot >= KVM_ALIAS_SLOTS)
+ goto out;
+ if (alias->guest_phys_addr + alias->memory_size
+ < alias->guest_phys_addr)
+ goto out;
+ if (alias->target_phys_addr + alias->memory_size
+ < alias->target_phys_addr)
+ goto out;
+
+ r = -ENOMEM;
+ aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
+ if (!aliases)
+ goto out;
+
+ mutex_lock(&kvm->slots_lock);
+
+ /* invalidate any gfn reference in case of deletion/shrinking */
+ memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
+ aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID;
+ old_aliases = kvm->arch.aliases;
+ rcu_assign_pointer(kvm->arch.aliases, aliases);
+ kvm_synchronize_srcu_expedited(&kvm->srcu);
+ kvm_mmu_zap_all(kvm);
+ kfree(old_aliases);
+
+ r = -ENOMEM;
+ aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
+ if (!aliases)
+ goto out_unlock;
+
+ memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
+
+ p = &aliases->aliases[alias->slot];
+ p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
+ p->npages = alias->memory_size >> PAGE_SHIFT;
+ p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
+ p->flags &= ~(KVM_ALIAS_INVALID);
+
+ for (n = KVM_ALIAS_SLOTS; n > 0; --n)
+ if (aliases->aliases[n - 1].npages)
+ break;
+ aliases->naliases = n;
+
+ old_aliases = kvm->arch.aliases;
+ rcu_assign_pointer(kvm->arch.aliases, aliases);
+ kvm_synchronize_srcu_expedited(&kvm->srcu);
+ kfree(old_aliases);
+ r = 0;
+
+out_unlock:
+ mutex_unlock(&kvm->slots_lock);
+out:
+ return r;
}
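A hedged userspace sketch of the KVM_SET_MEMORY_ALIAS ioctl this function implements (not part of the patch), assuming a VM fd from KVM_CREATE_VM; the slot number and both physical addresses are made up for illustration.

/* Illustrative only: alias the legacy VGA window onto existing guest RAM. */
#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

static int set_vga_alias(int vm_fd)
{
    struct kvm_memory_alias alias;

    memset(&alias, 0, sizeof(alias));
    alias.slot = 0;                      /* alias slot, bounded by KVM_ALIAS_SLOTS */
    alias.guest_phys_addr = 0xa0000;     /* where the alias appears */
    alias.memory_size = 0x20000;         /* 128K, must be page aligned */
    alias.target_phys_addr = 0x100000;   /* already-registered backing region */

    return ioctl(vm_fd, KVM_SET_MEMORY_ALIAS, &alias);
}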
static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
@@ -3150,18 +2569,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
r = 0;
switch (chip->chip_id) {
case KVM_IRQCHIP_PIC_MASTER:
- spin_lock(&pic_irqchip(kvm)->lock);
+ raw_spin_lock(&pic_irqchip(kvm)->lock);
memcpy(&pic_irqchip(kvm)->pics[0],
&chip->chip.pic,
sizeof(struct kvm_pic_state));
- spin_unlock(&pic_irqchip(kvm)->lock);
+ raw_spin_unlock(&pic_irqchip(kvm)->lock);
break;
case KVM_IRQCHIP_PIC_SLAVE:
- spin_lock(&pic_irqchip(kvm)->lock);
+ raw_spin_lock(&pic_irqchip(kvm)->lock);
memcpy(&pic_irqchip(kvm)->pics[1],
&chip->chip.pic,
sizeof(struct kvm_pic_state));
- spin_unlock(&pic_irqchip(kvm)->lock);
+ raw_spin_unlock(&pic_irqchip(kvm)->lock);
break;
case KVM_IRQCHIP_IOAPIC:
r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
@@ -3204,7 +2623,6 @@ static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
sizeof(ps->channels));
ps->flags = kvm->arch.vpit->pit_state.flags;
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- memset(&ps->reserved, 0, sizeof(ps->reserved));
return r;
}
@@ -3246,6 +2664,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_memory_slot *memslot;
unsigned long n;
unsigned long is_dirty = 0;
+ unsigned long *dirty_bitmap = NULL;
mutex_lock(&kvm->slots_lock);
@@ -3260,47 +2679,42 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
n = kvm_dirty_bitmap_bytes(memslot);
+ r = -ENOMEM;
+ dirty_bitmap = vmalloc(n);
+ if (!dirty_bitmap)
+ goto out;
+ memset(dirty_bitmap, 0, n);
+
for (i = 0; !is_dirty && i < n/sizeof(long); i++)
is_dirty = memslot->dirty_bitmap[i];
/* If nothing is dirty, don't bother messing with page tables. */
if (is_dirty) {
struct kvm_memslots *slots, *old_slots;
- unsigned long *dirty_bitmap;
- dirty_bitmap = memslot->dirty_bitmap_head;
- if (memslot->dirty_bitmap == dirty_bitmap)
- dirty_bitmap += n / sizeof(long);
- memset(dirty_bitmap, 0, n);
+ spin_lock(&kvm->mmu_lock);
+ kvm_mmu_slot_remove_write_access(kvm, log->slot);
+ spin_unlock(&kvm->mmu_lock);
- r = -ENOMEM;
slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
if (!slots)
- goto out;
+ goto out_free;
+
memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
- slots->generation++;
old_slots = kvm->memslots;
rcu_assign_pointer(kvm->memslots, slots);
kvm_synchronize_srcu_expedited(&kvm->srcu);
dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
kfree(old_slots);
-
- spin_lock(&kvm->mmu_lock);
- kvm_mmu_slot_remove_write_access(kvm, log->slot);
- spin_unlock(&kvm->mmu_lock);
-
- r = -EFAULT;
- if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
- goto out;
- } else {
- r = -EFAULT;
- if (clear_user(log->dirty_bitmap, n))
- goto out;
}
r = 0;
+ if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
+ r = -EFAULT;
+out_free:
+ vfree(dirty_bitmap);
out:
mutex_unlock(&kvm->slots_lock);
return r;
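A hedged userspace sketch of the KVM_GET_DIRTY_LOG ioctl served above (not part of the patch), assuming a VM fd from KVM_CREATE_VM and a slot registered with KVM_MEM_LOG_DIRTY_PAGES; npages must match the registered slot size.

/* Illustrative only: fetch one slot's dirty bitmap (one bit per page). */
#include <linux/kvm.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>

static unsigned long *get_dirty_bitmap(int vm_fd, int slot, size_t npages)
{
    size_t bytes = ((npages + 63) / 64) * 8;   /* rounded up to 64-bit longs */
    unsigned long *bitmap = calloc(1, bytes);
    struct kvm_dirty_log log;

    if (!bitmap)
        return NULL;
    memset(&log, 0, sizeof(log));
    log.slot = slot;
    log.dirty_bitmap = bitmap;

    if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0) {
        free(bitmap);
        return NULL;
    }
    return bitmap;                             /* caller frees */
}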
@@ -3320,6 +2734,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
union {
struct kvm_pit_state ps;
struct kvm_pit_state2 ps2;
+ struct kvm_memory_alias alias;
struct kvm_pit_config pit_config;
} u;
@@ -3340,6 +2755,22 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out;
break;
}
+ case KVM_SET_MEMORY_REGION: {
+ struct kvm_memory_region kvm_mem;
+ struct kvm_userspace_memory_region kvm_userspace_mem;
+
+ r = -EFAULT;
+ if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
+ goto out;
+ kvm_userspace_mem.slot = kvm_mem.slot;
+ kvm_userspace_mem.flags = kvm_mem.flags;
+ kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
+ kvm_userspace_mem.memory_size = kvm_mem.memory_size;
+ r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
+ if (r)
+ goto out;
+ break;
+ }
case KVM_SET_NR_MMU_PAGES:
r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
if (r)
@@ -3348,6 +2779,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
case KVM_GET_NR_MMU_PAGES:
r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
break;
+ case KVM_SET_MEMORY_ALIAS:
+ r = -EFAULT;
+ if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
+ goto out;
+ r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
+ if (r)
+ goto out;
+ break;
case KVM_CREATE_IRQCHIP: {
struct kvm_pic *vpic;
@@ -3360,10 +2799,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (vpic) {
r = kvm_ioapic_init(kvm);
if (r) {
- mutex_lock(&kvm->slots_lock);
kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
&vpic->dev);
- mutex_unlock(&kvm->slots_lock);
kfree(vpic);
goto create_irqchip_unlock;
}
@@ -3374,12 +2811,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
smp_wmb();
r = kvm_setup_default_irq_routing(kvm);
if (r) {
- mutex_lock(&kvm->slots_lock);
mutex_lock(&kvm->irq_lock);
kvm_ioapic_destroy(kvm);
kvm_destroy_pic(kvm);
mutex_unlock(&kvm->irq_lock);
- mutex_unlock(&kvm->slots_lock);
}
create_irqchip_unlock:
mutex_unlock(&kvm->lock);
@@ -3412,13 +2847,11 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = -EFAULT;
if (copy_from_user(&irq_event, argp, sizeof irq_event))
goto out;
- r = -ENXIO;
if (irqchip_in_kernel(kvm)) {
__s32 status;
status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
irq_event.irq, irq_event.level);
if (ioctl == KVM_IRQ_LINE_STATUS) {
- r = -EFAULT;
irq_event.status = status;
if (copy_to_user(argp, &irq_event,
sizeof irq_event))
@@ -3555,6 +2988,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
break;
}
case KVM_SET_CLOCK: {
+ struct timespec now;
struct kvm_clock_data user_ns;
u64 now_ns;
s64 delta;
@@ -3568,23 +3002,21 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out;
r = 0;
- local_irq_disable();
- now_ns = get_kernel_ns();
+ ktime_get_ts(&now);
+ now_ns = timespec_to_ns(&now);
delta = user_ns.clock - now_ns;
- local_irq_enable();
kvm->arch.kvmclock_offset = delta;
break;
}
case KVM_GET_CLOCK: {
+ struct timespec now;
struct kvm_clock_data user_ns;
u64 now_ns;
- local_irq_disable();
- now_ns = get_kernel_ns();
+ ktime_get_ts(&now);
+ now_ns = timespec_to_ns(&now);
user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
- local_irq_enable();
user_ns.flags = 0;
- memset(&user_ns.pad, 0, sizeof(user_ns.pad));
r = -EFAULT;
if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
@@ -3635,86 +3067,52 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
}
-static void kvm_set_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- kvm_x86_ops->set_segment(vcpu, var, seg);
-}
-
-void kvm_get_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- kvm_x86_ops->get_segment(vcpu, var, seg);
-}
-
-static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
-{
- return gpa;
-}
-
-static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
-{
- gpa_t t_gpa;
- struct x86_exception exception;
-
- BUG_ON(!mmu_is_nested(vcpu));
-
- /* NPT walks are always user-walks */
- access |= PFERR_USER_MASK;
- t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
-
- return t_gpa;
-}
-
-gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
- struct x86_exception *exception)
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+ return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
}
- gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
- struct x86_exception *exception)
+ gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_FETCH_MASK;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+ return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
}
-gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
- struct x86_exception *exception)
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_WRITE_MASK;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+ return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
}
/* uses this to access any guest's mapped memory without checking CPL */
-gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
- struct x86_exception *exception)
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
{
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
+ return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error);
}
static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
struct kvm_vcpu *vcpu, u32 access,
- struct x86_exception *exception)
+ u32 *error)
{
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
- gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
- exception);
+ gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
- if (gpa == UNMAPPED_GVA)
- return X86EMUL_PROPAGATE_FAULT;
+ if (gpa == UNMAPPED_GVA) {
+ r = X86EMUL_PROPAGATE_FAULT;
+ goto out;
+ }
ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
if (ret < 0) {
- r = X86EMUL_IO_NEEDED;
+ r = X86EMUL_UNHANDLEABLE;
goto out;
}
@@ -3728,52 +3126,46 @@ out:
/* used for instruction fetching */
static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu,
- struct x86_exception *exception)
+ struct kvm_vcpu *vcpu, u32 *error)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
- access | PFERR_FETCH_MASK,
- exception);
+ access | PFERR_FETCH_MASK, error);
}
static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu,
- struct x86_exception *exception)
+ struct kvm_vcpu *vcpu, u32 *error)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
- exception);
+ error);
}
static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu,
- struct x86_exception *exception)
+ struct kvm_vcpu *vcpu, u32 *error)
{
- return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
}
-static int kvm_write_guest_virt_system(gva_t addr, void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu,
- struct x86_exception *exception)
+static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, u32 *error)
{
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
- gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
- PFERR_WRITE_MASK,
- exception);
+ gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
- if (gpa == UNMAPPED_GVA)
- return X86EMUL_PROPAGATE_FAULT;
+ if (gpa == UNMAPPED_GVA) {
+ r = X86EMUL_PROPAGATE_FAULT;
+ goto out;
+ }
ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
if (ret < 0) {
- r = X86EMUL_IO_NEEDED;
+ r = X86EMUL_UNHANDLEABLE;
goto out;
}
@@ -3785,13 +3177,14 @@ out:
return r;
}
+
static int emulator_read_emulated(unsigned long addr,
void *val,
unsigned int bytes,
- struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
+ u32 error_code;
if (vcpu->mmio_read_completed) {
memcpy(val, vcpu->mmio_data, bytes);
@@ -3801,17 +3194,19 @@ static int emulator_read_emulated(unsigned long addr,
return X86EMUL_CONTINUE;
}
- gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception);
+ gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code);
- if (gpa == UNMAPPED_GVA)
+ if (gpa == UNMAPPED_GVA) {
+ kvm_inject_page_fault(vcpu, addr, error_code);
return X86EMUL_PROPAGATE_FAULT;
+ }
/* For APIC access vmexit */
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
goto mmio;
- if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception)
- == X86EMUL_CONTINUE)
+ if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL)
+ == X86EMUL_CONTINUE)
return X86EMUL_CONTINUE;
mmio:
@@ -3826,16 +3221,15 @@ mmio:
trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
vcpu->mmio_needed = 1;
- vcpu->run->exit_reason = KVM_EXIT_MMIO;
- vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
- vcpu->run->mmio.len = vcpu->mmio_size = bytes;
- vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
+ vcpu->mmio_phys_addr = gpa;
+ vcpu->mmio_size = bytes;
+ vcpu->mmio_is_write = 0;
- return X86EMUL_IO_NEEDED;
+ return X86EMUL_UNHANDLEABLE;
}
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
- const void *val, int bytes)
+ const void *val, int bytes)
{
int ret;
@@ -3849,15 +3243,17 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
static int emulator_write_emulated_onepage(unsigned long addr,
const void *val,
unsigned int bytes,
- struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
+ u32 error_code;
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);
- if (gpa == UNMAPPED_GVA)
+ if (gpa == UNMAPPED_GVA) {
+ kvm_inject_page_fault(vcpu, addr, error_code);
return X86EMUL_PROPAGATE_FAULT;
+ }
/* For APIC access vmexit */
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3875,185 +3271,72 @@ mmio:
return X86EMUL_CONTINUE;
vcpu->mmio_needed = 1;
- vcpu->run->exit_reason = KVM_EXIT_MMIO;
- vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
- vcpu->run->mmio.len = vcpu->mmio_size = bytes;
- vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
- memcpy(vcpu->run->mmio.data, val, bytes);
+ vcpu->mmio_phys_addr = gpa;
+ vcpu->mmio_size = bytes;
+ vcpu->mmio_is_write = 1;
+ memcpy(vcpu->mmio_data, val, bytes);
return X86EMUL_CONTINUE;
}
int emulator_write_emulated(unsigned long addr,
- const void *val,
- unsigned int bytes,
- struct x86_exception *exception,
- struct kvm_vcpu *vcpu)
+ const void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu)
{
/* Crossing a page boundary? */
if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
int rc, now;
now = -addr & ~PAGE_MASK;
- rc = emulator_write_emulated_onepage(addr, val, now, exception,
- vcpu);
+ rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
if (rc != X86EMUL_CONTINUE)
return rc;
addr += now;
val += now;
bytes -= now;
}
- return emulator_write_emulated_onepage(addr, val, bytes, exception,
- vcpu);
+ return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
}
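The split arithmetic above is terse; a standalone illustration (not part of the patch) of how -addr & ~PAGE_MASK yields the bytes remaining on the first page:

/* Illustrative only: the page-split arithmetic used in emulator_write_emulated(). */
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
    unsigned long addr = 0x1ffd;         /* an 8-byte write crossing into 0x2000 */
    unsigned long now = -addr & ~PAGE_MASK;

    /* prints 3: bytes 0x1ffd..0x1fff go first, the remaining 5 start at 0x2000 */
    printf("first chunk: %lu bytes\n", now);
    return 0;
}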
-
-#define CMPXCHG_TYPE(t, ptr, old, new) \
- (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
-
-#ifdef CONFIG_X86_64
-# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
-#else
-# define CMPXCHG64(ptr, old, new) \
- (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
-#endif
+EXPORT_SYMBOL_GPL(emulator_write_emulated);
static int emulator_cmpxchg_emulated(unsigned long addr,
const void *old,
const void *new,
unsigned int bytes,
- struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
- gpa_t gpa;
- struct page *page;
- char *kaddr;
- bool exchanged;
-
- /* guests cmpxchg8b have to be emulated atomically */
- if (bytes > 8 || (bytes & (bytes - 1)))
- goto emul_write;
-
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
-
- if (gpa == UNMAPPED_GVA ||
- (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
- goto emul_write;
-
- if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
- goto emul_write;
-
- page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- if (is_error_page(page)) {
- kvm_release_page_clean(page);
- goto emul_write;
- }
-
- kaddr = kmap_atomic(page, KM_USER0);
- kaddr += offset_in_page(gpa);
- switch (bytes) {
- case 1:
- exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
- break;
- case 2:
- exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
- break;
- case 4:
- exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
- break;
- case 8:
- exchanged = CMPXCHG64(kaddr, old, new);
- break;
- default:
- BUG();
- }
- kunmap_atomic(kaddr, KM_USER0);
- kvm_release_page_dirty(page);
-
- if (!exchanged)
- return X86EMUL_CMPXCHG_FAILED;
-
- kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1);
-
- return X86EMUL_CONTINUE;
-
-emul_write:
printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
+#ifndef CONFIG_X86_64
+ /* guests cmpxchg8b have to be emulated atomically */
+ if (bytes == 8) {
+ gpa_t gpa;
+ struct page *page;
+ char *kaddr;
+ u64 val;
- return emulator_write_emulated(addr, new, bytes, exception, vcpu);
-}
-
-static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
-{
- /* TODO: String I/O for in kernel device */
- int r;
-
- if (vcpu->arch.pio.in)
- r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
- vcpu->arch.pio.size, pd);
- else
- r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
- vcpu->arch.pio.port, vcpu->arch.pio.size,
- pd);
- return r;
-}
-
-
-static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
- unsigned int count, struct kvm_vcpu *vcpu)
-{
- if (vcpu->arch.pio.count)
- goto data_avail;
-
- trace_kvm_pio(0, port, size, count);
-
- vcpu->arch.pio.port = port;
- vcpu->arch.pio.in = 1;
- vcpu->arch.pio.count = count;
- vcpu->arch.pio.size = size;
-
- if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
- data_avail:
- memcpy(val, vcpu->arch.pio_data, size * count);
- vcpu->arch.pio.count = 0;
- return 1;
- }
-
- vcpu->run->exit_reason = KVM_EXIT_IO;
- vcpu->run->io.direction = KVM_EXIT_IO_IN;
- vcpu->run->io.size = size;
- vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
- vcpu->run->io.count = count;
- vcpu->run->io.port = port;
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
- return 0;
-}
+ if (gpa == UNMAPPED_GVA ||
+ (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ goto emul_write;
-static int emulator_pio_out_emulated(int size, unsigned short port,
- const void *val, unsigned int count,
- struct kvm_vcpu *vcpu)
-{
- trace_kvm_pio(1, port, size, count);
+ if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
+ goto emul_write;
- vcpu->arch.pio.port = port;
- vcpu->arch.pio.in = 0;
- vcpu->arch.pio.count = count;
- vcpu->arch.pio.size = size;
+ val = *(u64 *)new;
- memcpy(vcpu->arch.pio_data, val, size * count);
+ page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
- vcpu->arch.pio.count = 0;
- return 1;
+ kaddr = kmap_atomic(page, KM_USER0);
+ set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
+ kunmap_atomic(kaddr, KM_USER0);
+ kvm_release_page_dirty(page);
}
+emul_write:
+#endif
- vcpu->run->exit_reason = KVM_EXIT_IO;
- vcpu->run->io.direction = KVM_EXIT_IO_OUT;
- vcpu->run->io.size = size;
- vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
- vcpu->run->io.count = count;
- vcpu->run->io.port = port;
-
- return 0;
+ return emulator_write_emulated(addr, new, bytes, vcpu);
}
static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
@@ -4067,25 +3350,6 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
return X86EMUL_CONTINUE;
}
-int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
-{
- if (!need_emulate_wbinvd(vcpu))
- return X86EMUL_CONTINUE;
-
- if (kvm_x86_ops->has_wbinvd_exit()) {
- int cpu = get_cpu();
-
- cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
- smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
- wbinvd_ipi, NULL, 1);
- put_cpu();
- cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
- } else
- wbinvd();
- return X86EMUL_CONTINUE;
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
-
int emulate_clts(struct kvm_vcpu *vcpu)
{
kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
@@ -4093,194 +3357,42 @@ int emulate_clts(struct kvm_vcpu *vcpu)
return X86EMUL_CONTINUE;
}
-int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu)
-{
- return _kvm_get_dr(vcpu, dr, dest);
-}
-
-int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu)
-{
-
- return __kvm_set_dr(vcpu, dr, value);
-}
-
-static u64 mk_cr_64(u64 curr_cr, u32 new_val)
-{
- return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
-}
-
-static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
-{
- unsigned long value;
-
- switch (cr) {
- case 0:
- value = kvm_read_cr0(vcpu);
- break;
- case 2:
- value = vcpu->arch.cr2;
- break;
- case 3:
- value = kvm_read_cr3(vcpu);
- break;
- case 4:
- value = kvm_read_cr4(vcpu);
- break;
- case 8:
- value = kvm_get_cr8(vcpu);
- break;
- default:
- vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
- return 0;
- }
-
- return value;
-}
-
-static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
-{
- int res = 0;
-
- switch (cr) {
- case 0:
- res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
- break;
- case 2:
- vcpu->arch.cr2 = val;
- break;
- case 3:
- res = kvm_set_cr3(vcpu, val);
- break;
- case 4:
- res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
- break;
- case 8:
- res = kvm_set_cr8(vcpu, val);
- break;
- default:
- vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
- res = -1;
- }
-
- return res;
-}
-
-static int emulator_get_cpl(struct kvm_vcpu *vcpu)
-{
- return kvm_x86_ops->get_cpl(vcpu);
-}
-
-static void emulator_get_gdt(struct kvm_desc_ptr *dt, struct kvm_vcpu *vcpu)
-{
- kvm_x86_ops->get_gdt(vcpu, dt);
-}
-
-static void emulator_get_idt(struct kvm_desc_ptr *dt, struct kvm_vcpu *vcpu)
-{
- kvm_x86_ops->get_idt(vcpu, dt);
-}
-
-static unsigned long emulator_get_cached_segment_base(int seg,
- struct kvm_vcpu *vcpu)
+int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
{
- return get_segment_base(vcpu, seg);
+ return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest);
}
-static bool emulator_get_cached_descriptor(struct kvm_desc_struct *desc, int seg,
- struct kvm_vcpu *vcpu)
+int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
{
- struct kvm_segment var;
-
- kvm_get_segment(vcpu, &var, seg);
-
- if (var.unusable)
- return false;
+ unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
- if (var.g)
- var.limit >>= 12;
- kvm_set_desc_limit(desc, var.limit);
- kvm_set_desc_base(desc, (unsigned long)var.base);
- desc->type = var.type;
- desc->s = var.s;
- desc->dpl = var.dpl;
- desc->p = var.present;
- desc->avl = var.avl;
- desc->l = var.l;
- desc->d = var.db;
- desc->g = var.g;
-
- return true;
+ return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask);
}
-static void emulator_set_cached_descriptor(struct kvm_desc_struct *desc, int seg,
- struct kvm_vcpu *vcpu)
+void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
{
- struct kvm_segment var;
-
- /* needed to preserve selector */
- kvm_get_segment(vcpu, &var, seg);
-
- var.base = kvm_get_desc_base(desc);
- var.limit = kvm_get_desc_limit(desc);
- if (desc->g)
- var.limit = (var.limit << 12) | 0xfff;
- var.type = desc->type;
- var.present = desc->p;
- var.dpl = desc->dpl;
- var.db = desc->d;
- var.s = desc->s;
- var.l = desc->l;
- var.g = desc->g;
- var.avl = desc->avl;
- var.present = desc->p;
- var.unusable = !var.present;
- var.padding = 0;
-
- kvm_set_segment(vcpu, &var, seg);
- return;
-}
+ u8 opcodes[4];
+ unsigned long rip = kvm_rip_read(vcpu);
+ unsigned long rip_linear;
-static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu)
-{
- struct kvm_segment kvm_seg;
+ if (!printk_ratelimit())
+ return;
- kvm_get_segment(vcpu, &kvm_seg, seg);
- return kvm_seg.selector;
-}
+ rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
-static void emulator_set_segment_selector(u16 sel, int seg,
- struct kvm_vcpu *vcpu)
-{
- struct kvm_segment kvm_seg;
+ kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
- kvm_get_segment(vcpu, &kvm_seg, seg);
- kvm_seg.selector = sel;
- kvm_set_segment(vcpu, &kvm_seg, seg);
+ printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
+ context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
}
+EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
static struct x86_emulate_ops emulate_ops = {
.read_std = kvm_read_guest_virt_system,
- .write_std = kvm_write_guest_virt_system,
.fetch = kvm_fetch_guest_virt,
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
.cmpxchg_emulated = emulator_cmpxchg_emulated,
- .pio_in_emulated = emulator_pio_in_emulated,
- .pio_out_emulated = emulator_pio_out_emulated,
- .get_cached_descriptor = emulator_get_cached_descriptor,
- .set_cached_descriptor = emulator_set_cached_descriptor,
- .get_segment_selector = emulator_get_segment_selector,
- .set_segment_selector = emulator_set_segment_selector,
- .get_cached_segment_base = emulator_get_cached_segment_base,
- .get_gdt = emulator_get_gdt,
- .get_idt = emulator_get_idt,
- .get_cr = emulator_get_cr,
- .set_cr = emulator_set_cr,
- .cpl = emulator_get_cpl,
- .get_dr = emulator_get_dr,
- .set_dr = emulator_set_dr,
- .set_msr = kvm_set_msr,
- .get_msr = kvm_get_msr,
};
static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -4291,134 +3403,14 @@ static void cache_all_regs(struct kvm_vcpu *vcpu)
vcpu->arch.regs_dirty = ~0;
}
-static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
+int emulate_instruction(struct kvm_vcpu *vcpu,
+ unsigned long cr2,
+ u16 error_code,
+ int emulation_type)
{
- u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
- /*
- * an sti; sti; sequence only disables interrupts for the first
- * instruction. So, if the last instruction, be it emulated or
- * not, left the system with the INT_STI flag enabled, it
- * means that the last instruction is an sti. We should not
- * leave the flag on in this case. The same goes for mov ss
- */
- if (!(int_shadow & mask))
- kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
-}
-
-static void inject_emulated_exception(struct kvm_vcpu *vcpu)
-{
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
- if (ctxt->exception.vector == PF_VECTOR)
- kvm_propagate_fault(vcpu, &ctxt->exception);
- else if (ctxt->exception.error_code_valid)
- kvm_queue_exception_e(vcpu, ctxt->exception.vector,
- ctxt->exception.error_code);
- else
- kvm_queue_exception(vcpu, ctxt->exception.vector);
-}
-
-static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
-{
- struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
- int cs_db, cs_l;
-
- cache_all_regs(vcpu);
-
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
-
- vcpu->arch.emulate_ctxt.vcpu = vcpu;
- vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
- vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
- vcpu->arch.emulate_ctxt.mode =
- (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
- (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
- ? X86EMUL_MODE_VM86 : cs_l
- ? X86EMUL_MODE_PROT64 : cs_db
- ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
- memset(c, 0, sizeof(struct decode_cache));
- memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
-}
-
-int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq)
-{
- struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
- int ret;
-
- init_emulate_ctxt(vcpu);
-
- vcpu->arch.emulate_ctxt.decode.op_bytes = 2;
- vcpu->arch.emulate_ctxt.decode.ad_bytes = 2;
- vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip;
- ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq);
-
- if (ret != X86EMUL_CONTINUE)
- return EMULATE_FAIL;
-
- vcpu->arch.emulate_ctxt.eip = c->eip;
- memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
- kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
- kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
-
- if (irq == NMI_VECTOR)
- vcpu->arch.nmi_pending = false;
- else
- vcpu->arch.interrupt.pending = false;
-
- return EMULATE_DONE;
-}
-EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
-
-static int handle_emulation_failure(struct kvm_vcpu *vcpu)
-{
- int r = EMULATE_DONE;
-
- ++vcpu->stat.insn_emulation_fail;
- trace_kvm_emulate_insn_failed(vcpu);
- if (!is_guest_mode(vcpu)) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
- r = EMULATE_FAIL;
- }
- kvm_queue_exception(vcpu, UD_VECTOR);
-
- return r;
-}
-
-static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
-{
- gpa_t gpa;
-
- if (tdp_enabled)
- return false;
-
- /*
- * if emulation was due to access to a shadowed page table
- * and it failed, try to unshadow the page and re-enter the
- * guest to let the CPU execute the instruction.
- */
- if (kvm_mmu_unprotect_page_virt(vcpu, gva))
- return true;
-
- gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
-
- if (gpa == UNMAPPED_GVA)
- return true; /* let cpu generate fault */
-
- if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
- return true;
-
- return false;
-}
-
-int x86_emulate_instruction(struct kvm_vcpu *vcpu,
- unsigned long cr2,
- int emulation_type,
- void *insn,
- int insn_len)
-{
- int r;
- struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+ int r, shadow_mask;
+ struct decode_cache *c;
+ struct kvm_run *run = vcpu->run;
kvm_clear_exception_queue(vcpu);
vcpu->arch.mmio_fault_cr2 = cr2;
@@ -4430,20 +3422,27 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
*/
cache_all_regs(vcpu);
+ vcpu->mmio_is_write = 0;
+ vcpu->arch.pio.string = 0;
+
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
- init_emulate_ctxt(vcpu);
- vcpu->arch.emulate_ctxt.interruptibility = 0;
- vcpu->arch.emulate_ctxt.have_exception = false;
- vcpu->arch.emulate_ctxt.perm_ok = false;
+ int cs_db, cs_l;
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
- r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
- if (r == X86EMUL_PROPAGATE_FAULT)
- goto done;
+ vcpu->arch.emulate_ctxt.vcpu = vcpu;
+ vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
+ vcpu->arch.emulate_ctxt.mode =
+ (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
+ (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
+ ? X86EMUL_MODE_VM86 : cs_l
+ ? X86EMUL_MODE_PROT64 : cs_db
+ ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
- trace_kvm_emulate_insn_start(vcpu);
+ r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
/* Only allow emulation of specific instructions on #UD
* (namely VMMCALL, sysenter, sysexit, syscall)*/
+ c = &vcpu->arch.emulate_ctxt.decode;
if (emulation_type & EMULTYPE_TRAP_UD) {
if (!c->twobyte)
return EMULATE_FAIL;
@@ -4471,11 +3470,10 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
++vcpu->stat.insn_emulation;
if (r) {
- if (reexecute_instruction(vcpu, cr2))
+ ++vcpu->stat.insn_emulation_fail;
+ if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
return EMULATE_DONE;
- if (emulation_type & EMULTYPE_SKIP)
- return EMULATE_FAIL;
- return handle_emulation_failure(vcpu);
+ return EMULATE_FAIL;
}
}
@@ -4484,74 +3482,245 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
return EMULATE_DONE;
}
- /* this is needed for vmware backdoor interface to work since it
- changes register values during IO operation */
- memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+ r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+ shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
+
+ if (r == 0)
+ kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
+
+ if (vcpu->arch.pio.string)
+ return EMULATE_DO_MMIO;
-restart:
- r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);
+ if ((r || vcpu->mmio_is_write) && run) {
+ run->exit_reason = KVM_EXIT_MMIO;
+ run->mmio.phys_addr = vcpu->mmio_phys_addr;
+ memcpy(run->mmio.data, vcpu->mmio_data, 8);
+ run->mmio.len = vcpu->mmio_size;
+ run->mmio.is_write = vcpu->mmio_is_write;
+ }
- if (r == EMULATION_FAILED) {
- if (reexecute_instruction(vcpu, cr2))
+ if (r) {
+ if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
return EMULATE_DONE;
+ if (!vcpu->mmio_needed) {
+ kvm_report_emulation_failure(vcpu, "mmio");
+ return EMULATE_FAIL;
+ }
+ return EMULATE_DO_MMIO;
+ }
+
+ kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+
+ if (vcpu->mmio_is_write) {
+ vcpu->mmio_needed = 0;
+ return EMULATE_DO_MMIO;
+ }
+
+ return EMULATE_DONE;
+}
+EXPORT_SYMBOL_GPL(emulate_instruction);
+
+static int pio_copy_data(struct kvm_vcpu *vcpu)
+{
+ void *p = vcpu->arch.pio_data;
+ gva_t q = vcpu->arch.pio.guest_gva;
+ unsigned bytes;
+ int ret;
+ u32 error_code;
- return handle_emulation_failure(vcpu);
- }
-
-done:
- if (vcpu->arch.emulate_ctxt.have_exception) {
- inject_emulated_exception(vcpu);
- r = EMULATE_DONE;
- } else if (vcpu->arch.pio.count) {
- if (!vcpu->arch.pio.in)
- vcpu->arch.pio.count = 0;
- r = EMULATE_DO_MMIO;
- } else if (vcpu->mmio_needed) {
- if (vcpu->mmio_is_write)
- vcpu->mmio_needed = 0;
- r = EMULATE_DO_MMIO;
- } else if (r == EMULATION_RESTART)
- goto restart;
+ bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
+ if (vcpu->arch.pio.in)
+ ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code);
else
- r = EMULATE_DONE;
+ ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code);
+
+ if (ret == X86EMUL_PROPAGATE_FAULT)
+ kvm_inject_page_fault(vcpu, q, error_code);
+
+ return ret;
+}
+
+int complete_pio(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pio_request *io = &vcpu->arch.pio;
+ long delta;
+ int r;
+ unsigned long val;
- toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
- kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
- kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
+ if (!io->string) {
+ if (io->in) {
+ val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ memcpy(&val, vcpu->arch.pio_data, io->size);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, val);
+ }
+ } else {
+ if (io->in) {
+ r = pio_copy_data(vcpu);
+ if (r)
+ goto out;
+ }
+ delta = 1;
+ if (io->rep) {
+ delta *= io->cur_count;
+ /*
+ * The size of the register should really depend on the
+ * current address size.
+ */
+ val = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ val -= delta;
+ kvm_register_write(vcpu, VCPU_REGS_RCX, val);
+ }
+ if (io->down)
+ delta = -delta;
+ delta *= io->size;
+ if (io->in) {
+ val = kvm_register_read(vcpu, VCPU_REGS_RDI);
+ val += delta;
+ kvm_register_write(vcpu, VCPU_REGS_RDI, val);
+ } else {
+ val = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ val += delta;
+ kvm_register_write(vcpu, VCPU_REGS_RSI, val);
+ }
+ }
+out:
+ io->count -= io->cur_count;
+ io->cur_count = 0;
+
+ return 0;
+}
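
The index-register arithmetic in complete_pio() above is easiest to see with concrete numbers. The following stand-alone sketch, with hypothetical names and not part of the patch, reproduces the same count and delta updates for a rep string access:

#include <stdio.h>

struct pio_state {
	long rcx;
	unsigned long rsi, rdi;
};

static void complete_string_pio(struct pio_state *s, int in, int rep,
				int down, unsigned size, unsigned cur_count)
{
	long delta = 1;

	if (rep) {
		delta *= cur_count;
		s->rcx -= delta;        /* rep prefix consumes the count */
	}
	if (down)
		delta = -delta;         /* EFLAGS.DF set: move backwards */
	delta *= size;
	if (in)
		s->rdi += delta;        /* ins writes through ES:(E)DI */
	else
		s->rsi += delta;        /* outs reads from DS:(E)SI */
}

int main(void)
{
	struct pio_state s = { .rcx = 5, .rsi = 0x1000, .rdi = 0x2000 };

	/* rep outsw, 5 words, DF clear: RSI += 10, RCX -= 5 */
	complete_string_pio(&s, 0, 1, 0, 2, 5);
	printf("rcx=%ld rsi=%#lx rdi=%#lx\n", s.rcx, s.rsi, s.rdi);
	return 0;
}
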
+
+static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
+{
+ /* TODO: String I/O for in-kernel device */
+ int r;
+
+ if (vcpu->arch.pio.in)
+ r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
+ vcpu->arch.pio.size, pd);
+ else
+ r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
+ vcpu->arch.pio.port, vcpu->arch.pio.size,
+ pd);
return r;
}
-EXPORT_SYMBOL_GPL(x86_emulate_instruction);
-int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
+static int pio_string_write(struct kvm_vcpu *vcpu)
{
- unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
- int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu);
- /* do not return to emulator after return from userspace */
- vcpu->arch.pio.count = 0;
- return ret;
+ struct kvm_pio_request *io = &vcpu->arch.pio;
+ void *pd = vcpu->arch.pio_data;
+ int i, r = 0;
+
+ for (i = 0; i < io->cur_count; i++) {
+ if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
+ io->port, io->size, pd)) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ pd += io->size;
+ }
+ return r;
}
-EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
-static void tsc_bad(void *info)
+int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
{
- kvm___this_cpu_write(cpu_tsc_khz, 0);
+ unsigned long val;
+
+ trace_kvm_pio(!in, port, size, 1);
+
+ vcpu->run->exit_reason = KVM_EXIT_IO;
+ vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
+ vcpu->run->io.size = vcpu->arch.pio.size = size;
+ vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+ vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
+ vcpu->run->io.port = vcpu->arch.pio.port = port;
+ vcpu->arch.pio.in = in;
+ vcpu->arch.pio.string = 0;
+ vcpu->arch.pio.down = 0;
+ vcpu->arch.pio.rep = 0;
+
+ if (!vcpu->arch.pio.in) {
+ val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ memcpy(vcpu->arch.pio_data, &val, 4);
+ }
+
+ if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
+ complete_pio(vcpu);
+ return 1;
+ }
+ return 0;
}
+EXPORT_SYMBOL_GPL(kvm_emulate_pio);
-static void tsc_khz_changed(void *data)
+int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
+ int size, unsigned long count, int down,
+ gva_t address, int rep, unsigned port)
{
- struct cpufreq_freqs *freq = data;
- unsigned long khz = 0;
+ unsigned now, in_page;
+ int ret = 0;
+
+ trace_kvm_pio(!in, port, size, count);
- if (data)
- khz = freq->new;
- else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
- khz = cpufreq_quick_get(raw_smp_processor_id());
- if (!khz)
- khz = tsc_khz;
- kvm___this_cpu_write(cpu_tsc_khz, khz);
+ vcpu->run->exit_reason = KVM_EXIT_IO;
+ vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
+ vcpu->run->io.size = vcpu->arch.pio.size = size;
+ vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+ vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
+ vcpu->run->io.port = vcpu->arch.pio.port = port;
+ vcpu->arch.pio.in = in;
+ vcpu->arch.pio.string = 1;
+ vcpu->arch.pio.down = down;
+ vcpu->arch.pio.rep = rep;
+
+ if (!count) {
+ kvm_x86_ops->skip_emulated_instruction(vcpu);
+ return 1;
+ }
+
+ if (!down)
+ in_page = PAGE_SIZE - offset_in_page(address);
+ else
+ in_page = offset_in_page(address) + size;
+ now = min(count, (unsigned long)in_page / size);
+ if (!now)
+ now = 1;
+ if (down) {
+ /*
+ * String I/O in reverse. Yuck. Kill the guest, fix later.
+ */
+ pr_unimpl(vcpu, "guest string pio down\n");
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ vcpu->run->io.count = now;
+ vcpu->arch.pio.cur_count = now;
+
+ if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
+ kvm_x86_ops->skip_emulated_instruction(vcpu);
+
+ vcpu->arch.pio.guest_gva = address;
+
+ if (!vcpu->arch.pio.in) {
+ /* string PIO write */
+ ret = pio_copy_data(vcpu);
+ if (ret == X86EMUL_PROPAGATE_FAULT)
+ return 1;
+ if (ret == 0 && !pio_string_write(vcpu)) {
+ complete_pio(vcpu);
+ if (vcpu->arch.pio.count == 0)
+ ret = 1;
+ }
+ }
+ /* no string PIO read support yet */
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
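
kvm_emulate_pio_string() clamps a rep operation so that a single exit never crosses a page boundary of the guest buffer. A self-contained sketch of that clamp, not part of the patch and using assumed local definitions of PAGE_SIZE and offset_in_page():

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define offset_in_page(addr) ((addr) & (PAGE_SIZE - 1))

static unsigned long clamp_pio_count(unsigned long address, unsigned long count,
				     unsigned size, int down)
{
	unsigned long in_page, now;

	if (!down)
		in_page = PAGE_SIZE - offset_in_page(address); /* bytes left in the page */
	else
		in_page = offset_in_page(address) + size;      /* bytes back to the page start */
	now = count < in_page / size ? count : in_page / size;
	if (!now)
		now = 1;    /* element straddles the boundary: handle one at a time */
	return now;
}

int main(void)
{
	/* 100 dwords requested, buffer ends 8 bytes before a page boundary */
	printf("%lu\n", clamp_pio_count(0x10ff8, 100, 4, 0));  /* prints 2 */
	return 0;
}
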
+
+static void bounce_off(void *info)
+{
+ /* nothing */
}
static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
@@ -4562,60 +3731,21 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
struct kvm_vcpu *vcpu;
int i, send_ipi = 0;
- /*
- * We allow guests to temporarily run on slowing clocks,
- * provided we notify them after, or to run on accelerating
- * clocks, provided we notify them before. Thus time never
- * goes backwards.
- *
- * However, we have a problem. We can't atomically update
- * the frequency of a given CPU from this function; it is
- * merely a notifier, which can be called from any CPU.
- * Changing the TSC frequency at arbitrary points in time
- * requires a recomputation of local variables related to
- * the TSC for each VCPU. We must flag these local variables
- * to be updated and be sure the update takes place with the
- * new frequency before any guests proceed.
- *
- * Unfortunately, the combination of hotplug CPU and frequency
- * change creates an intractable locking scenario; the order
- * of when these callouts happen is undefined with respect to
- * CPU hotplug, and they can race with each other. As such,
- * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
- * undefined; you can actually have a CPU frequency change take
- * place in between the computation of X and the setting of the
- * variable. To protect against this problem, all updates of
- * the per_cpu tsc_khz variable are done in an interrupt
- * protected IPI, and all callers wishing to update the value
- * must wait for a synchronous IPI to complete (which is trivial
- * if the caller is on the CPU already). This establishes the
- * necessary total order on variable updates.
- *
- * Note that because a guest time update may take place
- * anytime after the setting of the VCPU's request bit, the
- * correct TSC value must be set before the request. However,
- * to ensure the update actually makes it to any guest which
- * starts running in hardware virtualization between the set
- * and the acquisition of the spinlock, we must also ping the
- * CPU after setting the request bit.
- *
- */
-
if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
return 0;
if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
return 0;
-
- smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
+ per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu->cpu != freq->cpu)
continue;
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ if (!kvm_request_guest_time_update(vcpu))
+ continue;
if (vcpu->cpu != smp_processor_id())
- send_ipi = 1;
+ send_ipi++;
}
}
spin_unlock(&kvm_lock);
@@ -4633,106 +3763,34 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
* guest context is entered kvmclock will be updated,
* so the guest will not see stale values.
*/
- smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
+ smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
}
return 0;
}
static struct notifier_block kvmclock_cpufreq_notifier_block = {
- .notifier_call = kvmclock_cpufreq_notifier
-};
-
-static int kvmclock_cpu_notifier(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
-
- switch (action) {
- case CPU_ONLINE:
- case CPU_DOWN_FAILED:
- smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
- break;
- case CPU_DOWN_PREPARE:
- smp_call_function_single(cpu, tsc_bad, NULL, 1);
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block kvmclock_cpu_notifier_block = {
- .notifier_call = kvmclock_cpu_notifier,
- .priority = -INT_MAX
+ .notifier_call = kvmclock_cpufreq_notifier
};
static void kvm_timer_init(void)
{
int cpu;
- max_tsc_khz = tsc_khz;
- register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
-#ifdef CONFIG_CPU_FREQ
- struct cpufreq_policy policy;
- memset(&policy, 0, sizeof(policy));
- cpu = get_cpu();
- cpufreq_get_policy(&policy, cpu);
- if (policy.cpuinfo.max_freq)
- max_tsc_khz = policy.cpuinfo.max_freq;
- put_cpu();
-#endif
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
+ for_each_online_cpu(cpu) {
+ unsigned long khz = cpufreq_get(cpu);
+ if (!khz)
+ khz = tsc_khz;
+ per_cpu(cpu_tsc_khz, cpu) = khz;
+ }
+ } else {
+ for_each_possible_cpu(cpu)
+ per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
}
- pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
- for_each_online_cpu(cpu)
- smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
-}
-
-static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
-
-static int kvm_is_in_guest(void)
-{
- return percpu_read(current_vcpu) != NULL;
}
-static int kvm_is_user_mode(void)
-{
- int user_mode = 3;
-
- if (percpu_read(current_vcpu))
- user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu));
-
- return user_mode != 0;
-}
-
-static unsigned long kvm_get_guest_ip(void)
-{
- unsigned long ip = 0;
-
- if (percpu_read(current_vcpu))
- ip = kvm_rip_read(percpu_read(current_vcpu));
-
- return ip;
-}
-
-static struct perf_guest_info_callbacks kvm_guest_cbs = {
- .is_in_guest = kvm_is_in_guest,
- .is_user_mode = kvm_is_user_mode,
- .get_guest_ip = kvm_get_guest_ip,
-};
-
-void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
-{
- percpu_write(current_vcpu, vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
-
-void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
-{
- percpu_write(current_vcpu, NULL);
-}
-EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
-
int kvm_arch_init(void *opaque)
{
int r;
@@ -4751,10 +3809,6 @@ int kvm_arch_init(void *opaque)
}
if (ops->disabled_by_bios()) {
printk(KERN_ERR "kvm: disabled by bios\n");
-#ifndef KVM_TBOOT_ENABLED_WORKS
-
- printk(KERN_ERR "kvm: if TXT is enabled in the BIOS, disable it\n");
-#endif
r = -EOPNOTSUPP;
goto out;
}
@@ -4765,20 +3819,14 @@ int kvm_arch_init(void *opaque)
kvm_init_msr_list();
- kvm_xstate_size_init();
-
kvm_x86_ops = ops;
kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
+ kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0);
kvm_timer_init();
- perf_register_guest_info_callbacks(&kvm_guest_cbs);
-
- if (kvm_cpu_has_xsave)
- host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-
return 0;
out:
@@ -4787,12 +3835,9 @@ out:
void kvm_arch_exit(void)
{
- perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
-
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
- unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
kvm_x86_ops = NULL;
kvm_mmu_module_exit();
}
@@ -4942,23 +3987,88 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
kvm_x86_ops->patch_hypercall(vcpu, instruction);
- return emulator_write_emulated(rip, instruction, 3, NULL, vcpu);
+ return emulator_write_emulated(rip, instruction, 3, vcpu);
+}
+
+static u64 mk_cr_64(u64 curr_cr, u32 new_val)
+{
+ return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
}
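
mk_cr_64() keeps the upper half of the current 64-bit control-register value and substitutes the 32-bit value written by real-mode code. A small stand-alone illustration, not part of the patch:

#include <stdio.h>
#include <stdint.h>

static uint64_t mk_cr_64(uint64_t curr_cr, uint32_t new_val)
{
	/* preserve bits 63:32, replace bits 31:0 */
	return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
}

int main(void)
{
	printf("%#llx\n",
	       (unsigned long long)mk_cr_64(0x1234abcd80050033ULL, 0x80000011));
	/* prints 0x1234abcd80000011 */
	return 0;
}
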
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
{
- struct kvm_desc_ptr dt = { limit, base };
+ struct descriptor_table dt = { limit, base };
kvm_x86_ops->set_gdt(vcpu, &dt);
}
void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
{
- struct kvm_desc_ptr dt = { limit, base };
+ struct descriptor_table dt = { limit, base };
kvm_x86_ops->set_idt(vcpu, &dt);
}
+void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
+ unsigned long *rflags)
+{
+ kvm_lmsw(vcpu, msw);
+ *rflags = kvm_get_rflags(vcpu);
+}
+
+unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
+{
+ unsigned long value;
+
+ switch (cr) {
+ case 0:
+ value = kvm_read_cr0(vcpu);
+ break;
+ case 2:
+ value = vcpu->arch.cr2;
+ break;
+ case 3:
+ value = vcpu->arch.cr3;
+ break;
+ case 4:
+ value = kvm_read_cr4(vcpu);
+ break;
+ case 8:
+ value = kvm_get_cr8(vcpu);
+ break;
+ default:
+ vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+ return 0;
+ }
+
+ return value;
+}
+
+void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
+ unsigned long *rflags)
+{
+ switch (cr) {
+ case 0:
+ kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
+ *rflags = kvm_get_rflags(vcpu);
+ break;
+ case 2:
+ vcpu->arch.cr2 = val;
+ break;
+ case 3:
+ kvm_set_cr3(vcpu, val);
+ break;
+ case 4:
+ kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
+ break;
+ case 8:
+ kvm_set_cr8(vcpu, val & 0xfUL);
+ break;
+ default:
+ vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+ }
+}
+
static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
{
struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
@@ -5022,13 +4132,9 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
- best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
- if (!best || best->eax < 0x80000008)
- goto not_found;
best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
if (best)
return best->eax & 0xff;
-not_found:
return 36;
}
@@ -5109,10 +4215,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
if (!apic || !apic->vapic_addr)
return;
- idx = srcu_read_lock(&vcpu->kvm->srcu);
+ idx = kvm_srcu_read_lock(&vcpu->kvm->srcu);
kvm_release_page_dirty(apic->vapic_page);
mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ kvm_srcu_read_unlock(&vcpu->kvm->srcu, idx);
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -5142,13 +4248,9 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
{
/* try to reinject previous events if any */
if (vcpu->arch.exception.pending) {
- trace_kvm_inj_exception(vcpu->arch.exception.nr,
- vcpu->arch.exception.has_error_code,
- vcpu->arch.exception.error_code);
kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
vcpu->arch.exception.has_error_code,
- vcpu->arch.exception.error_code,
- vcpu->arch.exception.reinject);
+ vcpu->arch.exception.error_code);
return;
}
@@ -5178,84 +4280,44 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
}
}
-static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
-{
- if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
- !vcpu->guest_xcr0_loaded) {
- /* kvm_set_xcr() also depends on this */
- xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
- vcpu->guest_xcr0_loaded = 1;
- }
-}
-
-static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
-{
- if (vcpu->guest_xcr0_loaded) {
- if (vcpu->arch.xcr0 != host_xcr0)
- xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
- vcpu->guest_xcr0_loaded = 0;
- }
-}
-
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
vcpu->run->request_interrupt_window;
- if (vcpu->requests) {
- if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
+ if (vcpu->requests)
+ if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
kvm_mmu_unload(vcpu);
- if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
+
+ r = kvm_mmu_reload(vcpu);
+ if (unlikely(r))
+ goto out;
+
+ if (vcpu->requests) {
+ if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
__kvm_migrate_timers(vcpu);
- if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
- r = kvm_guest_time_update(vcpu);
- if (unlikely(r))
- goto out;
- }
- if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
+ if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
+ kvm_write_guest_time(vcpu);
+ if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
kvm_mmu_sync_roots(vcpu);
- if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
+ if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
kvm_x86_ops->tlb_flush(vcpu);
- if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
+ if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
+ &vcpu->requests)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
r = 0;
goto out;
}
- if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+ if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
r = 0;
goto out;
}
- if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
+ if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) {
vcpu->fpu_active = 0;
kvm_x86_ops->fpu_deactivate(vcpu);
}
- if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
- /* Page is swapped out. Do synthetic halt */
- vcpu->arch.apf.halted = true;
- r = 1;
- goto out;
- }
- }
-
- r = kvm_mmu_reload(vcpu);
- if (unlikely(r))
- goto out;
-
- if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
- inject_pending_event(vcpu);
-
- /* enable NMI/IRQ window open exits if needed */
- if (vcpu->arch.nmi_pending)
- kvm_x86_ops->enable_nmi_window(vcpu);
- else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
- kvm_x86_ops->enable_irq_window(vcpu);
-
- if (kvm_lapic_enabled(vcpu)) {
- update_cr8_intercept(vcpu);
- kvm_lapic_sync_to_vapic(vcpu);
- }
}
preempt_disable();
@@ -5263,25 +4325,34 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_x86_ops->prepare_guest_switch(vcpu);
if (vcpu->fpu_active)
kvm_load_guest_fpu(vcpu);
- kvm_load_guest_xcr0(vcpu);
-
- atomic_set(&vcpu->guest_mode, 1);
- smp_wmb();
local_irq_disable();
- if (!atomic_read(&vcpu->guest_mode) || vcpu->requests
- || need_resched() || signal_pending(current)) {
- atomic_set(&vcpu->guest_mode, 0);
- smp_wmb();
+ clear_bit(KVM_REQ_KICK, &vcpu->requests);
+ smp_mb__after_clear_bit();
+
+ if (vcpu->requests || need_resched() || signal_pending(current)) {
+ set_bit(KVM_REQ_KICK, &vcpu->requests);
local_irq_enable();
preempt_enable();
- kvm_x86_ops->cancel_injection(vcpu);
r = 1;
goto out;
}
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ inject_pending_event(vcpu);
+
+ /* enable NMI/IRQ window open exits if needed */
+ if (vcpu->arch.nmi_pending)
+ kvm_x86_ops->enable_nmi_window(vcpu);
+ else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+ kvm_x86_ops->enable_irq_window(vcpu);
+
+ if (kvm_lapic_enabled(vcpu)) {
+ update_cr8_intercept(vcpu);
+ kvm_lapic_sync_to_vapic(vcpu);
+ }
+
+ kvm_srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
kvm_guest_enter();
@@ -5306,10 +4377,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (hw_breakpoint_active())
hw_breakpoint_restore();
- kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
-
- atomic_set(&vcpu->guest_mode, 0);
- smp_wmb();
+ set_bit(KVM_REQ_KICK, &vcpu->requests);
local_irq_enable();
++vcpu->stat.exits;
@@ -5326,7 +4394,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
preempt_enable();
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ vcpu->srcu_idx = kvm_srcu_read_lock(&vcpu->kvm->srcu);
/*
* Profile KVM exit RIPs:
@@ -5360,26 +4428,24 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
- vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+ vcpu->srcu_idx = kvm_srcu_read_lock(&kvm->srcu);
vapic_enter(vcpu);
r = 1;
while (r > 0) {
- if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
- !vcpu->arch.apf.halted)
+ if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
r = vcpu_enter_guest(vcpu);
else {
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ kvm_srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_block(vcpu);
- vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
- if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
+ vcpu->srcu_idx = kvm_srcu_read_lock(&kvm->srcu);
+ if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
{
switch(vcpu->arch.mp_state) {
case KVM_MP_STATE_HALTED:
vcpu->arch.mp_state =
KVM_MP_STATE_RUNNABLE;
case KVM_MP_STATE_RUNNABLE:
- vcpu->arch.apf.halted = false;
break;
case KVM_MP_STATE_SIPI_RECEIVED:
default:
@@ -5401,22 +4467,20 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
vcpu->run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.request_irq_exits;
}
-
- kvm_check_async_pf_completion(vcpu);
-
if (signal_pending(current)) {
r = -EINTR;
vcpu->run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.signal_exits;
}
if (need_resched()) {
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ kvm_srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_resched(vcpu);
- vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+ vcpu->srcu_idx = kvm_srcu_read_lock(&kvm->srcu);
}
}
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ kvm_srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ post_kvm_run_save(vcpu);
vapic_exit(vcpu);
@@ -5428,8 +4492,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
int r;
sigset_t sigsaved;
- if (!tsk_used_math(current) && kvm_init_fpu(current))
- return -ENOMEM;
+ vcpu_load(vcpu);
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
@@ -5442,23 +4505,29 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
/* re-sync apic's tpr */
- if (!irqchip_in_kernel(vcpu->kvm)) {
- if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
- r = -EINVAL;
+ if (!irqchip_in_kernel(vcpu->kvm))
+ kvm_set_cr8(vcpu, kvm_run->cr8);
+
+ if (vcpu->arch.pio.cur_count) {
+ vcpu->srcu_idx = kvm_srcu_read_lock(&vcpu->kvm->srcu);
+ r = complete_pio(vcpu);
+ kvm_srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ if (r)
goto out;
- }
}
+ if (vcpu->mmio_needed) {
+ memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
+ vcpu->mmio_read_completed = 1;
+ vcpu->mmio_needed = 0;
- if (vcpu->arch.pio.count || vcpu->mmio_needed) {
- if (vcpu->mmio_needed) {
- memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
- vcpu->mmio_read_completed = 1;
- vcpu->mmio_needed = 0;
- }
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- if (r != EMULATE_DONE) {
+ vcpu->srcu_idx = kvm_srcu_read_lock(&vcpu->kvm->srcu);
+ r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
+ EMULTYPE_NO_DECODE);
+ kvm_srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ if (r == EMULATE_DO_MMIO) {
+ /*
+ * Read-modify-write. Back to userspace.
+ */
r = 0;
goto out;
}
@@ -5470,15 +4539,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
r = __vcpu_run(vcpu);
out:
- post_kvm_run_save(vcpu);
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+ vcpu_put(vcpu);
return r;
}
int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
+ vcpu_load(vcpu);
+
regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -5501,11 +4572,15 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
regs->rip = kvm_rip_read(vcpu);
regs->rflags = kvm_get_rflags(vcpu);
+ vcpu_put(vcpu);
+
return 0;
}
int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
+ vcpu_load(vcpu);
+
kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
@@ -5530,11 +4605,17 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
vcpu->arch.exception.pending = false;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+ vcpu_put(vcpu);
return 0;
}
+void kvm_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ kvm_x86_ops->get_segment(vcpu, var, seg);
+}
+
void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
struct kvm_segment cs;
@@ -5548,7 +4629,9 @@ EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs)
{
- struct kvm_desc_ptr dt;
+ struct descriptor_table dt;
+
+ vcpu_load(vcpu);
kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
@@ -5561,15 +4644,15 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
kvm_x86_ops->get_idt(vcpu, &dt);
- sregs->idt.limit = dt.size;
- sregs->idt.base = dt.address;
+ sregs->idt.limit = dt.limit;
+ sregs->idt.base = dt.base;
kvm_x86_ops->get_gdt(vcpu, &dt);
- sregs->gdt.limit = dt.size;
- sregs->gdt.base = dt.address;
+ sregs->gdt.limit = dt.limit;
+ sregs->gdt.base = dt.base;
sregs->cr0 = kvm_read_cr0(vcpu);
sregs->cr2 = vcpu->arch.cr2;
- sregs->cr3 = kvm_read_cr3(vcpu);
+ sregs->cr3 = vcpu->arch.cr3;
sregs->cr4 = kvm_read_cr4(vcpu);
sregs->cr8 = kvm_get_cr8(vcpu);
sregs->efer = vcpu->arch.efer;
@@ -5581,44 +4664,586 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
set_bit(vcpu->arch.interrupt.nr,
(unsigned long *)sregs->interrupt_bitmap);
+ vcpu_put(vcpu);
+
return 0;
}
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
+ vcpu_load(vcpu);
mp_state->mp_state = vcpu->arch.mp_state;
+ vcpu_put(vcpu);
return 0;
}
int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
+ vcpu_load(vcpu);
vcpu->arch.mp_state = mp_state->mp_state;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+ vcpu_put(vcpu);
return 0;
}
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
- bool has_error_code, u32 error_code)
+static void kvm_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ kvm_x86_ops->set_segment(vcpu, var, seg);
+}
+
+static void seg_desct_to_kvm_desct(struct kvm_desc_struct *seg_desc, u16 selector,
+ struct kvm_segment *kvm_desct)
+{
+ kvm_desct->base = kvm_get_desc_base(seg_desc);
+ kvm_desct->limit = kvm_get_desc_limit(seg_desc);
+ if (seg_desc->g) {
+ kvm_desct->limit <<= 12;
+ kvm_desct->limit |= 0xfff;
+ }
+ kvm_desct->selector = selector;
+ kvm_desct->type = seg_desc->type;
+ kvm_desct->present = seg_desc->p;
+ kvm_desct->dpl = seg_desc->dpl;
+ kvm_desct->db = seg_desc->d;
+ kvm_desct->s = seg_desc->s;
+ kvm_desct->l = seg_desc->l;
+ kvm_desct->g = seg_desc->g;
+ kvm_desct->avl = seg_desc->avl;
+ if (!selector)
+ kvm_desct->unusable = 1;
+ else
+ kvm_desct->unusable = 0;
+ kvm_desct->padding = 0;
+}
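
seg_desct_to_kvm_desct() expands the 20-bit descriptor limit when the granularity bit is set: the limit then counts 4 KiB pages, so it is shifted left by 12 and the low 12 bits are filled with ones. A stand-alone sketch of just that step, with a hypothetical helper name:

#include <stdio.h>

static unsigned int effective_limit(unsigned int raw_limit, int g_bit)
{
	unsigned int limit = raw_limit & 0xfffff;   /* descriptors hold 20 limit bits */

	if (g_bit) {
		limit <<= 12;
		limit |= 0xfff;
	}
	return limit;
}

int main(void)
{
	/* flat 4 GiB segment: raw limit 0xfffff with G=1 gives 0xffffffff */
	printf("%#x\n", effective_limit(0xfffff, 1));
	return 0;
}
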
+
+static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
+ u16 selector,
+ struct descriptor_table *dtable)
+{
+ if (selector & 1 << 2) {
+ struct kvm_segment kvm_seg;
+
+ kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
+
+ if (kvm_seg.unusable)
+ dtable->limit = 0;
+ else
+ dtable->limit = kvm_seg.limit;
+ dtable->base = kvm_seg.base;
+ }
+ else
+ kvm_x86_ops->get_gdt(vcpu, dtable);
+}
+
+/* allowed just for 8-byte segment descriptors */
+static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
+ struct kvm_desc_struct *seg_desc)
{
- struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+ struct descriptor_table dtable;
+ u16 index = selector >> 3;
int ret;
+ u32 err;
+ gva_t addr;
+
+ get_segment_descriptor_dtable(vcpu, selector, &dtable);
+
+ if (dtable.limit < index * 8 + 7) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ addr = dtable.base + index * 8;
+ ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc),
+ vcpu, &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT)
+ kvm_inject_page_fault(vcpu, addr, err);
- init_emulate_ctxt(vcpu);
+ return ret;
+}
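
The limit check in load_guest_segment_descriptor() follows from the descriptor-table layout: the selector's index field is bits 15:3 and each descriptor occupies 8 bytes, so its last byte sits at index*8 + 7 and must not exceed the table limit. A small illustration, assuming nothing beyond that layout:

#include <stdio.h>

static int selector_within_table(unsigned short selector, unsigned int limit)
{
	unsigned int index = selector >> 3;

	return limit >= index * 8 + 7;
}

int main(void)
{
	/* GDT limit 0x17 holds exactly three descriptors (indexes 0..2) */
	printf("%d %d\n",
	       selector_within_table(0x10, 0x17),   /* index 2: ok    -> 1 */
	       selector_within_table(0x18, 0x17));  /* index 3: fault -> 0 */
	return 0;
}
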
- ret = emulator_task_switch(&vcpu->arch.emulate_ctxt,
- tss_selector, reason, has_error_code,
- error_code);
+/* allowed just for 8-byte segment descriptors */
+static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
+ struct kvm_desc_struct *seg_desc)
+{
+ struct descriptor_table dtable;
+ u16 index = selector >> 3;
+
+ get_segment_descriptor_dtable(vcpu, selector, &dtable);
+
+ if (dtable.limit < index * 8 + 7)
+ return 1;
+ return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
+}
+static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu,
+ struct kvm_desc_struct *seg_desc)
+{
+ u32 base_addr = kvm_get_desc_base(seg_desc);
+
+ return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL);
+}
+
+static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu,
+ struct kvm_desc_struct *seg_desc)
+{
+ u32 base_addr = kvm_get_desc_base(seg_desc);
+
+ return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL);
+}
+
+static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_segment kvm_seg;
+
+ kvm_get_segment(vcpu, &kvm_seg, seg);
+ return kvm_seg.selector;
+}
+
+static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
+{
+ struct kvm_segment segvar = {
+ .base = selector << 4,
+ .limit = 0xffff,
+ .selector = selector,
+ .type = 3,
+ .present = 1,
+ .dpl = 3,
+ .db = 0,
+ .s = 1,
+ .l = 0,
+ .g = 0,
+ .avl = 0,
+ .unusable = 0,
+ };
+ kvm_x86_ops->set_segment(vcpu, &segvar, seg);
+ return X86EMUL_CONTINUE;
+}
+
+static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
+{
+ return (seg != VCPU_SREG_LDTR) &&
+ (seg != VCPU_SREG_TR) &&
+ (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
+}
+
+int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg)
+{
+ struct kvm_segment kvm_seg;
+ struct kvm_desc_struct seg_desc;
+ u8 dpl, rpl, cpl;
+ unsigned err_vec = GP_VECTOR;
+ u32 err_code = 0;
+ bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
+ int ret;
+
+ if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu))
+ return kvm_load_realmode_segment(vcpu, selector, seg);
+
+ /* NULL selector is not valid for TR, CS and SS */
+ if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
+ && null_selector)
+ goto exception;
+
+ /* TR should be in GDT only */
+ if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
+ goto exception;
+
+ ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
if (ret)
- return EMULATE_FAIL;
+ return ret;
- memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
- kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
- kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- return EMULATE_DONE;
+ seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
+
+ if (null_selector) { /* for NULL selector skip all following checks */
+ kvm_seg.unusable = 1;
+ goto load;
+ }
+
+ err_code = selector & 0xfffc;
+ err_vec = GP_VECTOR;
+
+ /* can't load system descriptor into segment selector */
+ if (seg <= VCPU_SREG_GS && !kvm_seg.s)
+ goto exception;
+
+ if (!kvm_seg.present) {
+ err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+ goto exception;
+ }
+
+ rpl = selector & 3;
+ dpl = kvm_seg.dpl;
+ cpl = kvm_x86_ops->get_cpl(vcpu);
+
+ switch (seg) {
+ case VCPU_SREG_SS:
+ /*
+ * segment is not a writable data segment, or the segment
+ * selector's RPL != CPL, or the descriptor's DPL != CPL
+ */
+ if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
+ goto exception;
+ break;
+ case VCPU_SREG_CS:
+ if (!(kvm_seg.type & 8))
+ goto exception;
+
+ if (kvm_seg.type & 4) {
+ /* conforming */
+ if (dpl > cpl)
+ goto exception;
+ } else {
+ /* nonconforming */
+ if (rpl > cpl || dpl != cpl)
+ goto exception;
+ }
+ /* CS(RPL) <- CPL */
+ selector = (selector & 0xfffc) | cpl;
+ break;
+ case VCPU_SREG_TR:
+ if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
+ goto exception;
+ break;
+ case VCPU_SREG_LDTR:
+ if (kvm_seg.s || kvm_seg.type != 2)
+ goto exception;
+ break;
+ default: /* DS, ES, FS, or GS */
+ /*
+ * segment is not a data or readable code segment or
+ * ((segment is a data or nonconforming code segment)
+ * and (both RPL and CPL > DPL))
+ */
+ if ((kvm_seg.type & 0xa) == 0x8 ||
+ (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
+ goto exception;
+ break;
+ }
+
+ if (!kvm_seg.unusable && kvm_seg.s) {
+ /* mark segment as accessed */
+ kvm_seg.type |= 1;
+ seg_desc.type |= 1;
+ save_guest_segment_descriptor(vcpu, selector, &seg_desc);
+ }
+load:
+ kvm_set_segment(vcpu, &kvm_seg, seg);
+ return X86EMUL_CONTINUE;
+exception:
+ kvm_queue_exception_e(vcpu, err_vec, err_code);
+ return X86EMUL_PROPAGATE_FAULT;
+}
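
The SS case in kvm_load_segment_descriptor() is the strictest of the checks above: the stack segment must be a writable, non-conforming data segment whose RPL and DPL both equal CPL. A compact sketch of that one check, with hypothetical names and not part of the patch:

#include <stdio.h>

static int ss_load_ok(unsigned type, unsigned rpl, unsigned dpl, unsigned cpl)
{
	if (rpl != cpl || dpl != cpl)
		return 0;
	/* type bit 3 clear (data), bit 1 set (writable): (type & 0xa) == 0x2 */
	return (type & 0xa) == 0x2;
}

int main(void)
{
	printf("%d\n", ss_load_ok(0x3, 3, 3, 3));  /* read/write data, ring 3: 1 */
	printf("%d\n", ss_load_ok(0xb, 3, 3, 3));  /* code segment: 0 */
	return 0;
}
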
+
+static void save_state_to_tss32(struct kvm_vcpu *vcpu,
+ struct tss_segment_32 *tss)
+{
+ tss->cr3 = vcpu->arch.cr3;
+ tss->eip = kvm_rip_read(vcpu);
+ tss->eflags = kvm_get_rflags(vcpu);
+ tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+ tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+ tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+ tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
+ tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
+ tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
+ tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
+ tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
+ tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
+ tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
+ tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
+}
+
+static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg)
+{
+ struct kvm_segment kvm_seg;
+ kvm_get_segment(vcpu, &kvm_seg, seg);
+ kvm_seg.selector = sel;
+ kvm_set_segment(vcpu, &kvm_seg, seg);
+}
+
+static int load_state_from_tss32(struct kvm_vcpu *vcpu,
+ struct tss_segment_32 *tss)
+{
+ kvm_set_cr3(vcpu, tss->cr3);
+
+ kvm_rip_write(vcpu, tss->eip);
+ kvm_set_rflags(vcpu, tss->eflags | 2);
+
+ kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
+ kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
+ kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
+ kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
+ kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
+
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors
+ */
+ kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
+ kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
+ kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
+ kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
+ kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
+ kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
+ kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
+
+ /*
+ * Now load segment descriptors. If a fault happens at this stage
+ * it is handled in the context of the new task
+ */
+ if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
+ return 1;
+ return 0;
+}
+
+static void save_state_to_tss16(struct kvm_vcpu *vcpu,
+ struct tss_segment_16 *tss)
+{
+ tss->ip = kvm_rip_read(vcpu);
+ tss->flag = kvm_get_rflags(vcpu);
+ tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+ tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+ tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+ tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
+
+ tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
+ tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
+ tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
+ tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
+ tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
+}
+
+static int load_state_from_tss16(struct kvm_vcpu *vcpu,
+ struct tss_segment_16 *tss)
+{
+ kvm_rip_write(vcpu, tss->ip);
+ kvm_set_rflags(vcpu, tss->flag | 2);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
+ kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
+ kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
+ kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
+ kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
+
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors
+ */
+ kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
+ kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
+ kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
+ kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
+ kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
+
+ /*
+ * Now load segment descriptors. If a fault happens at this stage
+ * it is handled in the context of the new task
+ */
+ if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
+ return 1;
+ return 0;
+}
+
+static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
+ u16 old_tss_sel, u32 old_tss_base,
+ struct kvm_desc_struct *nseg_desc)
+{
+ struct tss_segment_16 tss_segment_16;
+ int ret = 0;
+
+ if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
+ sizeof tss_segment_16))
+ goto out;
+
+ save_state_to_tss16(vcpu, &tss_segment_16);
+
+ if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
+ sizeof tss_segment_16))
+ goto out;
+
+ if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
+ &tss_segment_16, sizeof tss_segment_16))
+ goto out;
+
+ if (old_tss_sel != 0xffff) {
+ tss_segment_16.prev_task_link = old_tss_sel;
+
+ if (kvm_write_guest(vcpu->kvm,
+ get_tss_base_addr_write(vcpu, nseg_desc),
+ &tss_segment_16.prev_task_link,
+ sizeof tss_segment_16.prev_task_link))
+ goto out;
+ }
+
+ if (load_state_from_tss16(vcpu, &tss_segment_16))
+ goto out;
+
+ ret = 1;
+out:
+ return ret;
+}
+
+static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
+ u16 old_tss_sel, u32 old_tss_base,
+ struct kvm_desc_struct *nseg_desc)
+{
+ struct tss_segment_32 tss_segment_32;
+ int ret = 0;
+
+ if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
+ sizeof tss_segment_32))
+ goto out;
+
+ save_state_to_tss32(vcpu, &tss_segment_32);
+
+ if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
+ sizeof tss_segment_32))
+ goto out;
+
+ if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
+ &tss_segment_32, sizeof tss_segment_32))
+ goto out;
+
+ if (old_tss_sel != 0xffff) {
+ tss_segment_32.prev_task_link = old_tss_sel;
+
+ if (kvm_write_guest(vcpu->kvm,
+ get_tss_base_addr_write(vcpu, nseg_desc),
+ &tss_segment_32.prev_task_link,
+ sizeof tss_segment_32.prev_task_link))
+ goto out;
+ }
+
+ if (load_state_from_tss32(vcpu, &tss_segment_32))
+ goto out;
+
+ ret = 1;
+out:
+ return ret;
+}
+
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
+{
+ struct kvm_segment tr_seg;
+ struct kvm_desc_struct cseg_desc;
+ struct kvm_desc_struct nseg_desc;
+ int ret = 0;
+ u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
+ u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
+ u32 desc_limit;
+
+ old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
+
+ /* FIXME: Handle errors. Failure to read either TSS or its
+ * descriptor should generate a page fault.
+ */
+ if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
+ goto out;
+
+ if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
+ goto out;
+
+ if (reason != TASK_SWITCH_IRET) {
+ int cpl;
+
+ cpl = kvm_x86_ops->get_cpl(vcpu);
+ if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return 1;
+ }
+ }
+
+ desc_limit = kvm_get_desc_limit(&nseg_desc);
+ if (!nseg_desc.p ||
+ ((desc_limit < 0x67 && (nseg_desc.type & 8)) ||
+ desc_limit < 0x2b)) {
+ kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
+ return 1;
+ }
+
+ if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
+ cseg_desc.type &= ~(1 << 1); /* clear the busy (B) flag */
+ save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
+ }
+
+ if (reason == TASK_SWITCH_IRET) {
+ u32 eflags = kvm_get_rflags(vcpu);
+ kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
+ }
+
+ /* set back link to prev task only if NT bit is set in eflags;
+ note that old_tss_sel is not used after this point */
+ if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+ old_tss_sel = 0xffff;
+
+ if (nseg_desc.type & 8)
+ ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
+ old_tss_base, &nseg_desc);
+ else
+ ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
+ old_tss_base, &nseg_desc);
+
+ if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
+ u32 eflags = kvm_get_rflags(vcpu);
+ kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
+ }
+
+ if (reason != TASK_SWITCH_IRET) {
+ nseg_desc.type |= (1 << 1);
+ save_guest_segment_descriptor(vcpu, tss_selector,
+ &nseg_desc);
+ }
+
+ kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
+ seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
+ tr_seg.type = 11;
+ kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
+out:
+ return ret;
}
EXPORT_SYMBOL_GPL(kvm_task_switch);
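
kvm_task_switch() rejects a target TSS whose descriptor limit is too small: a 32-bit TSS is 104 bytes (minimum limit 0x67) and a 16-bit TSS is 44 bytes (minimum limit 0x2b). A stand-alone sketch of that check, assuming only those sizes and not part of the patch:

#include <stdio.h>

static int tss_limit_ok(unsigned int desc_limit, int is_32bit, int present)
{
	if (!present)
		return 0;
	if (is_32bit && desc_limit < 0x67)
		return 0;
	if (desc_limit < 0x2b)
		return 0;
	return 1;
}

int main(void)
{
	printf("%d %d\n",
	       tss_limit_ok(0x67, 1, 1),   /* minimal 32-bit TSS: 1 */
	       tss_limit_ok(0x2a, 0, 1));  /* too small for a 16-bit TSS: 0 */
	return 0;
}
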
@@ -5627,19 +5252,20 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
{
int mmu_reset_needed = 0;
int pending_vec, max_bits;
- struct kvm_desc_ptr dt;
+ struct descriptor_table dt;
- dt.size = sregs->idt.limit;
- dt.address = sregs->idt.base;
+ vcpu_load(vcpu);
+
+ dt.limit = sregs->idt.limit;
+ dt.base = sregs->idt.base;
kvm_x86_ops->set_idt(vcpu, &dt);
- dt.size = sregs->gdt.limit;
- dt.address = sregs->gdt.base;
+ dt.limit = sregs->gdt.limit;
+ dt.base = sregs->gdt.base;
kvm_x86_ops->set_gdt(vcpu, &dt);
vcpu->arch.cr2 = sregs->cr2;
- mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
+ mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
vcpu->arch.cr3 = sregs->cr3;
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
kvm_set_cr8(vcpu, sregs->cr8);
@@ -5653,10 +5279,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
- if (sregs->cr4 & X86_CR4_OSXSAVE)
- update_cpuid(vcpu);
if (!is_long_mode(vcpu) && is_pae(vcpu)) {
- load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
+ load_pdptrs(vcpu, vcpu->arch.cr3);
mmu_reset_needed = 1;
}
@@ -5691,7 +5315,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
!is_protmode(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+ vcpu_put(vcpu);
return 0;
}
@@ -5702,10 +5326,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
unsigned long rflags;
int i, r;
+ vcpu_load(vcpu);
+
if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
r = -EBUSY;
if (vcpu->arch.exception.pending)
- goto out;
+ goto unlock_out;
if (dbg->control & KVM_GUESTDBG_INJECT_DB)
kvm_queue_exception(vcpu, DB_VECTOR);
else
@@ -5733,9 +5359,11 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
}
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
- get_segment_base(vcpu, VCPU_SREG_CS);
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+ vcpu->arch.singlestep_cs =
+ get_segment_selector(vcpu, VCPU_SREG_CS);
+ vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
+ }
/*
* Trigger an rflags update that will inject or remove the trace
@@ -5747,12 +5375,34 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
r = 0;
-out:
+unlock_out:
+ vcpu_put(vcpu);
return r;
}
/*
+ * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
+ * we have asm/x86/processor.h
+ */
+struct fxsave {
+ u16 cwd;
+ u16 swd;
+ u16 twd;
+ u16 fop;
+ u64 rip;
+ u64 rdp;
+ u32 mxcsr;
+ u32 mxcsr_mask;
+ u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
+#ifdef CONFIG_X86_64
+ u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
+#else
+ u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
+#endif
+};
+
+/*
* Translate a guest virtual address to a guest physical address.
*/
int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
@@ -5762,21 +5412,24 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
gpa_t gpa;
int idx;
- idx = srcu_read_lock(&vcpu->kvm->srcu);
+ vcpu_load(vcpu);
+ idx = kvm_srcu_read_lock(&vcpu->kvm->srcu);
gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ kvm_srcu_read_unlock(&vcpu->kvm->srcu, idx);
tr->physical_address = gpa;
tr->valid = gpa != UNMAPPED_GVA;
tr->writeable = 1;
tr->usermode = 0;
+ vcpu_put(vcpu);
return 0;
}
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- struct kvm_i387_fxsave_struct *fxsave =
- &vcpu->arch.guest_fpu.state->fxsave;
+ struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+
+ vcpu_load(vcpu);
memcpy(fpu->fpr, fxsave->st_space, 128);
fpu->fcw = fxsave->cwd;
@@ -5787,13 +5440,16 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
fpu->last_dp = fxsave->rdp;
memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
+ vcpu_put(vcpu);
+
return 0;
}
int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- struct kvm_i387_fxsave_struct *fxsave =
- &vcpu->arch.guest_fpu.state->fxsave;
+ struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+
+ vcpu_load(vcpu);
memcpy(fxsave->st_space, fpu->fpr, 128);
fxsave->cwd = fpu->fcw;
@@ -5804,63 +5460,61 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
fxsave->rdp = fpu->last_dp;
memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
+ vcpu_put(vcpu);
+
return 0;
}
-int fx_init(struct kvm_vcpu *vcpu)
+void fx_init(struct kvm_vcpu *vcpu)
{
- int err;
-
- err = kvm_fpu_alloc(&vcpu->arch.guest_fpu);
- if (err)
- return err;
-
- kvm_fpu_finit(&vcpu->arch.guest_fpu);
+ unsigned after_mxcsr_mask;
/*
- * Ensure guest xcr0 is valid for loading
+ * Touch the fpu the first time in a non-atomic context: if
+ * this is the first fpu instruction, the exception handler
+ * will fire before the instruction returns and will have to
+ * allocate ram with GFP_KERNEL.
*/
- vcpu->arch.xcr0 = XSTATE_FP;
+ if (!used_math())
+ kvm_fx_save(&vcpu->arch.host_fx_image);
- vcpu->arch.cr0 |= X86_CR0_ET;
+ /* Initialize guest FPU by resetting ours and saving into guest's */
+ preempt_disable();
+ kvm_fx_save(&vcpu->arch.host_fx_image);
+ kvm_fx_finit();
+ kvm_fx_save(&vcpu->arch.guest_fx_image);
+ kvm_fx_restore(&vcpu->arch.host_fx_image);
+ preempt_enable();
- return 0;
+ vcpu->arch.cr0 |= X86_CR0_ET;
+ after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
+ vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
+ memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
+ 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
}
EXPORT_SYMBOL_GPL(fx_init);
-static void fx_free(struct kvm_vcpu *vcpu)
-{
- kvm_fpu_free(&vcpu->arch.guest_fpu);
-}
-
void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
if (vcpu->guest_fpu_loaded)
return;
- /*
- * Restore all possible states in the guest,
- * and assume host would use all available bits.
- * Guest xcr0 would be loaded later.
- */
- kvm_put_guest_xcr0(vcpu);
vcpu->guest_fpu_loaded = 1;
- unlazy_fpu(current);
- kvm_fpu_restore_checking(&vcpu->arch.guest_fpu);
+ kvm_fx_save(&vcpu->arch.host_fx_image);
+ kvm_fx_restore(&vcpu->arch.guest_fx_image);
trace_kvm_fpu(1);
}
void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
- kvm_put_guest_xcr0(vcpu);
-
if (!vcpu->guest_fpu_loaded)
return;
vcpu->guest_fpu_loaded = 0;
- kvm_fpu_save_init(&vcpu->arch.guest_fpu);
+ kvm_fx_save(&vcpu->arch.guest_fx_image);
+ kvm_fx_restore(&vcpu->arch.host_fx_image);
++vcpu->stat.fpu_reload;
- kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+ set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
trace_kvm_fpu(0);
}
@@ -5871,18 +5525,12 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
vcpu->arch.time_page = NULL;
}
- free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
- fx_free(vcpu);
kvm_x86_ops->vcpu_free(vcpu);
}
struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
unsigned int id)
{
- if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
- printk_once(KERN_WARNING
- "kvm: SMP vm created on host with unstable TSC; "
- "guest TSC will not be reliable\n");
return kvm_x86_ops->vcpu_create(kvm, id);
}
@@ -5890,6 +5538,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
int r;
+ /* We do fxsave: this must be aligned. */
+ BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
+
vcpu->arch.mtrr_state.have_fixed = 1;
vcpu_load(vcpu);
r = kvm_arch_vcpu_reset(vcpu);
@@ -5907,13 +5558,10 @@ free_vcpu:
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
- vcpu->arch.apf.msr_val = 0;
-
vcpu_load(vcpu);
kvm_mmu_unload(vcpu);
vcpu_put(vcpu);
- fx_free(vcpu);
kvm_x86_ops->vcpu_free(vcpu);
}
@@ -5927,27 +5575,22 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
vcpu->arch.dr6 = DR6_FIXED_1;
vcpu->arch.dr7 = DR7_FIXED_1;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- vcpu->arch.apf.msr_val = 0;
-
- kvm_clear_async_pf_completion_queue(vcpu);
- kvm_async_pf_hash_reset(vcpu);
- vcpu->arch.apf.halted = false;
-
return kvm_x86_ops->vcpu_reset(vcpu);
}
int kvm_arch_hardware_enable(void *garbage)
{
- struct kvm *kvm;
- struct kvm_vcpu *vcpu;
- int i;
+ /*
+	 * Since this may be called from a hotplug notification,
+ * we can't get the CPU frequency directly.
+ */
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ int cpu = raw_smp_processor_id();
+ per_cpu(cpu_tsc_khz, cpu) = 0;
+ }
kvm_shared_msr_cpu_online();
- list_for_each_entry(kvm, &vm_list, vm_list)
- kvm_for_each_vcpu(i, vcpu, kvm)
- if (vcpu->cpu == smp_processor_id())
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+
return kvm_x86_ops->hardware_enable(garbage);
}
@@ -5981,11 +5624,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
BUG_ON(vcpu->kvm == NULL);
kvm = vcpu->kvm;
- vcpu->arch.emulate_ctxt.ops = &emulate_ops;
- vcpu->arch.walk_mmu = &vcpu->arch.mmu;
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- vcpu->arch.mmu.translate_gpa = translate_gpa;
- vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
@@ -5998,9 +5637,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
}
vcpu->arch.pio_data = page_address(page);
- if (!kvm->arch.virtual_tsc_khz)
- kvm_arch_set_tsc_khz(kvm, max_tsc_khz);
-
r = kvm_mmu_create(vcpu);
if (r < 0)
goto fail_free_pio_data;
@@ -6019,14 +5655,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
}
vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
- if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
- goto fail_free_mce_banks;
-
- kvm_async_pf_hash_reset(vcpu);
-
return 0;
-fail_free_mce_banks:
- kfree(vcpu->arch.mce_banks);
fail_free_lapic:
kvm_free_lapic(vcpu);
fail_mmu_destroy:
@@ -6043,23 +5672,34 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
kfree(vcpu->arch.mce_banks);
kvm_free_lapic(vcpu);
- idx = srcu_read_lock(&vcpu->kvm->srcu);
+ idx = kvm_srcu_read_lock(&vcpu->kvm->srcu);
kvm_mmu_destroy(vcpu);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ kvm_srcu_read_unlock(&vcpu->kvm->srcu, idx);
free_page((unsigned long)vcpu->arch.pio_data);
}
-int kvm_arch_init_vm(struct kvm *kvm)
+struct kvm *kvm_arch_create_vm(void)
{
+ struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+
+ if (!kvm)
+ return ERR_PTR(-ENOMEM);
+
+ kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
+ if (!kvm->arch.aliases) {
+ kfree(kvm);
+ return ERR_PTR(-ENOMEM);
+ }
+
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
- spin_lock_init(&kvm->arch.tsc_write_lock);
+ rdtscll(kvm->arch.vm_init_tsc);
- return 0;
+ return kvm;
}
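
kvm_arch_create_vm() reintroduced above follows the usual allocate-then-unwind shape: allocate the vm, allocate the alias table, free the vm again if the second allocation fails, and report failure through an error-encoded pointer. A userland sketch of that shape, with calloc in place of kzalloc and a toy ERR_PTR/IS_ERR encoding that only mirrors the kernel's convention:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy error-pointer encoding; not the kernel's real macros. */
#define ERR_PTR(err)	((void *)(intptr_t)(err))
#define IS_ERR(p)	((uintptr_t)(p) >= (uintptr_t)-4095)
#define PTR_ERR(p)	((long)(intptr_t)(p))

struct mem_aliases { int naliases; };
struct vm {
	struct mem_aliases *aliases;
	unsigned long init_tsc;
};

static struct vm *create_vm(void)
{
	struct vm *vm = calloc(1, sizeof(*vm));

	if (!vm)
		return ERR_PTR(-ENOMEM);

	vm->aliases = calloc(1, sizeof(*vm->aliases));
	if (!vm->aliases) {
		free(vm);		/* unwind the first allocation */
		return ERR_PTR(-ENOMEM);
	}
	return vm;
}

int main(void)
{
	struct vm *vm = create_vm();

	if (IS_ERR(vm)) {
		fprintf(stderr, "create_vm failed: %ld\n", PTR_ERR(vm));
		return 1;
	}
	free(vm->aliases);
	free(vm);
	return 0;
}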
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
@@ -6077,10 +5717,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
/*
* Unpin any mmu pages first.
*/
- kvm_for_each_vcpu(i, vcpu, kvm) {
- kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_for_each_vcpu(i, vcpu, kvm)
kvm_unload_vcpu_mmu(vcpu);
- }
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_arch_vcpu_free(vcpu);
@@ -6095,19 +5733,23 @@ static void kvm_free_vcpus(struct kvm *kvm)
void kvm_arch_sync_events(struct kvm *kvm)
{
kvm_free_all_assigned_devices(kvm);
- kvm_free_pit(kvm);
}
void kvm_arch_destroy_vm(struct kvm *kvm)
{
kvm_iommu_unmap_guest(kvm);
+ kvm_free_pit(kvm);
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
kvm_free_vcpus(kvm);
+ kvm_free_physmem(kvm);
if (kvm->arch.apic_access_page)
put_page(kvm->arch.apic_access_page);
if (kvm->arch.ept_identity_pagetable)
put_page(kvm->arch.ept_identity_pagetable);
+ kvm_cleanup_srcu_struct(&kvm->srcu);
+ kfree(kvm->arch.aliases);
+ kfree(kvm);
}
int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -6117,11 +5759,6 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
int user_alloc)
{
int npages = memslot->npages;
- int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
-
- /* Prevent internal slot pages from being moved by fork()/COW. */
- if (memslot->id >= KVM_MEMORY_SLOTS)
- map_flags = MAP_SHARED | MAP_ANONYMOUS;
	/* To keep backward compatibility with older userspace,
	 * x86 needs to handle the !user_alloc case.
@@ -6134,7 +5771,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
userspace_addr = do_mmap(NULL, 0,
npages * PAGE_SIZE,
PROT_READ | PROT_WRITE,
- map_flags,
+ MAP_PRIVATE | MAP_ANONYMOUS,
0);
up_write(&current->mm->mmap_sem);
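
The legacy !user_alloc path above hands the guest an anonymous private mapping created on its behalf (do_mmap with MAP_PRIVATE | MAP_ANONYMOUS, now hard-coded again). A userland sketch of the equivalent mmap(2) call, with an arbitrary page count:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t npages = 4;
	size_t len = npages * 4096;
	void *addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (addr == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(addr, 0, len);	/* anonymous pages start out zero-filled anyway */
	munmap(addr, len);
	return 0;
}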
@@ -6188,9 +5825,7 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
- return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
- !vcpu->arch.apf.halted)
- || !list_empty_careful(&vcpu->async_pf.done)
+ return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
|| vcpu->arch.nmi_pending ||
(kvm_arch_interrupt_allowed(vcpu) &&
@@ -6209,7 +5844,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
me = get_cpu();
if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
- if (atomic_xchg(&vcpu->guest_mode, 0))
+ if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
kvm_smp_send_reschedule(cpu);
put_cpu();
}
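
kvm_vcpu_kick() only sends the reschedule IPI when it is the caller that actually set KVM_REQ_KICK; a vcpu that has already been kicked is left alone. A sketch of that test-and-set idiom using C11 atomics, with a hypothetical send_ipi() standing in for kvm_smp_send_reschedule():

#include <stdatomic.h>
#include <stdio.h>

#define REQ_KICK_BIT 0

static atomic_ulong requests;

static void send_ipi(int cpu)
{
	printf("reschedule IPI -> cpu %d\n", cpu);
}

static void vcpu_kick(int cpu)
{
	unsigned long mask = 1UL << REQ_KICK_BIT;
	unsigned long old = atomic_fetch_or(&requests, mask);

	/* Only the caller that actually flipped the bit sends the IPI;
	 * later callers see it already set and stay quiet. */
	if (!(old & mask))
		send_ipi(cpu);
}

int main(void)
{
	vcpu_kick(3);	/* sends the IPI */
	vcpu_kick(3);	/* bit already set: no second IPI */
	return 0;
}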
@@ -6219,22 +5854,13 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
return kvm_x86_ops->interrupt_allowed(vcpu);
}
-bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
-{
- unsigned long current_rip = kvm_rip_read(vcpu) +
- get_segment_base(vcpu, VCPU_SREG_CS);
-
- return current_rip == linear_rip;
-}
-EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
-
unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
{
unsigned long rflags;
rflags = kvm_x86_ops->get_rflags(vcpu);
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- rflags &= ~X86_EFLAGS_TF;
+ rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
return rflags;
}
EXPORT_SYMBOL_GPL(kvm_get_rflags);
@@ -6242,154 +5868,14 @@ EXPORT_SYMBOL_GPL(kvm_get_rflags);
void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
- kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
- rflags |= X86_EFLAGS_TF;
+ vcpu->arch.singlestep_cs ==
+ get_segment_selector(vcpu, VCPU_SREG_CS) &&
+ vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
+ rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
kvm_x86_ops->set_rflags(vcpu, rflags);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_rflags);
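
The pair above hides the debugger's single-step bits from the guest: set_rflags() ORs TF (and, on this code path, RF) into the value that reaches the hardware, and get_rflags() masks them back out before the guest can observe them. A simplified sketch of that masking (the CS/RIP check guarding the real set path is dropped here for brevity):

#include <stdio.h>

#define X86_EFLAGS_TF (1UL << 8)	/* trap flag: #DB after each instruction */
#define X86_EFLAGS_RF (1UL << 16)	/* resume flag: suppress instruction breakpoints */

struct vcpu {
	unsigned long hw_rflags;	/* what the CPU actually runs with */
	int singlestep;
};

static unsigned long get_rflags(const struct vcpu *v)
{
	unsigned long rflags = v->hw_rflags;

	if (v->singlestep)	/* hide the debugger's bits from the guest */
		rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
	return rflags;
}

static void set_rflags(struct vcpu *v, unsigned long rflags)
{
	if (v->singlestep)	/* keep trapping even if the guest clears TF */
		rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
	v->hw_rflags = rflags;
}

int main(void)
{
	struct vcpu v = { .hw_rflags = 0x2, .singlestep = 1 };

	set_rflags(&v, 0x2);
	printf("hardware sees %#lx, guest sees %#lx\n",
	       v.hw_rflags, get_rflags(&v));
	return 0;
}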
-void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
-{
- int r;
-
- if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
- is_error_page(work->page))
- return;
-
- r = kvm_mmu_reload(vcpu);
- if (unlikely(r))
- return;
-
- if (!vcpu->arch.mmu.direct_map &&
- work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
- return;
-
- vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
-}
-
-static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
-{
- return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
-}
-
-static inline u32 kvm_async_pf_next_probe(u32 key)
-{
- return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
-}
-
-static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- u32 key = kvm_async_pf_hash_fn(gfn);
-
- while (vcpu->arch.apf.gfns[key] != ~0)
- key = kvm_async_pf_next_probe(key);
-
- vcpu->arch.apf.gfns[key] = gfn;
-}
-
-static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- int i;
- u32 key = kvm_async_pf_hash_fn(gfn);
-
- for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
- (vcpu->arch.apf.gfns[key] != gfn &&
- vcpu->arch.apf.gfns[key] != ~0); i++)
- key = kvm_async_pf_next_probe(key);
-
- return key;
-}
-
-bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
-}
-
-static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- u32 i, j, k;
-
- i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
- while (true) {
- vcpu->arch.apf.gfns[i] = ~0;
- do {
- j = kvm_async_pf_next_probe(j);
- if (vcpu->arch.apf.gfns[j] == ~0)
- return;
- k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
- /*
- * k lies cyclically in ]i,j]
- * | i.k.j |
- * |....j i.k.| or |.k..j i...|
- */
- } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
- vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
- i = j;
- }
-}
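
The removed async-PF gfn table above is an open-addressed hash: linear probing on insert and lookup, and deletion by backward-shifting displaced entries, which is what the "]i,j]" cyclic-interval comment encodes. A self-contained sketch of the same scheme with a toy hash and an 8-slot table (the kernel uses hash_32() over ASYNC_PF_PER_VCPU slots):

#include <stdint.h>
#include <stdio.h>

#define TABLE_SIZE 8U		/* power of two, so masking works as next_probe */
#define EMPTY      UINT64_MAX

static uint64_t slots[TABLE_SIZE];

static uint32_t hash_gfn(uint64_t gfn)
{
	return (uint32_t)(gfn * 2654435761u) & (TABLE_SIZE - 1);	/* toy hash */
}

static uint32_t next_probe(uint32_t key)
{
	return (key + 1) & (TABLE_SIZE - 1);
}

static void add_gfn(uint64_t gfn)
{
	uint32_t key = hash_gfn(gfn);

	while (slots[key] != EMPTY)	/* linear probe to the next free slot */
		key = next_probe(key);
	slots[key] = gfn;
}

static uint32_t gfn_slot(uint64_t gfn)
{
	uint32_t key = hash_gfn(gfn);
	uint32_t i;

	for (i = 0; i < TABLE_SIZE && slots[key] != gfn && slots[key] != EMPTY; i++)
		key = next_probe(key);
	return key;
}

static void del_gfn(uint64_t gfn)
{
	uint32_t i, j, k;

	i = j = gfn_slot(gfn);
	for (;;) {
		slots[i] = EMPTY;
		do {
			j = next_probe(j);
			if (slots[j] == EMPTY)
				return;
			k = hash_gfn(slots[j]);
			/* keep scanning while k lies cyclically in ]i, j] */
		} while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
		slots[i] = slots[j];	/* shift the displaced entry back */
		i = j;
	}
}

int main(void)
{
	uint32_t i;

	for (i = 0; i < TABLE_SIZE; i++)
		slots[i] = EMPTY;

	add_gfn(1);
	add_gfn(9);	/* collides with 1 under the toy hash */
	add_gfn(17);	/* collides again */
	del_gfn(1);
	printf("9 found in slot %u, 17 found in slot %u\n",
	       gfn_slot(9), gfn_slot(17));
	return 0;
}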
-
-static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
-{
-
- return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
- sizeof(val));
-}
-
-void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
- struct kvm_async_pf *work)
-{
- struct x86_exception fault;
-
- trace_kvm_async_pf_not_present(work->arch.token, work->gva);
- kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
-
- if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
- (vcpu->arch.apf.send_user_only &&
- kvm_x86_ops->get_cpl(vcpu) == 0))
- kvm_make_request(KVM_REQ_APF_HALT, vcpu);
- else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
- fault.vector = PF_VECTOR;
- fault.error_code_valid = true;
- fault.error_code = 0;
- fault.nested_page_fault = false;
- fault.address = work->arch.token;
- kvm_inject_page_fault(vcpu, &fault);
- }
-}
-
-void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
- struct kvm_async_pf *work)
-{
- struct x86_exception fault;
-
- trace_kvm_async_pf_ready(work->arch.token, work->gva);
- if (is_error_page(work->page))
- work->arch.token = ~0; /* broadcast wakeup */
- else
- kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
-
- if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
- !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
- fault.vector = PF_VECTOR;
- fault.error_code_valid = true;
- fault.error_code = 0;
- fault.nested_page_fault = false;
- fault.address = work->arch.token;
- kvm_inject_page_fault(vcpu, &fault);
- }
- vcpu->arch.apf.halted = false;
-}
-
-bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
-{
- if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
- return true;
- else
- return !kvm_event_needs_reinjection(vcpu) &&
- kvm_x86_ops->interrupt_allowed(vcpu);
-}
-
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
@@ -6401,4 +5887,3 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
diff --git a/linux/x86/x86.h b/linux/x86/x86.h
index c600da8..2d10163 100644
--- a/linux/x86/x86.h
+++ b/linux/x86/x86.h
@@ -50,11 +50,6 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu)
#endif
}
-static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
-}
-
static inline int is_pae(struct kvm_vcpu *vcpu)
{
return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
@@ -70,15 +65,4 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
}
-static inline u32 bit(int bitno)
-{
- return 1 << (bitno & 31);
-}
-
-void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
-void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
-int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq);
-
-void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
-
#endif