author		max <max@maxpad.(none)>	2010-12-03 19:24:27 +0100
committer	max <max@maxpad.(none)>	2010-12-03 19:24:27 +0100
commit		51f789529b937da1567a085e27be4fd296faed00 (patch)
tree		f522ee59e4df38fe6241da791e1b0271af9c8fcc
parent		aaf4078a2967dbd67bf0efad9c3f4b81ab35e665 (diff)
download	illumos-kvm-51f789529b937da1567a085e27be4fd296faed00.tar.gz
Adding several new files, not all are needed...
-rw-r--r--	coalesced_mmio.h	  51
-rw-r--r--	emulate.c		2845
-rw-r--r--	hyperv.h		 184
-rw-r--r--	iodev.h			  69
-rw-r--r--	irq.h			 103
-rw-r--r--	kvm_emulate.h		 198
-rw-r--r--	kvm_para.h		  40
-rw-r--r--	kvm_types.h		  67
-rw-r--r--	kvm_x86host.h		 964
-rw-r--r--	segment.h		 217
-rw-r--r--	tss.h			  59
11 files changed, 4797 insertions(+), 0 deletions(-)
diff --git a/coalesced_mmio.h b/coalesced_mmio.h
new file mode 100644
index 0000000..30f878e
--- /dev/null
+++ b/coalesced_mmio.h
@@ -0,0 +1,51 @@
+#ifndef __KVM_COALESCED_MMIO_H__
+#define __KVM_COALESCED_MMIO_H__
+
+/*
+ * KVM coalesced MMIO
+ *
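+ * Writes to registered MMIO zones are buffered and delivered to
+ * userspace in batches, rather than triggering one exit per write.
+ *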
+ * Copyright (c) 2008 Bull S.A.S.
+ *
+ * Author: Laurent Vivier <Laurent.Vivier@bull.net>
+ *
+ */
+
+struct kvm_coalesced_mmio_zone {
+ uint64_t addr;
+ uint32_t size;
+ uint32_t pad;
+};
+
+#ifdef CONFIG_KVM_MMIO
+
+#define KVM_COALESCED_MMIO_ZONE_MAX 100
+
+/* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */
+
+
+#ifdef _KERNEL
+
+struct kvm_coalesced_mmio_dev {
+ struct kvm_io_device dev;
+ struct kvm *kvm;
+ kmutex_t lock;
+ int nb_zones;
+ struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX];
+};
+
+int kvm_coalesced_mmio_init(struct kvm *kvm);
+void kvm_coalesced_mmio_free(struct kvm *kvm);
+int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
+ struct kvm_coalesced_mmio_zone *zone);
+int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
+ struct kvm_coalesced_mmio_zone *zone);
+#endif /*_KERNEL*/
+
+#else
+
+static inline int kvm_coalesced_mmio_init(struct kvm *kvm) { return 0; }
+static inline void kvm_coalesced_mmio_free(struct kvm *kvm) { }
+
+#endif
+
+#endif
diff --git a/emulate.c b/emulate.c
new file mode 100644
index 0000000..df6a70a
--- /dev/null
+++ b/emulate.c
@@ -0,0 +1,2845 @@
+/******************************************************************************
+ * emulate.c
+ *
+ * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
+ *
+ * Copyright (c) 2005 Keir Fraser
+ *
+ * Linux coding style, mod r/m decoder, segment base fixes, real-mode
+ * privileged instructions:
+ *
+ * Copyright (C) 2006 Qumranet
+ *
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/buf.h>
+#include <sys/modctl.h>
+#include <sys/open.h>
+#include <sys/kmem.h>
+#include <sys/poll.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/atomic.h>
+#include <sys/spl.h>
+#include <sys/cpuvar.h>
+
+#ifndef _KERNEL
+#include <stdio.h>
+#include <stdint.h>
+#include <public/xen.h>
+#define DPRINTF(_f, _a ...) printf(_f , ## _a)
+#else
+#include "kvm_host.h"
+#include "kvm_x86host.h"
+#define DPRINTF(x...) do {} while (0)
+#endif
+
+#include "msr-index.h"
+#include "msr.h"
+#include "processor-flags.h"
+#include "iodev.h"
+#include "kvm.h"
+
+/* Indirect stringification. Doing two levels allows the parameter to be a
+ * macro itself. For example, compile with -DFOO=bar, __stringify(FOO)
+ * converts to "bar".
+ */
+
+#define __stringify_1(x...) #x
+#define __stringify(x...) __stringify_1(x)
+
+/*
+ * Opcode effective-address decode tables.
+ * Note that we only emulate instructions that have at least one memory
+ * operand (excluding implicit stack references). We assume that stack
+ * references and instruction fetches will never occur in special memory
+ * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
+ * not be handled.
+ */
+
+/* Operand sizes: 8-bit operands or specified/overridden size. */
+#define ByteOp (1<<0) /* 8-bit operands. */
+/* Destination operand type. */
+#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
+#define DstReg (2<<1) /* Register operand. */
+#define DstMem (3<<1) /* Memory operand. */
+#define DstAcc (4<<1) /* Destination Accumulator */
+#define DstMask (7<<1)
+/* Source operand type. */
+#define SrcNone (0<<4) /* No source operand. */
+#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */
+#define SrcReg (1<<4) /* Register operand. */
+#define SrcMem (2<<4) /* Memory operand. */
+#define SrcMem16 (3<<4) /* Memory operand (16-bit). */
+#define SrcMem32 (4<<4) /* Memory operand (32-bit). */
+#define SrcImm (5<<4) /* Immediate operand. */
+#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
+#define SrcOne (7<<4) /* Implied '1' */
+#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
+#define SrcImmU (9<<4) /* Immediate operand, unsigned */
+#define SrcMask (0xf<<4)
+/* Generic ModRM decode. */
+#define ModRM (1<<8)
+/* Destination is only written; never read. */
+#define Mov (1<<9)
+#define BitOp (1<<10)
+#define MemAbs (1<<11) /* Memory operand is absolute displacement */
+#define String (1<<12) /* String instruction (rep capable) */
+#define Stack (1<<13) /* Stack instruction (push/pop) */
+#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
+#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
+#define GroupMask 0xff /* Group number stored in bits 0:7 */
+/* Misc flags */
+#define Lock (1<<26) /* lock prefix is allowed for the instruction */
+#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
+#define No64 (1<<28)
+/* Source 2 operand type */
+#define Src2None (0<<29)
+#define Src2CL (1<<29)
+#define Src2ImmByte (2<<29)
+#define Src2One (3<<29)
+#define Src2Imm16 (4<<29)
+#define Src2Mask (7<<29)
+
+enum {
+ Group1_80, Group1_81, Group1_82, Group1_83,
+ Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
+ Group8, Group9,
+};
+
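+/*
+ * Example: the entry for opcode 0x88 (mov r/m8, r8) below reads
+ * ByteOp | DstMem | SrcReg | ModRM | Mov: an 8-bit instruction with a
+ * ModRM byte, whose memory (or register) destination is only written,
+ * and whose source is the ModRM 'reg' field.
+ */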
+static uint32_t opcode_table[256] = {
+ /* 0x00 - 0x07 */
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
+ ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
+ /* 0x08 - 0x0F */
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
+ ImplicitOps | Stack | No64, 0,
+ /* 0x10 - 0x17 */
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
+ ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
+ /* 0x18 - 0x1F */
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
+ ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
+ /* 0x20 - 0x27 */
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
+ /* 0x28 - 0x2F */
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ 0, 0, 0, 0,
+ /* 0x30 - 0x37 */
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ 0, 0, 0, 0,
+ /* 0x38 - 0x3F */
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
+ 0, 0,
+ /* 0x40 - 0x47 */
+ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
+ /* 0x48 - 0x4F */
+ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
+ /* 0x50 - 0x57 */
+ SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
+ SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
+ /* 0x58 - 0x5F */
+ DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
+ DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
+ /* 0x60 - 0x67 */
+ ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
+ 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
+ 0, 0, 0, 0,
+ /* 0x68 - 0x6F */
+ SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
+ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
+ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
+ /* 0x70 - 0x77 */
+ SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
+ SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
+ /* 0x78 - 0x7F */
+ SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
+ SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
+ /* 0x80 - 0x87 */
+ Group | Group1_80, Group | Group1_81,
+ Group | Group1_82, Group | Group1_83,
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
+ /* 0x88 - 0x8F */
+ ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
+ ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstMem | SrcReg | ModRM | Mov, ModRM | DstReg,
+ DstReg | SrcMem | ModRM | Mov, Group | Group1A,
+ /* 0x90 - 0x97 */
+ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
+ /* 0x98 - 0x9F */
+ 0, 0, SrcImm | Src2Imm16 | No64, 0,
+ ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
+ /* 0xA0 - 0xA7 */
+ ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
+ ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
+ ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
+ ByteOp | ImplicitOps | String, ImplicitOps | String,
+ /* 0xA8 - 0xAF */
+ 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
+ ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
+ ByteOp | ImplicitOps | String, ImplicitOps | String,
+ /* 0xB0 - 0xB7 */
+ ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+ ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+ ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+ ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+ /* 0xB8 - 0xBF */
+ DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+ DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+ DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+ DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+ /* 0xC0 - 0xC7 */
+ ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
+ 0, ImplicitOps | Stack, 0, 0,
+ ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
+ /* 0xC8 - 0xCF */
+ 0, 0, 0, ImplicitOps | Stack,
+ ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps,
+ /* 0xD0 - 0xD7 */
+ ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
+ ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
+ 0, 0, 0, 0,
+ /* 0xD8 - 0xDF */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xE0 - 0xE7 */
+ 0, 0, 0, 0,
+ ByteOp | SrcImmUByte, SrcImmUByte,
+ ByteOp | SrcImmUByte, SrcImmUByte,
+ /* 0xE8 - 0xEF */
+ SrcImm | Stack, SrcImm | ImplicitOps,
+ SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
+ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+ /* 0xF0 - 0xF7 */
+ 0, 0, 0, 0,
+ ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
+ /* 0xF8 - 0xFF */
+ ImplicitOps, 0, ImplicitOps, ImplicitOps,
+ ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
+};
+
+static uint32_t twobyte_table[256] = {
+ /* 0x00 - 0x0F */
+ 0, Group | GroupDual | Group7, 0, 0,
+ 0, ImplicitOps, ImplicitOps | Priv, 0,
+ ImplicitOps | Priv, ImplicitOps | Priv, 0, 0,
+ 0, ImplicitOps | ModRM, 0, 0,
+ /* 0x10 - 0x1F */
+ 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x20 - 0x2F */
+ ModRM | ImplicitOps | Priv, ModRM | Priv,
+ ModRM | ImplicitOps | Priv, ModRM | Priv,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x30 - 0x3F */
+ ImplicitOps | Priv, 0, ImplicitOps | Priv, 0,
+ ImplicitOps, ImplicitOps | Priv, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x40 - 0x47 */
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ /* 0x48 - 0x4F */
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ /* 0x50 - 0x5F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x60 - 0x6F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x70 - 0x7F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x80 - 0x8F */
+ SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
+ SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
+ /* 0x90 - 0x9F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xA0 - 0xA7 */
+ ImplicitOps | Stack, ImplicitOps | Stack,
+ 0, DstMem | SrcReg | ModRM | BitOp,
+ DstMem | SrcReg | Src2ImmByte | ModRM,
+ DstMem | SrcReg | Src2CL | ModRM, 0, 0,
+ /* 0xA8 - 0xAF */
+ ImplicitOps | Stack, ImplicitOps | Stack,
+ 0, DstMem | SrcReg | ModRM | BitOp | Lock,
+ DstMem | SrcReg | Src2ImmByte | ModRM,
+ DstMem | SrcReg | Src2CL | ModRM,
+ ModRM, 0,
+ /* 0xB0 - 0xB7 */
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
+ 0, DstMem | SrcReg | ModRM | BitOp | Lock,
+ 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem16 | ModRM | Mov,
+ /* 0xB8 - 0xBF */
+ 0, 0,
+ Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock,
+ 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem16 | ModRM | Mov,
+ /* 0xC0 - 0xCF */
+ 0, 0, 0, DstMem | SrcReg | ModRM | Mov,
+ 0, 0, 0, Group | GroupDual | Group9,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xD0 - 0xDF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xE0 - 0xEF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xF0 - 0xFF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+static uint32_t group_table[] = {
+ [Group1_80*8] =
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM,
+ [Group1_81*8] =
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM,
+ [Group1_82*8] =
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64,
+ [Group1_83*8] =
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM,
+ [Group1A*8] =
+ DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
+ [Group3_Byte*8] =
+ ByteOp | SrcImm | DstMem | ModRM, 0,
+ ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+ 0, 0, 0, 0,
+ [Group3*8] =
+ DstMem | SrcImm | ModRM, 0,
+ DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+ 0, 0, 0, 0,
+ [Group4*8] =
+ ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+ 0, 0, 0, 0, 0, 0,
+ [Group5*8] =
+ DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+ SrcMem | ModRM | Stack, 0,
+ SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
+ [Group7*8] =
+ 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
+ SrcNone | ModRM | DstMem | Mov, 0,
+ SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv,
+ [Group8*8] =
+ 0, 0, 0, 0,
+ DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
+ [Group9*8] =
+ 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0,
+};
+
+static uint32_t group2_table[] = {
+ [Group7*8] =
+ SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM,
+ SrcNone | ModRM | DstMem | Mov, 0,
+ SrcMem16 | ModRM | Mov, 0,
+ [Group9*8] =
+ 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+/* EFLAGS bit definitions. */
+#define EFLG_ID (1<<21)
+#define EFLG_VIP (1<<20)
+#define EFLG_VIF (1<<19)
+#define EFLG_AC (1<<18)
+#define EFLG_VM (1<<17)
+#define EFLG_RF (1<<16)
+#define EFLG_IOPL (3<<12)
+#define EFLG_NT (1<<14)
+#define EFLG_OF (1<<11)
+#define EFLG_DF (1<<10)
+#define EFLG_IF (1<<9)
+#define EFLG_TF (1<<8)
+#define EFLG_SF (1<<7)
+#define EFLG_ZF (1<<6)
+#define EFLG_AF (1<<4)
+#define EFLG_PF (1<<2)
+#define EFLG_CF (1<<0)
+
+/*
+ * Instruction emulation:
+ * Most instructions are emulated directly via a fragment of inline assembly
+ * code. This allows us to save/restore EFLAGS and thus very easily pick up
+ * any modified flags.
+ */
+
+#if defined(CONFIG_X86_64)
+#define _LO32 "k" /* force 32-bit operand */
+#define _STK "%%rsp" /* stack pointer */
+#elif defined(__i386__)
+#define _LO32 "" /* force 32-bit operand */
+#define _STK "%%esp" /* stack pointer */
+#endif
+
+/*
+ * These EFLAGS bits are restored from saved value during emulation, and
+ * any changes are written back to the saved value after emulation.
+ */
+#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
+
+/* Before executing instruction: restore necessary bits in EFLAGS. */
+#define _PRE_EFLAGS(_sav, _msk, _tmp) \
+ /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
+ "movl %"_sav",%"_LO32 _tmp"; " \
+ "push %"_tmp"; " \
+ "push %"_tmp"; " \
+ "movl %"_msk",%"_LO32 _tmp"; " \
+ "andl %"_LO32 _tmp",("_STK"); " \
+ "pushf; " \
+ "notl %"_LO32 _tmp"; " \
+ "andl %"_LO32 _tmp",("_STK"); " \
+ "andl %"_LO32 _tmp","__stringify(BT_NBIPUL/4)"("_STK"); " \
+ "pop %"_tmp"; " \
+ "orl %"_LO32 _tmp",("_STK"); " \
+ "popf; " \
+ "pop %"_sav"; "
+
+/* After executing instruction: write-back necessary bits in EFLAGS. */
+#define _POST_EFLAGS(_sav, _msk, _tmp) \
+ /* _sav |= EFLAGS & _msk; */ \
+ "pushf; " \
+ "pop %"_tmp"; " \
+ "andl %"_msk",%"_LO32 _tmp"; " \
+ "orl %"_LO32 _tmp",%"_sav"; "
+
+#ifdef CONFIG_X86_64
+#define ON64(x) x
+#else
+#define ON64(x)
+#endif
+
+#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \
+ do { \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "4", "2") \
+ _op _suffix " %"_x"3,%1; " \
+ _POST_EFLAGS("0", "4", "2") \
+ : "=m" (_eflags), "=m" ((_dst).val), \
+ "=&r" (_tmp) \
+ : _y ((_src).val), "i" (EFLAGS_MASK)); \
+ } while (0)
+
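+/*
+ * e.g. emulate_2op_SrcV("add", ...) with 4-byte operands boils down to
+ * running "addl %k3,%1" on the host, with the guest's arithmetic EFLAGS
+ * restored beforehand and captured again afterwards.
+ */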
+
+/* Raw emulation: instruction has two explicit operands. */
+#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
+ do { \
+ unsigned long _tmp; \
+ \
+ switch ((_dst).bytes) { \
+ case 2: \
+ ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \
+ break; \
+ case 4: \
+ ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \
+ break; \
+ case 8: \
+ ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \
+ break; \
+ } \
+ } while (0)
+
+#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
+ do { \
+ unsigned long _tmp; \
+ switch ((_dst).bytes) { \
+ case 1: \
+ ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \
+ break; \
+ default: \
+ __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
+ _wx, _wy, _lx, _ly, _qx, _qy); \
+ break; \
+ } \
+ } while (0)
+
+/* Source operand is byte-sized and may be restricted to just %cl. */
+#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \
+ __emulate_2op(_op, _src, _dst, _eflags, \
+ "b", "c", "b", "c", "b", "c", "b", "c")
+
+/* Source operand is byte, word, long or quad sized. */
+#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \
+ __emulate_2op(_op, _src, _dst, _eflags, \
+ "b", "q", "w", "r", _LO32, "r", "", "r")
+
+/* Source operand is word, long or quad sized. */
+#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \
+ __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
+ "w", "r", _LO32, "r", "", "r")
+
+/* Instruction has three operands and one operand is stored in ECX register */
+#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \
+ do { \
+ unsigned long _tmp; \
+ _type _clv = (_cl).val; \
+ _type _srcv = (_src).val; \
+ _type _dstv = (_dst).val; \
+ \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "5", "2") \
+ _op _suffix " %4,%1 \n" \
+ _POST_EFLAGS("0", "5", "2") \
+ : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \
+ : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
+ ); \
+ \
+ (_cl).val = (unsigned long) _clv; \
+ (_src).val = (unsigned long) _srcv; \
+ (_dst).val = (unsigned long) _dstv; \
+ } while (0)
+
+#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \
+ do { \
+ switch ((_dst).bytes) { \
+ case 2: \
+ __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
+ "w", unsigned short); \
+ break; \
+ case 4: \
+ __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
+ "l", unsigned int); \
+ break; \
+ case 8: \
+ ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
+ "q", unsigned long)); \
+ break; \
+ } \
+ } while (0)
+
+#define __emulate_1op(_op, _dst, _eflags, _suffix) \
+ do { \
+ unsigned long _tmp; \
+ \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "3", "2") \
+ _op _suffix " %1; " \
+ _POST_EFLAGS("0", "3", "2") \
+ : "=m" (_eflags), "+m" ((_dst).val), \
+ "=&r" (_tmp) \
+ : "i" (EFLAGS_MASK)); \
+ } while (0)
+
+/* Instruction has only one explicit operand (no source operand). */
+#define emulate_1op(_op, _dst, _eflags) \
+ do { \
+ switch ((_dst).bytes) { \
+ case 1: __emulate_1op(_op, _dst, _eflags, "b"); break; \
+ case 2: __emulate_1op(_op, _dst, _eflags, "w"); break; \
+ case 4: __emulate_1op(_op, _dst, _eflags, "l"); break; \
+ case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \
+ } \
+ } while (0)
+
+/* Fetch next part of the instruction being emulated. */
+#define insn_fetch(_type, _size, _eip) \
+({ unsigned long _x; \
+ rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
+ if (rc != 0) \
+ goto done; \
+ (_eip) += (_size); \
+ (_type)_x; \
+})
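+/* NB: a failed fetch jumps to a 'done' label that the caller must provide. */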
+
+static inline unsigned long ad_mask(struct decode_cache *c)
+{
+ return (1UL << (c->ad_bytes << 3)) - 1;
+}
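+/* e.g. ad_bytes == 2 gives 0xffff; callers special-case the full-width size. */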
+
+/* Access/update address held in a register, based on addressing mode. */
+static inline unsigned long
+address_mask(struct decode_cache *c, unsigned long reg)
+{
+ if (c->ad_bytes == sizeof(unsigned long))
+ return reg;
+ else
+ return reg & ad_mask(c);
+}
+
+static inline unsigned long
+register_address(struct decode_cache *c, unsigned long base, unsigned long reg)
+{
+ return base + address_mask(c, reg);
+}
+
+static inline void
+register_address_increment(struct decode_cache *c, unsigned long *reg, int inc)
+{
+ if (c->ad_bytes == sizeof(unsigned long))
+ *reg += inc;
+ else
+ *reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c));
+}
+
+static inline void jmp_rel(struct decode_cache *c, int rel)
+{
+ register_address_increment(c, &c->eip, rel);
+}
+
+static void set_seg_override(struct decode_cache *c, int seg)
+{
+ c->has_seg_override = 1;
+ c->seg_override = seg;
+}
+
+static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
+{
+ if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
+ return 0;
+
+ return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg);
+}
+
+static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
+ struct decode_cache *c)
+{
+ if (!c->has_seg_override)
+ return 0;
+
+ return seg_base(ctxt, c->seg_override);
+}
+
+static unsigned long es_base(struct x86_emulate_ctxt *ctxt)
+{
+ return seg_base(ctxt, VCPU_SREG_ES);
+}
+
+static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
+{
+ return seg_base(ctxt, VCPU_SREG_SS);
+}
+
+static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ unsigned long linear, uint8_t *dest)
+{
+ struct fetch_cache *fc = &ctxt->decode.fetch;
+ int rc;
+ int size;
+
+ if (linear < fc->start || linear >= fc->end) {
+ size = min(15UL, PAGESIZE - offset_in_page(linear));
+ rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL);
+ if (rc)
+ return rc;
+ fc->start = linear;
+ fc->end = linear + size;
+ }
+ *dest = fc->data[linear - fc->start];
+ return 0;
+}
+
+static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ unsigned long eip, void *dest, unsigned size)
+{
+ int rc = 0;
+
+ /* x86 instructions are limited to 15 bytes. */
+ if (eip + size - ctxt->decode.eip_orig > 15)
+ return X86EMUL_UNHANDLEABLE;
+ eip += ctxt->cs_base;
+ while (size--) {
+ rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
+ if (rc)
+ return rc;
+ }
+ return 0;
+}
+
+/*
+ * Given the 'reg' portion of a ModRM byte, and a register block, return a
+ * pointer into the block that addresses the relevant register.
+ * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
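+ * (modrm_reg values 4..7 then select AH, CH, DH and BH respectively.)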
+ */
+static void *decode_register(uint8_t modrm_reg, unsigned long *regs,
+ int highbyte_regs)
+{
+ void *p;
+
+ p = &regs[modrm_reg];
+ if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
+ p = (unsigned char *)&regs[modrm_reg & 3] + 1;
+ return p;
+}
+
+static int read_descriptor(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ void *ptr,
+ uint16_t *size, unsigned long *address, int op_bytes)
+{
+ int rc;
+
+ if (op_bytes == 2)
+ op_bytes = 3;
+ *address = 0;
+ rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
+ ctxt->vcpu, NULL);
+ if (rc)
+ return rc;
+ rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
+ ctxt->vcpu, NULL);
+ return rc;
+}
+
+static int test_cc(unsigned int condition, unsigned int flags)
+{
+ int rc = 0;
+
+ switch ((condition & 15) >> 1) {
+ case 0: /* o */
+ rc |= (flags & EFLG_OF);
+ break;
+ case 1: /* b/c/nae */
+ rc |= (flags & EFLG_CF);
+ break;
+ case 2: /* z/e */
+ rc |= (flags & EFLG_ZF);
+ break;
+ case 3: /* be/na */
+ rc |= (flags & (EFLG_CF|EFLG_ZF));
+ break;
+ case 4: /* s */
+ rc |= (flags & EFLG_SF);
+ break;
+ case 5: /* p/pe */
+ rc |= (flags & EFLG_PF);
+ break;
+ case 7: /* le/ng */
+ rc |= (flags & EFLG_ZF);
+ /* fall through */
+ case 6: /* l/nge */
+ rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
+ break;
+ }
+
+ /* Odd condition identifiers (lsb == 1) have inverted sense. */
+ return (!!rc ^ (condition & 1));
+}
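+/* e.g. condition 0x4 (e/z) returns ZF; condition 0x5 (ne/nz) inverts it. */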
+
+static void decode_register_operand(struct operand *op,
+ struct decode_cache *c,
+ int inhibit_bytereg)
+{
+ unsigned reg = c->modrm_reg;
+ int highbyte_regs = c->rex_prefix == 0;
+
+ if (!(c->d & ModRM))
+ reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
+ op->type = OP_REG;
+ if ((c->d & ByteOp) && !inhibit_bytereg) {
+ op->ptr = decode_register(reg, c->regs, highbyte_regs);
+ op->val = *(uint8_t *)op->ptr;
+ op->bytes = 1;
+ } else {
+ op->ptr = decode_register(reg, c->regs, 0);
+ op->bytes = c->op_bytes;
+ switch (op->bytes) {
+ case 2:
+ op->val = *(uint16_t *)op->ptr;
+ break;
+ case 4:
+ op->val = *(uint32_t *)op->ptr;
+ break;
+ case 8:
+ op->val = *(uint64_t *) op->ptr;
+ break;
+ }
+ }
+ op->orig_val = op->val;
+}
+
+static int decode_modrm(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ uint8_t sib;
+ int index_reg = 0, base_reg = 0, scale;
+ int rc = 0;
+
+ if (c->rex_prefix) {
+ c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
+ index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
+ c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REX.B */
+ }
+
+ c->modrm = insn_fetch(uint8_t, 1, c->eip);
+ c->modrm_mod |= (c->modrm & 0xc0) >> 6;
+ c->modrm_reg |= (c->modrm & 0x38) >> 3;
+ c->modrm_rm |= (c->modrm & 0x07);
+ c->modrm_ea = 0;
+ c->use_modrm_ea = 1;
+
+ if (c->modrm_mod == 3) {
+ c->modrm_ptr = decode_register(c->modrm_rm,
+ c->regs, c->d & ByteOp);
+ c->modrm_val = *(unsigned long *)c->modrm_ptr;
+ return rc;
+ }
+
+ if (c->ad_bytes == 2) {
+ unsigned bx = c->regs[VCPU_REGS_RBX];
+ unsigned bp = c->regs[VCPU_REGS_RBP];
+ unsigned si = c->regs[VCPU_REGS_RSI];
+ unsigned di = c->regs[VCPU_REGS_RDI];
+
+ /* 16-bit ModR/M decode. */
+ switch (c->modrm_mod) {
+ case 0:
+ if (c->modrm_rm == 6)
+ c->modrm_ea += insn_fetch(uint16_t, 2, c->eip);
+ break;
+ case 1:
+ c->modrm_ea += insn_fetch(int8_t, 1, c->eip);
+ break;
+ case 2:
+ c->modrm_ea += insn_fetch(uint16_t, 2, c->eip);
+ break;
+ }
+ switch (c->modrm_rm) {
+ case 0:
+ c->modrm_ea += bx + si;
+ break;
+ case 1:
+ c->modrm_ea += bx + di;
+ break;
+ case 2:
+ c->modrm_ea += bp + si;
+ break;
+ case 3:
+ c->modrm_ea += bp + di;
+ break;
+ case 4:
+ c->modrm_ea += si;
+ break;
+ case 5:
+ c->modrm_ea += di;
+ break;
+ case 6:
+ if (c->modrm_mod != 0)
+ c->modrm_ea += bp;
+ break;
+ case 7:
+ c->modrm_ea += bx;
+ break;
+ }
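+ /* bp-based addressing defaults to the stack segment. */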
+ if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
+ (c->modrm_rm == 6 && c->modrm_mod != 0))
+ if (!c->has_seg_override)
+ set_seg_override(c, VCPU_SREG_SS);
+ c->modrm_ea = (uint16_t)c->modrm_ea;
+ } else {
+ /* 32/64-bit ModR/M decode. */
+ if ((c->modrm_rm & 7) == 4) {
+ sib = insn_fetch(uint8_t, 1, c->eip);
+ index_reg |= (sib >> 3) & 7;
+ base_reg |= sib & 7;
+ scale = sib >> 6;
+
+ if ((base_reg & 7) == 5 && c->modrm_mod == 0)
+ c->modrm_ea += insn_fetch(int32_t, 4, c->eip);
+ else
+ c->modrm_ea += c->regs[base_reg];
+ if (index_reg != 4)
+ c->modrm_ea += c->regs[index_reg] << scale;
+ } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ c->rip_relative = 1;
+ } else
+ c->modrm_ea += c->regs[c->modrm_rm];
+ switch (c->modrm_mod) {
+ case 0:
+ if (c->modrm_rm == 5)
+ c->modrm_ea += insn_fetch(int32_t, 4, c->eip);
+ break;
+ case 1:
+ c->modrm_ea += insn_fetch(int8_t, 1, c->eip);
+ break;
+ case 2:
+ c->modrm_ea += insn_fetch(int32_t, 4, c->eip);
+ break;
+ }
+ }
+done:
+ return rc;
+}
+
+static int decode_abs(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc = 0;
+
+ switch (c->ad_bytes) {
+ case 2:
+ c->modrm_ea = insn_fetch(uint16_t, 2, c->eip);
+ break;
+ case 4:
+ c->modrm_ea = insn_fetch(uint32_t, 4, c->eip);
+ break;
+ case 8:
+ c->modrm_ea = insn_fetch(uint64_t, 8, c->eip);
+ break;
+ }
+done:
+ return rc;
+}
+
+unsigned long kvm_rip_read(struct kvm_vcpu *vcpu);
+void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val);
+
+int
+x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc = 0;
+ int mode = ctxt->mode;
+ int def_op_bytes, def_ad_bytes, group;
+
+ /* Shadow copy of register state. Committed on successful emulation. */
+
+ memset(c, 0, sizeof(struct decode_cache));
+ c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu);
+ ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
+ memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+
+ switch (mode) {
+ case X86EMUL_MODE_REAL:
+ case X86EMUL_MODE_VM86:
+ case X86EMUL_MODE_PROT16:
+ def_op_bytes = def_ad_bytes = 2;
+ break;
+ case X86EMUL_MODE_PROT32:
+ def_op_bytes = def_ad_bytes = 4;
+ break;
+#ifdef CONFIG_X86_64
+ case X86EMUL_MODE_PROT64:
+ def_op_bytes = 4;
+ def_ad_bytes = 8;
+ break;
+#endif
+ default:
+ return -1;
+ }
+
+ c->op_bytes = def_op_bytes;
+ c->ad_bytes = def_ad_bytes;
+
+ /* Legacy prefixes. */
+ for (;;) {
+ switch (c->b = insn_fetch(uint8_t, 1, c->eip)) {
+ case 0x66: /* operand-size override */
+ /* switch between 2/4 bytes */
+ c->op_bytes = def_op_bytes ^ 6;
+ break;
+ case 0x67: /* address-size override */
+ if (mode == X86EMUL_MODE_PROT64)
+ /* switch between 4/8 bytes */
+ c->ad_bytes = def_ad_bytes ^ 12;
+ else
+ /* switch between 2/4 bytes */
+ c->ad_bytes = def_ad_bytes ^ 6;
+ break;
+ case 0x26: /* ES override */
+ case 0x2e: /* CS override */
+ case 0x36: /* SS override */
+ case 0x3e: /* DS override */
+ set_seg_override(c, (c->b >> 3) & 3);
+ break;
+ case 0x64: /* FS override */
+ case 0x65: /* GS override */
+ set_seg_override(c, c->b & 7);
+ break;
+ case 0x40 ... 0x4f: /* REX */
+ if (mode != X86EMUL_MODE_PROT64)
+ goto done_prefixes;
+ c->rex_prefix = c->b;
+ continue;
+ case 0xf0: /* LOCK */
+ c->lock_prefix = 1;
+ break;
+ case 0xf2: /* REPNE/REPNZ */
+ c->rep_prefix = REPNE_PREFIX;
+ break;
+ case 0xf3: /* REP/REPE/REPZ */
+ c->rep_prefix = REPE_PREFIX;
+ break;
+ default:
+ goto done_prefixes;
+ }
+
+ /* Any legacy prefix after a REX prefix nullifies its effect. */
+
+ c->rex_prefix = 0;
+ }
+
+done_prefixes:
+
+ /* REX prefix. */
+ if (c->rex_prefix)
+ if (c->rex_prefix & 8)
+ c->op_bytes = 8; /* REX.W */
+
+ /* Opcode byte(s). */
+ c->d = opcode_table[c->b];
+ if (c->d == 0) {
+ /* Two-byte opcode? */
+ if (c->b == 0x0f) {
+ c->twobyte = 1;
+ c->b = insn_fetch(uint8_t, 1, c->eip);
+ c->d = twobyte_table[c->b];
+ }
+ }
+
+ if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
+ cmn_err(CE_WARN, "invalid x86/64 instruction");
+ return -1;
+ }
+
+ if (c->d & Group) {
+ group = c->d & GroupMask;
+ c->modrm = insn_fetch(uint8_t, 1, c->eip);
+ --c->eip;
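+ /* peek only: decode_modrm() will fetch this byte again */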
+
+ group = (group << 3) + ((c->modrm >> 3) & 7);
+ if ((c->d & GroupDual) && (c->modrm >> 6) == 3)
+ c->d = group2_table[group];
+ else
+ c->d = group_table[group];
+ }
+
+ /* Unrecognised? */
+ if (c->d == 0) {
+ DPRINTF("Cannot emulate %02x\n", c->b);
+ return -1;
+ }
+
+ if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
+ c->op_bytes = 8;
+
+ /* ModRM and SIB bytes. */
+ if (c->d & ModRM)
+ rc = decode_modrm(ctxt, ops);
+ else if (c->d & MemAbs)
+ rc = decode_abs(ctxt, ops);
+ if (rc)
+ goto done;
+
+ if (!c->has_seg_override)
+ set_seg_override(c, VCPU_SREG_DS);
+
+ if (!(!c->twobyte && c->b == 0x8d))
+ c->modrm_ea += seg_override_base(ctxt, c);
+
+ if (c->ad_bytes != 8)
+ c->modrm_ea = (uint32_t)c->modrm_ea;
+ /*
+ * Decode and fetch the source operand: register, memory
+ * or immediate.
+ */
+ switch (c->d & SrcMask) {
+ case SrcNone:
+ break;
+ case SrcReg:
+ decode_register_operand(&c->src, c, 0);
+ break;
+ case SrcMem16:
+ c->src.bytes = 2;
+ goto srcmem_common;
+ case SrcMem32:
+ c->src.bytes = 4;
+ goto srcmem_common;
+ case SrcMem:
+ c->src.bytes = (c->d & ByteOp) ? 1 :
+ c->op_bytes;
+ /* Don't fetch the address for invlpg: it could be unmapped. */
+ if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
+ break;
+ srcmem_common:
+ /*
+ * For instructions with a ModR/M byte, switch to register
+ * access if Mod = 3.
+ */
+ if ((c->d & ModRM) && c->modrm_mod == 3) {
+ c->src.type = OP_REG;
+ c->src.val = c->modrm_val;
+ c->src.ptr = c->modrm_ptr;
+ break;
+ }
+ c->src.type = OP_MEM;
+ break;
+ case SrcImm:
+ case SrcImmU:
+ c->src.type = OP_IMM;
+ c->src.ptr = (unsigned long *)c->eip;
+ c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ if (c->src.bytes == 8)
+ c->src.bytes = 4;
+ /* NB. Immediates are sign-extended as necessary. */
+ switch (c->src.bytes) {
+ case 1:
+ c->src.val = insn_fetch(int8_t, 1, c->eip);
+ break;
+ case 2:
+ c->src.val = insn_fetch(int16_t, 2, c->eip);
+ break;
+ case 4:
+ c->src.val = insn_fetch(int32_t, 4, c->eip);
+ break;
+ }
+ if ((c->d & SrcMask) == SrcImmU) {
+ switch (c->src.bytes) {
+ case 1:
+ c->src.val &= 0xff;
+ break;
+ case 2:
+ c->src.val &= 0xffff;
+ break;
+ case 4:
+ c->src.val &= 0xffffffff;
+ break;
+ }
+ }
+ break;
+ case SrcImmByte:
+ case SrcImmUByte:
+ c->src.type = OP_IMM;
+ c->src.ptr = (unsigned long *)c->eip;
+ c->src.bytes = 1;
+ if ((c->d & SrcMask) == SrcImmByte)
+ c->src.val = insn_fetch(int8_t, 1, c->eip);
+ else
+ c->src.val = insn_fetch(uint8_t, 1, c->eip);
+ break;
+ case SrcOne:
+ c->src.bytes = 1;
+ c->src.val = 1;
+ break;
+ }
+
+ /*
+ * Decode and fetch the second source operand: register, memory
+ * or immediate.
+ */
+ switch (c->d & Src2Mask) {
+ case Src2None:
+ break;
+ case Src2CL:
+ c->src2.bytes = 1;
+ c->src2.val = c->regs[VCPU_REGS_RCX] & 0xff; /* CL */
+ break;
+ case Src2ImmByte:
+ c->src2.type = OP_IMM;
+ c->src2.ptr = (unsigned long *)c->eip;
+ c->src2.bytes = 1;
+ c->src2.val = insn_fetch(uint8_t, 1, c->eip);
+ break;
+ case Src2Imm16:
+ c->src2.type = OP_IMM;
+ c->src2.ptr = (unsigned long *)c->eip;
+ c->src2.bytes = 2;
+ c->src2.val = insn_fetch(uint16_t, 2, c->eip);
+ break;
+ case Src2One:
+ c->src2.bytes = 1;
+ c->src2.val = 1;
+ break;
+ }
+
+ /* Decode and fetch the destination operand: register or memory. */
+ switch (c->d & DstMask) {
+ case ImplicitOps:
+ /* Special instructions do their own operand decoding. */
+ return 0;
+ case DstReg:
+ decode_register_operand(&c->dst, c,
+ c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
+ break;
+ case DstMem:
+ if ((c->d & ModRM) && c->modrm_mod == 3) {
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.type = OP_REG;
+ c->dst.val = c->dst.orig_val = c->modrm_val;
+ c->dst.ptr = c->modrm_ptr;
+ break;
+ }
+ c->dst.type = OP_MEM;
+ break;
+ case DstAcc:
+ c->dst.type = OP_REG;
+ c->dst.bytes = c->op_bytes;
+ c->dst.ptr = &c->regs[VCPU_REGS_RAX];
+ switch (c->op_bytes) {
+ case 1:
+ c->dst.val = *(uint8_t *)c->dst.ptr;
+ break;
+ case 2:
+ c->dst.val = *(uint16_t *)c->dst.ptr;
+ break;
+ case 4:
+ c->dst.val = *(uint32_t *)c->dst.ptr;
+ break;
+ }
+ c->dst.orig_val = c->dst.val;
+ break;
+ }
+
+ if (c->rip_relative)
+ c->modrm_ea += c->eip;
+
+done:
+ return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+}
+
+static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
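+ /* Only queue the store here; writeback() performs the memory write. */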
+ c->dst.type = OP_MEM;
+ c->dst.bytes = c->op_bytes;
+ c->dst.val = c->src.val;
+ register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
+ c->dst.ptr = (void *) register_address(c, ss_base(ctxt),
+ c->regs[VCPU_REGS_RSP]);
+}
+
+static int emulate_pop(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ void *dest, int len)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc;
+
+ rc = ops->read_emulated(register_address(c, ss_base(ctxt),
+ c->regs[VCPU_REGS_RSP]),
+ dest, len, ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ register_address_increment(c, &c->regs[VCPU_REGS_RSP], len);
+ return rc;
+}
+
+static int emulate_popf(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ void *dest, int len)
+{
+ int rc;
+ unsigned long val, change_mask;
+ int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
+ int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu);
+
+ rc = emulate_pop(ctxt, ops, &val, len);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF
+ | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID;
+
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_PROT64:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT16:
+ if (cpl == 0)
+ change_mask |= EFLG_IOPL;
+ if (cpl <= iopl)
+ change_mask |= EFLG_IF;
+ break;
+ case X86EMUL_MODE_VM86:
+ if (iopl < 3) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ change_mask |= EFLG_IF;
+ break;
+ default: /* real mode */
+ change_mask |= (EFLG_IOPL | EFLG_IF);
+ break;
+ }
+
+ *(unsigned long *)dest =
+ (ctxt->eflags & ~change_mask) | (val & change_mask);
+
+ return rc;
+}
+
+static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
+{
+ struct decode_cache *c = &ctxt->decode;
+ struct kvm_segment segment;
+
+ kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg);
+
+ c->src.val = segment.selector;
+ emulate_push(ctxt);
+}
+
+static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, int seg)
+{
+ struct decode_cache *c = &ctxt->decode;
+ unsigned long selector;
+ int rc;
+
+ rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
+ if (rc != 0)
+ return rc;
+
+ rc = kvm_load_segment_descriptor(ctxt->vcpu, (uint16_t)selector, seg);
+ return rc;
+}
+
+static void emulate_pusha(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ unsigned long old_esp = c->regs[VCPU_REGS_RSP];
+ int reg = VCPU_REGS_RAX;
+
+ while (reg <= VCPU_REGS_RDI) {
+ c->src.val = (reg == VCPU_REGS_RSP) ? old_esp : c->regs[reg];
+
+ emulate_push(ctxt);
+ ++reg;
+ }
+}
+
+static int emulate_popa(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc = 0;
+ int reg = VCPU_REGS_RDI;
+
+ while (reg >= VCPU_REGS_RAX) {
+ if (reg == VCPU_REGS_RSP) {
+ register_address_increment(c, &c->regs[VCPU_REGS_RSP],
+ c->op_bytes);
+ --reg;
+ }
+
+ rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
+ if (rc != 0)
+ break;
+ --reg;
+ }
+ return rc;
+}
+
+static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc;
+
+ rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
+ if (rc != 0)
+ return rc;
+ return 0;
+}
+
+static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ switch (c->modrm_reg) {
+ case 0: /* rol */
+ emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
+ break;
+ case 1: /* ror */
+ emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
+ break;
+ case 2: /* rcl */
+ emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
+ break;
+ case 3: /* rcr */
+ emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
+ break;
+ case 4: /* sal/shl */
+ case 6: /* sal/shl */
+ emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
+ break;
+ case 5: /* shr */
+ emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
+ break;
+ case 7: /* sar */
+ emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
+ break;
+ }
+}
+
+static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc = 0;
+
+ switch (c->modrm_reg) {
+ case 0 ... 1: /* test */
+ emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
+ break;
+ case 2: /* not */
+ c->dst.val = ~c->dst.val;
+ break;
+ case 3: /* neg */
+ emulate_1op("neg", c->dst, ctxt->eflags);
+ break;
+ default:
+ DPRINTF("Cannot emulate %02x\n", c->b);
+ rc = X86EMUL_UNHANDLEABLE;
+ break;
+ }
+ return rc;
+}
+
+static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ switch (c->modrm_reg) {
+ case 0: /* inc */
+ emulate_1op("inc", c->dst, ctxt->eflags);
+ break;
+ case 1: /* dec */
+ emulate_1op("dec", c->dst, ctxt->eflags);
+ break;
+ case 2: /* call near abs */ {
+ long int old_eip;
+ old_eip = c->eip;
+ c->eip = c->src.val;
+ c->src.val = old_eip;
+ emulate_push(ctxt);
+ break;
+ }
+ case 4: /* jmp abs */
+ c->eip = c->src.val;
+ break;
+ case 6: /* push */
+ emulate_push(ctxt);
+ break;
+ }
+ return 0;
+}
+
+static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ unsigned long memop)
+{
+ struct decode_cache *c = &ctxt->decode;
+ uint64_t old, new;
+ int rc;
+
+ rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ if (((uint32_t) (old >> 0) != (uint32_t) c->regs[VCPU_REGS_RAX]) ||
+ ((uint32_t) (old >> 32) != (uint32_t) c->regs[VCPU_REGS_RDX])) {
+
+ c->regs[VCPU_REGS_RAX] = (uint32_t) (old >> 0);
+ c->regs[VCPU_REGS_RDX] = (uint32_t) (old >> 32);
+ ctxt->eflags &= ~EFLG_ZF;
+
+ } else {
+ new = ((uint64_t)c->regs[VCPU_REGS_RCX] << 32) |
+ (uint32_t) c->regs[VCPU_REGS_RBX];
+
+ rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ ctxt->eflags |= EFLG_ZF;
+ }
+ return 0;
+}
+
+static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc;
+ unsigned long cs;
+
+ rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes);
+ if (rc)
+ return rc;
+ if (c->op_bytes == 4)
+ c->eip = (uint32_t)c->eip;
+ rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
+ if (rc)
+ return rc;
+ rc = kvm_load_segment_descriptor(ctxt->vcpu, (uint16_t)cs, VCPU_SREG_CS);
+ return rc;
+}
+
+static inline int writeback(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ int rc;
+ struct decode_cache *c = &ctxt->decode;
+
+ switch (c->dst.type) {
+ case OP_REG:
+ /* The 4-byte case *is* correct:
+ * in 64-bit mode we zero-extend.
+ */
+ switch (c->dst.bytes) {
+ case 1:
+ *(uint8_t *)c->dst.ptr = (uint8_t)c->dst.val;
+ break;
+ case 2:
+ *(uint16_t *)c->dst.ptr = (uint16_t)c->dst.val;
+ break;
+ case 4:
+ *c->dst.ptr = (uint32_t)c->dst.val;
+ break; /* 64b: zero-ext */
+ case 8:
+ *c->dst.ptr = c->dst.val;
+ break;
+ }
+ break;
+ case OP_MEM:
+ if (c->lock_prefix)
+ rc = ops->cmpxchg_emulated(
+ (unsigned long)c->dst.ptr,
+ &c->dst.orig_val,
+ &c->dst.val,
+ c->dst.bytes,
+ ctxt->vcpu);
+ else
+ rc = ops->write_emulated(
+ (unsigned long)c->dst.ptr,
+ &c->dst.val,
+ c->dst.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ break;
+ case OP_NONE:
+ /* no writeback */
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
+static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, uint32_t mask)
+{
+ uint32_t int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
+ /*
+ * An "sti; sti" sequence only disables interrupts for the first
+ * instruction. So if the last instruction, emulated or not, left
+ * the system with the INT_STI flag enabled, that instruction was
+ * an sti and we should not leave the flag on. The same goes for
+ * "mov ss".
+ */
+ if (!(int_shadow & mask))
+ ctxt->interruptibility = mask;
+}
+
+static inline void
+setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
+ struct kvm_segment *cs, struct kvm_segment *ss)
+{
+ memset(cs, 0, sizeof(struct kvm_segment));
+ kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS);
+ memset(ss, 0, sizeof(struct kvm_segment));
+
+ cs->l = 0; /* will be adjusted later */
+ cs->base = 0; /* flat segment */
+ cs->g = 1; /* 4kb granularity */
+ cs->limit = 0xffffffff; /* 4GB limit */
+ cs->type = 0x0b; /* Read, Execute, Accessed */
+ cs->s = 1;
+ cs->dpl = 0; /* will be adjusted later */
+ cs->present = 1;
+ cs->db = 1;
+
+ ss->unusable = 0;
+ ss->base = 0; /* flat segment */
+ ss->limit = 0xffffffff; /* 4GB limit */
+ ss->g = 1; /* 4kb granularity */
+ ss->s = 1;
+ ss->type = 0x03; /* Read/Write, Accessed */
+ ss->db = 1; /* 32bit stack segment */
+ ss->dpl = 0;
+ ss->present = 1;
+}
+
+extern int is_long_mode(struct kvm_vcpu *vcpu);
+
+static int
+emulate_syscall(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ struct kvm_segment cs, ss;
+ uint64_t msr_data;
+
+ /* syscall is not available in real mode */
+ if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86)
+ return X86EMUL_UNHANDLEABLE;
+
+ setup_syscalls_segments(ctxt, &cs, &ss);
+
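+ /* STAR[47:32] holds the SYSCALL CS selector; SS is that value + 8. */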
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+ msr_data >>= 32;
+ cs.selector = (uint16_t)(msr_data & 0xfffc);
+ ss.selector = (uint16_t)(msr_data + 8);
+
+ if (is_long_mode(ctxt->vcpu)) {
+ cs.db = 0;
+ cs.l = 1;
+ }
+ kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
+ kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+
+ c->regs[VCPU_REGS_RCX] = c->eip;
+ if (is_long_mode(ctxt->vcpu)) {
+#ifdef CONFIG_X86_64
+ c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
+
+ kvm_x86_ops->get_msr(ctxt->vcpu,
+ ctxt->mode == X86EMUL_MODE_PROT64 ?
+ MSR_LSTAR : MSR_CSTAR, &msr_data);
+ c->eip = msr_data;
+
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
+ ctxt->eflags &= ~(msr_data | EFLG_RF);
+#endif
+ } else {
+ /* legacy mode */
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+ c->eip = (uint32_t)msr_data;
+
+ ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static int
+emulate_sysenter(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ struct kvm_segment cs, ss;
+ uint64_t msr_data;
+
+ /* inject #GP if in real mode */
+ if (ctxt->mode == X86EMUL_MODE_REAL) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+ /*
+ * XXX sysenter/sysexit have not been tested in 64-bit mode, so
+ * refuse to emulate them there.
+ */
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return X86EMUL_UNHANDLEABLE;
+
+ setup_syscalls_segments(ctxt, &cs, &ss);
+
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+ switch (ctxt->mode) {
+ case X86EMUL_MODE_PROT32:
+ if ((msr_data & 0xfffc) == 0x0) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ break;
+ case X86EMUL_MODE_PROT64:
+ if (msr_data == 0x0) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ break;
+ }
+
+ ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
+ cs.selector = (uint16_t)msr_data;
+ cs.selector &= ~SELECTOR_RPL_MASK;
+ ss.selector = cs.selector + 8;
+ ss.selector &= ~SELECTOR_RPL_MASK;
+ if (ctxt->mode == X86EMUL_MODE_PROT64
+ || is_long_mode(ctxt->vcpu)) {
+ cs.db = 0;
+ cs.l = 1;
+ }
+
+ kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
+ kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
+ c->eip = msr_data;
+
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
+ c->regs[VCPU_REGS_RSP] = msr_data;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int
+emulate_sysexit(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ struct kvm_segment cs, ss;
+ uint64_t msr_data;
+ int usermode;
+
+ /* inject #GP if in real mode or Virtual 8086 mode */
+ if (ctxt->mode == X86EMUL_MODE_REAL ||
+ ctxt->mode == X86EMUL_MODE_VM86) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+ setup_syscalls_segments(ctxt, &cs, &ss);
+
+ if ((c->rex_prefix & 0x8) != 0x0)
+ usermode = X86EMUL_MODE_PROT64;
+ else
+ usermode = X86EMUL_MODE_PROT32;
+
+ cs.dpl = 3;
+ ss.dpl = 3;
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+ switch (usermode) {
+ case X86EMUL_MODE_PROT32:
+ cs.selector = (uint16_t)(msr_data + 16);
+ if ((msr_data & 0xfffc) == 0x0) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ ss.selector = (uint16_t)(msr_data + 24);
+ break;
+ case X86EMUL_MODE_PROT64:
+ cs.selector = (uint16_t)(msr_data + 32);
+ if (msr_data == 0x0) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ ss.selector = cs.selector + 8;
+ cs.db = 0;
+ cs.l = 1;
+ break;
+ }
+ cs.selector |= SELECTOR_RPL_MASK;
+ ss.selector |= SELECTOR_RPL_MASK;
+
+ kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
+ kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+
+ c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
+ c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
+
+ return X86EMUL_CONTINUE;
+}
+
+static int emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
+{
+ int iopl;
+ if (ctxt->mode == X86EMUL_MODE_REAL)
+ return 0;
+ if (ctxt->mode == X86EMUL_MODE_VM86)
+ return 1;
+ iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
+ return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl;
+}
+
+static int emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ uint16_t port, uint16_t len)
+{
+ struct kvm_segment tr_seg;
+ int r;
+ uint16_t io_bitmap_ptr;
+ uint8_t perm, bit_idx = port & 0x7;
+ unsigned mask = (1 << len) - 1;
+
+ kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR);
+ if (tr_seg.unusable)
+ return 0;
+ if (tr_seg.limit < 103)
+ return 0;
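+ /* The I/O permission map base is at offset 102 of the 32-bit TSS. */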
+ r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu,
+ NULL);
+ if (r != X86EMUL_CONTINUE)
+ return 0;
+ if (io_bitmap_ptr + port/8 > tr_seg.limit)
+ return 0;
+ r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1,
+ ctxt->vcpu, NULL);
+ if (r != X86EMUL_CONTINUE)
+ return 0;
+ if ((perm >> bit_idx) & mask)
+ return 0;
+ return 1;
+}
+
+static int emulator_io_permited(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ uint16_t port, uint16_t len)
+{
+ if (emulator_bad_iopl(ctxt))
+ if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
+ return 0;
+ return 1;
+}
+
+int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
+{
+#ifdef XXX
+ kvm_mmu_invlpg(vcpu, address);
+#endif /*XXX*/
+ return X86EMUL_CONTINUE;
+}
+
+void realmode_lgdt(struct kvm_vcpu *vcpu, uint16_t limit, unsigned long base)
+{
+ struct descriptor_table dt = { limit, base };
+#ifdef XXX
+ kvm_x86_ops->set_gdt(vcpu, &dt);
+#endif /*XXX*/
+}
+
+void realmode_lidt(struct kvm_vcpu *vcpu, uint16_t limit, unsigned long base)
+{
+ struct descriptor_table dt = { limit, base };
+#ifdef XXX
+ kvm_x86_ops->set_idt(vcpu, &dt);
+#endif /*XXX*/
+}
+
+void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
+ unsigned long *rflags)
+{
+#ifdef XXX
+ kvm_lmsw(vcpu, msw);
+#endif /*XXX*/
+ *rflags = kvm_get_rflags(vcpu);
+}
+
+int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
+{
+#ifdef XXX
+ char instruction[3];
+ unsigned long rip = kvm_rip_read(vcpu);
+
+ /*
+ * Blow out the MMU to ensure that no other VCPU has an active mapping
+ * to ensure that the updated hypercall appears atomically across all
+ * VCPUs.
+ */
+ kvm_mmu_zap_all(vcpu->kvm);
+
+ kvm_x86_ops->patch_hypercall(vcpu, instruction);
+
+ return emulator_write_emulated(rip, instruction, 3, vcpu);
+#else
+ return 1;
+#endif /*XXX*/
+}
+
+extern ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask);
+extern ulong kvm_read_cr4(struct kvm_vcpu *vcpu);
+
+unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
+{
+ unsigned long value;
+
+ switch (cr) {
+ case 0:
+ value = kvm_read_cr0(vcpu);
+ break;
+ case 2:
+ value = vcpu->arch.cr2;
+ break;
+ case 3:
+ value = vcpu->arch.cr3;
+ break;
+ case 4:
+ value = kvm_read_cr4(vcpu);
+ break;
+ case 8:
+ value = kvm_get_cr8(vcpu);
+ break;
+ default:
+ cmn_err(CE_NOTE, "%s: unexpected cr %u\n", __func__, cr);
+ return 0;
+ }
+
+ return value;
+}
+
+static uint64_t mk_cr_64(uint64_t curr_cr, uint32_t new_val)
+{
+ return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
+}
+
+void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
+ unsigned long *rflags)
+{
+ switch (cr) {
+ case 0:
+ kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
+ *rflags = kvm_get_rflags(vcpu);
+ break;
+ case 2:
+ vcpu->arch.cr2 = val;
+ break;
+ case 3:
+ kvm_set_cr3(vcpu, val);
+ break;
+ case 4:
+ kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
+ break;
+ case 8:
+ kvm_set_cr8(vcpu, val & 0xfUL);
+ break;
+ default:
+ cmn_err(CE_NOTE, "%s: unexpected cr %u\n", __func__, cr);
+ }
+}
+
+int
+x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+{
+ unsigned long memop = 0;
+ uint64_t msr_data;
+ unsigned long saved_eip = 0;
+ struct decode_cache *c = &ctxt->decode;
+ unsigned int port;
+ int io_dir_in;
+ int rc = 0;
+
+ ctxt->interruptibility = 0;
+
+ /* Shadow copy of register state. Committed on successful emulation.
+ * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
+ * modify them.
+ */
+
+ memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+ saved_eip = c->eip;
+
+ /* LOCK prefix is allowed only with some instructions */
+ if (c->lock_prefix && !(c->d & Lock)) {
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ goto done;
+ }
+
+ /* Privileged instruction can be executed only in CPL=0 */
+ if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ goto done;
+ }
+
+ if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
+ memop = c->modrm_ea;
+
+ if (c->rep_prefix && (c->d & String)) {
+ /* All REP prefixes have the same first termination condition */
+ if (c->regs[VCPU_REGS_RCX] == 0) {
+ kvm_rip_write(ctxt->vcpu, c->eip);
+ goto done;
+ }
+ /* The second termination condition only applies for REPE
+ * and REPNE. Test if the repeat string operation prefix is
+ * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
+ * corresponding termination condition according to:
+ * - if REPE/REPZ and ZF = 0 then done
+ * - if REPNE/REPNZ and ZF = 1 then done
+ */
+ if ((c->b == 0xa6) || (c->b == 0xa7) ||
+ (c->b == 0xae) || (c->b == 0xaf)) {
+ if ((c->rep_prefix == REPE_PREFIX) &&
+ ((ctxt->eflags & EFLG_ZF) == 0)) {
+ kvm_rip_write(ctxt->vcpu, c->eip);
+ goto done;
+ }
+ if ((c->rep_prefix == REPNE_PREFIX) &&
+ ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
+ kvm_rip_write(ctxt->vcpu, c->eip);
+ goto done;
+ }
+ }
+ c->regs[VCPU_REGS_RCX]--;
+ c->eip = kvm_rip_read(ctxt->vcpu);
+ }
+
+ if (c->src.type == OP_MEM) {
+ c->src.ptr = (unsigned long *)memop;
+ c->src.val = 0;
+ rc = ops->read_emulated((unsigned long)c->src.ptr,
+ &c->src.val,
+ c->src.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ c->src.orig_val = c->src.val;
+ }
+
+ if ((c->d & DstMask) == ImplicitOps)
+ goto special_insn;
+
+ if (c->dst.type == OP_MEM) {
+ c->dst.ptr = (unsigned long *)memop;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.val = 0;
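+ /*
+ * For bit operations (BT/BTS/BTR/BTC) the source operand selects
+ * a bit that may lie beyond the addressed word; displace the
+ * destination pointer by the byte offset of that bit.
+ */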
+ if (c->d & BitOp) {
+ unsigned long mask = ~(c->dst.bytes * 8 - 1);
+
+ c->dst.ptr = (void *)c->dst.ptr +
+ (c->src.val & mask) / 8;
+ }
+ if (!(c->d & Mov)) {
+ /* optimisation - avoid slow emulated read */
+ rc = ops->read_emulated((unsigned long)c->dst.ptr,
+ &c->dst.val,
+ c->dst.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+ }
+ c->dst.orig_val = c->dst.val;
+
+special_insn:
+
+ if (c->twobyte)
+ goto twobyte_insn;
+
+ switch (c->b) {
+ case 0x00 ... 0x05:
+ add: /* add */
+ emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x06: /* push es */
+ emulate_push_sreg(ctxt, VCPU_SREG_ES);
+ break;
+ case 0x07: /* pop es */
+ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
+ if (rc != 0)
+ goto done;
+ break;
+ case 0x08 ... 0x0d:
+ or: /* or */
+ emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x0e: /* push cs */
+ emulate_push_sreg(ctxt, VCPU_SREG_CS);
+ break;
+ case 0x10 ... 0x15:
+ adc: /* adc */
+ emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x16: /* push ss */
+ emulate_push_sreg(ctxt, VCPU_SREG_SS);
+ break;
+ case 0x17: /* pop ss */
+ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
+ if (rc != 0)
+ goto done;
+ break;
+ case 0x18 ... 0x1d:
+ sbb: /* sbb */
+ emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x1e: /* push ds */
+ emulate_push_sreg(ctxt, VCPU_SREG_DS);
+ break;
+ case 0x1f: /* pop ds */
+ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
+ if (rc != 0)
+ goto done;
+ break;
+ case 0x20 ... 0x25:
+ and: /* and */
+ emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x28 ... 0x2d:
+ sub: /* sub */
+ emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x30 ... 0x35:
+ xor: /* xor */
+ emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x38 ... 0x3d:
+ cmp: /* cmp */
+ emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x40 ... 0x47: /* inc r16/r32 */
+ emulate_1op("inc", c->dst, ctxt->eflags);
+ break;
+ case 0x48 ... 0x4f: /* dec r16/r32 */
+ emulate_1op("dec", c->dst, ctxt->eflags);
+ break;
+ case 0x50 ... 0x57: /* push reg */
+ emulate_push(ctxt);
+ break;
+ case 0x58 ... 0x5f: /* pop reg */
+ pop_instruction:
+ rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
+ if (rc != 0)
+ goto done;
+ break;
+ case 0x60: /* pusha */
+ emulate_pusha(ctxt);
+ break;
+ case 0x61: /* popa */
+ rc = emulate_popa(ctxt, ops);
+ if (rc != 0)
+ goto done;
+ break;
+ case 0x63: /* movsxd */
+ if (ctxt->mode != X86EMUL_MODE_PROT64)
+ goto cannot_emulate;
+ c->dst.val = (int32_t) c->src.val;
+ break;
+ case 0x68: /* push imm */
+ case 0x6a: /* push imm8 */
+ emulate_push(ctxt);
+ break;
+ case 0x6c: /* insb */
+ case 0x6d: /* insw/insd */
+ if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
+ (c->d & ByteOp) ? 1 : c->op_bytes)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ goto done;
+ }
+#ifdef XXX
+ if (kvm_emulate_pio_string(ctxt->vcpu,
+ 1,
+ (c->d & ByteOp) ? 1 : c->op_bytes,
+ c->rep_prefix ?
+ address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
+ (ctxt->eflags & EFLG_DF),
+ register_address(c, es_base(ctxt),
+ c->regs[VCPU_REGS_RDI]),
+ c->rep_prefix,
+ c->regs[VCPU_REGS_RDX]) == 0) {
+ c->eip = saved_eip;
+ return -1;
+ }
+#endif /*XXX*/
+ return 0;
+ case 0x6e: /* outsb */
+ case 0x6f: /* outsw/outsd */
+ if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
+ (c->d & ByteOp) ? 1 : c->op_bytes)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ goto done;
+ }
+#ifdef XXX
+ if (kvm_emulate_pio_string(ctxt->vcpu,
+ 0,
+ (c->d & ByteOp) ? 1 : c->op_bytes,
+ c->rep_prefix ?
+ address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
+ (ctxt->eflags & EFLG_DF),
+ register_address(c,
+ seg_override_base(ctxt, c),
+ c->regs[VCPU_REGS_RSI]),
+ c->rep_prefix,
+ c->regs[VCPU_REGS_RDX]) == 0) {
+ c->eip = saved_eip;
+ return -1;
+ }
+#endif /*XXX*/
+ return 0;
+ case 0x70 ... 0x7f: /* jcc (short) */
+ if (test_cc(c->b, ctxt->eflags))
+ jmp_rel(c, c->src.val);
+ break;
+ case 0x80 ... 0x83: /* Grp1 */
+ switch (c->modrm_reg) {
+ case 0:
+ goto add;
+ case 1:
+ goto or;
+ case 2:
+ goto adc;
+ case 3:
+ goto sbb;
+ case 4:
+ goto and;
+ case 5:
+ goto sub;
+ case 6:
+ goto xor;
+ case 7:
+ goto cmp;
+ }
+ break;
+ case 0x84 ... 0x85:
+ emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x86 ... 0x87: /* xchg */
+ xchg:
+ /* Write back the register source. */
+ switch (c->dst.bytes) {
+ case 1:
+ *(uint8_t *) c->src.ptr = (uint8_t) c->dst.val;
+ break;
+ case 2:
+ *(uint16_t *) c->src.ptr = (uint16_t) c->dst.val;
+ break;
+ case 4:
+ *c->src.ptr = (uint32_t) c->dst.val;
+ break; /* 64b reg: zero-extend */
+ case 8:
+ *c->src.ptr = c->dst.val;
+ break;
+ }
+ /*
+ * Write back the memory destination with implicit LOCK
+ * prefix.
+ */
+ c->dst.val = c->src.val;
+ c->lock_prefix = 1;
+ break;
+ case 0x88 ... 0x8b: /* mov */
+ goto mov;
+ case 0x8c: { /* mov r/m, sreg */
+ struct kvm_segment segreg;
+
+ if (c->modrm_reg <= 5)
+ kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
+ else {
+ cmn_err(CE_NOTE,"0x8c: Invalid segreg in modrm byte 0x%02x\n",
+ c->modrm);
+ goto cannot_emulate;
+ }
+ c->dst.val = segreg.selector;
+ break;
+ }
+ case 0x8d: /* lea r16/r32, m */
+ c->dst.val = c->modrm_ea;
+ break;
+ case 0x8e: { /* mov seg, r/m16 */
+ uint16_t sel;
+
+ sel = c->src.val;
+
+ if (c->modrm_reg == VCPU_SREG_CS ||
+ c->modrm_reg > VCPU_SREG_GS) {
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ goto done;
+ }
+
+ if (c->modrm_reg == VCPU_SREG_SS)
+ toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
+
+ rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg);
+
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ break;
+ }
+ case 0x8f: /* pop (sole member of Grp1a) */
+ rc = emulate_grp1a(ctxt, ops);
+ if (rc != 0)
+ goto done;
+ break;
+ case 0x90: /* nop / xchg r8,rax */
+ if (!(c->rex_prefix & 1)) { /* nop */
+ c->dst.type = OP_NONE;
+ break;
+ }
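+ /* FALLTHROUGH: with a REX.B prefix, 0x90 is xchg r8,rax */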
+ case 0x91 ... 0x97: /* xchg reg,rax */
+ c->src.type = c->dst.type = OP_REG;
+ c->src.bytes = c->dst.bytes = c->op_bytes;
+ c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
+ c->src.val = *(c->src.ptr);
+ goto xchg;
+ case 0x9c: /* pushf */
+ c->src.val = (unsigned long) ctxt->eflags;
+ emulate_push(ctxt);
+ break;
+ case 0x9d: /* popf */
+ c->dst.type = OP_REG;
+ c->dst.ptr = (unsigned long *) &ctxt->eflags;
+ c->dst.bytes = c->op_bytes;
+ rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ break;
+ case 0xa0 ... 0xa1: /* mov */
+ c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+ c->dst.val = c->src.val;
+ break;
+ case 0xa2 ... 0xa3: /* mov */
+ c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
+ break;
+ case 0xa4 ... 0xa5: /* movs */
+ c->dst.type = OP_MEM;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.ptr = (unsigned long *)register_address(c,
+ es_base(ctxt),
+ c->regs[VCPU_REGS_RDI]);
+ rc = ops->read_emulated(register_address(c,
+ seg_override_base(ctxt, c),
+ c->regs[VCPU_REGS_RSI]),
+ &c->dst.val,
+ c->dst.bytes, ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ register_address_increment(c, &c->regs[VCPU_REGS_RSI],
+ (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+ : c->dst.bytes);
+ register_address_increment(c, &c->regs[VCPU_REGS_RDI],
+ (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+ : c->dst.bytes);
+ break;
+ case 0xa6 ... 0xa7: /* cmps */
+ c->src.type = OP_NONE; /* Disable writeback. */
+ c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->src.ptr = (unsigned long *)register_address(c,
+ seg_override_base(ctxt, c),
+ c->regs[VCPU_REGS_RSI]);
+ rc = ops->read_emulated((unsigned long)c->src.ptr,
+ &c->src.val,
+ c->src.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.ptr = (unsigned long *)register_address(c,
+ es_base(ctxt),
+ c->regs[VCPU_REGS_RDI]);
+ rc = ops->read_emulated((unsigned long)c->dst.ptr,
+ &c->dst.val,
+ c->dst.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
+
+ emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+
+ register_address_increment(c, &c->regs[VCPU_REGS_RSI],
+ (ctxt->eflags & EFLG_DF) ? -c->src.bytes
+ : c->src.bytes);
+ register_address_increment(c, &c->regs[VCPU_REGS_RDI],
+ (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+ : c->dst.bytes);
+
+ break;
+ case 0xaa ... 0xab: /* stos */
+ c->dst.type = OP_MEM;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.ptr = (unsigned long *)register_address(c,
+ es_base(ctxt),
+ c->regs[VCPU_REGS_RDI]);
+ c->dst.val = c->regs[VCPU_REGS_RAX];
+ register_address_increment(c, &c->regs[VCPU_REGS_RDI],
+ (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+ : c->dst.bytes);
+ break;
+ case 0xac ... 0xad: /* lods */
+ c->dst.type = OP_REG;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+ rc = ops->read_emulated(register_address(c,
+ seg_override_base(ctxt, c),
+ c->regs[VCPU_REGS_RSI]),
+ &c->dst.val,
+ c->dst.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ register_address_increment(c, &c->regs[VCPU_REGS_RSI],
+ (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+ : c->dst.bytes);
+ break;
+ case 0xae ... 0xaf: /* scas */
+ DPRINTF("Urk! I don't handle SCAS.\n");
+ goto cannot_emulate;
+ case 0xb0 ... 0xbf: /* mov r, imm */
+ goto mov;
+ case 0xc0 ... 0xc1:
+ emulate_grp2(ctxt);
+ break;
+ case 0xc3: /* ret */
+ c->dst.type = OP_REG;
+ c->dst.ptr = &c->eip;
+ c->dst.bytes = c->op_bytes;
+ goto pop_instruction;
+ case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
+ mov:
+ c->dst.val = c->src.val;
+ break;
+ case 0xcb: /* ret far */
+ rc = emulate_ret_far(ctxt, ops);
+ if (rc)
+ goto done;
+ break;
+ case 0xd0 ... 0xd1: /* Grp2 */
+ c->src.val = 1;
+ emulate_grp2(ctxt);
+ break;
+ case 0xd2 ... 0xd3: /* Grp2 */
+ c->src.val = c->regs[VCPU_REGS_RCX];
+ emulate_grp2(ctxt);
+ break;
+ case 0xe4: /* inb */
+ case 0xe5: /* in */
+ port = c->src.val;
+ io_dir_in = 1;
+ goto do_io;
+ case 0xe6: /* outb */
+ case 0xe7: /* out */
+ port = c->src.val;
+ io_dir_in = 0;
+ goto do_io;
+ case 0xe8: /* call (near) */ {
+ long int rel = c->src.val;
+ c->src.val = (unsigned long) c->eip;
+ jmp_rel(c, rel);
+ emulate_push(ctxt);
+ break;
+ }
+ case 0xe9: /* jmp rel */
+ goto jmp;
+ case 0xea: /* jmp far */
+ if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val,
+ VCPU_SREG_CS))
+ goto done;
+
+ c->eip = c->src.val;
+ break;
+ case 0xeb:
+ jmp: /* jmp rel short */
+ jmp_rel(c, c->src.val);
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ break;
+ case 0xec: /* in al,dx */
+ case 0xed: /* in (e/r)ax,dx */
+ port = c->regs[VCPU_REGS_RDX];
+ io_dir_in = 1;
+ goto do_io;
+ case 0xee: /* out al,dx */
+ case 0xef: /* out (e/r)ax,dx */
+ port = c->regs[VCPU_REGS_RDX];
+ io_dir_in = 0;
+ do_io:
+ if (!emulator_io_permited(ctxt, ops, port,
+ (c->d & ByteOp) ? 1 : c->op_bytes)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ goto done;
+ }
+#ifdef XXX
+ if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,
+ (c->d & ByteOp) ? 1 : c->op_bytes,
+ port) != 0) {
+ c->eip = saved_eip;
+ goto cannot_emulate;
+ }
+#else
+ goto cannot_emulate;
+#endif /*XXX*/
+ break;
+ case 0xf4: /* hlt */
+ ctxt->vcpu->arch.halt_request = 1;
+ break;
+ case 0xf5: /* cmc */
+ /* complement carry flag from eflags reg */
+ ctxt->eflags ^= EFLG_CF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ break;
+ case 0xf6 ... 0xf7: /* Grp3 */
+ rc = emulate_grp3(ctxt, ops);
+ if (rc != 0)
+ goto done;
+ break;
+ case 0xf8: /* clc */
+ ctxt->eflags &= ~EFLG_CF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ break;
+ case 0xfa: /* cli */
+ if (emulator_bad_iopl(ctxt))
+ kvm_inject_gp(ctxt->vcpu, 0);
+ else {
+ ctxt->eflags &= ~X86_EFLAGS_IF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ }
+ break;
+ case 0xfb: /* sti */
+ if (emulator_bad_iopl(ctxt))
+ kvm_inject_gp(ctxt->vcpu, 0);
+ else {
+ toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
+ ctxt->eflags |= X86_EFLAGS_IF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ }
+ break;
+ case 0xfc: /* cld */
+ ctxt->eflags &= ~EFLG_DF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ break;
+ case 0xfd: /* std */
+ ctxt->eflags |= EFLG_DF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ break;
+ case 0xfe ... 0xff: /* Grp4/Grp5 */
+ rc = emulate_grp45(ctxt, ops);
+ if (rc != 0)
+ goto done;
+ break;
+ }
+
+writeback:
+ rc = writeback(ctxt, ops);
+ if (rc != 0)
+ goto done;
+
+ /* Commit shadow register state. */
+ memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
+ kvm_rip_write(ctxt->vcpu, c->eip);
+
+done:
+ if (rc == X86EMUL_UNHANDLEABLE) {
+ c->eip = saved_eip;
+ return -1;
+ }
+ return 0;
+
+twobyte_insn:
+ switch (c->b) {
+ case 0x01: /* lgdt, lidt, lmsw */
+ switch (c->modrm_reg) {
+ uint16_t size;
+ unsigned long address;
+
+ case 0: /* vmcall */
+ if (c->modrm_mod != 3 || c->modrm_rm != 1)
+ goto cannot_emulate;
+
+ rc = kvm_fix_hypercall(ctxt->vcpu);
+ if (rc)
+ goto done;
+
+ /* Let the processor re-execute the fixed hypercall */
+ c->eip = kvm_rip_read(ctxt->vcpu);
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
+ break;
+ case 2: /* lgdt */
+ rc = read_descriptor(ctxt, ops, c->src.ptr,
+ &size, &address, c->op_bytes);
+ if (rc)
+ goto done;
+ realmode_lgdt(ctxt->vcpu, size, address);
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
+ break;
+ case 3: /* lidt/vmmcall */
+ if (c->modrm_mod == 3) {
+ switch (c->modrm_rm) {
+ case 1:
+ rc = kvm_fix_hypercall(ctxt->vcpu);
+ if (rc)
+ goto done;
+ break;
+ default:
+ goto cannot_emulate;
+ }
+ } else {
+ rc = read_descriptor(ctxt, ops, c->src.ptr,
+ &size, &address,
+ c->op_bytes);
+ if (rc)
+ goto done;
+ realmode_lidt(ctxt->vcpu, size, address);
+ }
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
+ break;
+ case 4: /* smsw */
+ c->dst.bytes = 2;
+ c->dst.val = realmode_get_cr(ctxt->vcpu, 0);
+ break;
+ case 6: /* lmsw */
+ realmode_lmsw(ctxt->vcpu, (uint16_t)c->src.val,
+ &ctxt->eflags);
+ c->dst.type = OP_NONE;
+ break;
+ case 7: /* invlpg */
+ emulate_invlpg(ctxt->vcpu, memop);
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
+ break;
+ default:
+ goto cannot_emulate;
+ }
+ break;
+ case 0x05: /* syscall */
+ rc = emulate_syscall(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ else
+ goto writeback;
+ break;
+ case 0x06:
+#ifdef XXX
+ emulate_clts(ctxt->vcpu);
+#endif /*XXX*/
+ c->dst.type = OP_NONE;
+ break;
+ case 0x08: /* invd */
+ case 0x09: /* wbinvd */
+ case 0x0d: /* GrpP (prefetch) */
+ case 0x18: /* Grp16 (prefetch/nop) */
+ c->dst.type = OP_NONE;
+ break;
+ case 0x20: /* mov cr, reg */
+ if (c->modrm_mod != 3)
+ goto cannot_emulate;
+ c->regs[c->modrm_rm] =
+ realmode_get_cr(ctxt->vcpu, c->modrm_reg);
+ c->dst.type = OP_NONE; /* no writeback */
+ break;
+ case 0x21: /* mov from dr to reg */
+ if (c->modrm_mod != 3)
+ goto cannot_emulate;
+#ifdef XXX
+ rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
+ if (rc)
+#endif /*XXX*/
+ goto cannot_emulate;
+ c->dst.type = OP_NONE; /* no writeback */
+ break;
+ case 0x22: /* mov reg, cr */
+ if (c->modrm_mod != 3)
+ goto cannot_emulate;
+ realmode_set_cr(ctxt->vcpu,
+ c->modrm_reg, c->modrm_val, &ctxt->eflags);
+ c->dst.type = OP_NONE;
+ break;
+ case 0x23: /* mov from reg to dr */
+ if (c->modrm_mod != 3)
+ goto cannot_emulate;
+#ifdef XXX
+ rc = emulator_set_dr(ctxt, c->modrm_reg,
+ c->regs[c->modrm_rm]);
+ if (rc)
+#endif /*XXX*/
+ goto cannot_emulate;
+ c->dst.type = OP_NONE; /* no writeback */
+ break;
+ case 0x30:
+ /* wrmsr */
+ msr_data = (uint32_t)c->regs[VCPU_REGS_RAX]
+ | ((uint64_t)c->regs[VCPU_REGS_RDX] << 32);
+ rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
+ if (rc) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ c->eip = kvm_rip_read(ctxt->vcpu);
+ }
+ rc = X86EMUL_CONTINUE;
+ c->dst.type = OP_NONE;
+ break;
+ case 0x32:
+ /* rdmsr */
+ rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
+ if (rc) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ c->eip = kvm_rip_read(ctxt->vcpu);
+ } else {
+ c->regs[VCPU_REGS_RAX] = (uint32_t)msr_data;
+ c->regs[VCPU_REGS_RDX] = msr_data >> 32;
+ }
+ rc = X86EMUL_CONTINUE;
+ c->dst.type = OP_NONE;
+ break;
+ case 0x34: /* sysenter */
+ rc = emulate_sysenter(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ else
+ goto writeback;
+ break;
+ case 0x35: /* sysexit */
+ rc = emulate_sysexit(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ else
+ goto writeback;
+ break;
+ case 0x40 ... 0x4f: /* cmov */
+ c->dst.val = c->dst.orig_val = c->src.val;
+ if (!test_cc(c->b, ctxt->eflags))
+ c->dst.type = OP_NONE; /* no writeback */
+ break;
+ case 0x80 ... 0x8f: /* jnz rel, etc*/
+ if (test_cc(c->b, ctxt->eflags))
+ jmp_rel(c, c->src.val);
+ c->dst.type = OP_NONE;
+ break;
+ case 0xa0: /* push fs */
+ emulate_push_sreg(ctxt, VCPU_SREG_FS);
+ break;
+ case 0xa1: /* pop fs */
+ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
+ if (rc != 0)
+ goto done;
+ break;
+ case 0xa3:
+ bt: /* bt */
+ c->dst.type = OP_NONE;
+ /* only subword offset */
+ c->src.val &= (c->dst.bytes << 3) - 1;
+ emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0xa4: /* shld imm8, r, r/m */
+ case 0xa5: /* shld cl, r, r/m */
+ emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
+ break;
+ case 0xa8: /* push gs */
+ emulate_push_sreg(ctxt, VCPU_SREG_GS);
+ break;
+ case 0xa9: /* pop gs */
+ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
+ if (rc != 0)
+ goto done;
+ break;
+ case 0xab:
+ bts: /* bts */
+ /* only subword offset */
+ c->src.val &= (c->dst.bytes << 3) - 1;
+ emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0xac: /* shrd imm8, r, r/m */
+ case 0xad: /* shrd cl, r, r/m */
+ emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags);
+ break;
+ case 0xae: /* clflush */
+ break;
+ case 0xb0 ... 0xb1: /* cmpxchg */
+ /*
+ * Save real source value, then compare EAX against
+ * destination.
+ */
+ c->src.orig_val = c->src.val;
+ c->src.val = c->regs[VCPU_REGS_RAX];
+ emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+ if (ctxt->eflags & EFLG_ZF) {
+ /* Success: write back to memory. */
+ c->dst.val = c->src.orig_val;
+ } else {
+ /* Failure: write the value we saw to EAX. */
+ c->dst.type = OP_REG;
+ c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+ }
+ break;
+ case 0xb3:
+ btr: /* btr */
+ /* only subword offset */
+ c->src.val &= (c->dst.bytes << 3) - 1;
+ emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0xb6 ... 0xb7: /* movzx */
+ c->dst.bytes = c->op_bytes;
+ c->dst.val = (c->d & ByteOp) ? (uint8_t) c->src.val
+ : (uint16_t) c->src.val;
+ break;
+ case 0xba: /* Grp8 */
+ switch (c->modrm_reg & 3) {
+ case 0:
+ goto bt;
+ case 1:
+ goto bts;
+ case 2:
+ goto btr;
+ case 3:
+ goto btc;
+ }
+ break;
+ case 0xbb:
+ btc: /* btc */
+ /* only subword offset */
+ c->src.val &= (c->dst.bytes << 3) - 1;
+ emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0xbe ... 0xbf: /* movsx */
+ c->dst.bytes = c->op_bytes;
+ c->dst.val = (c->d & ByteOp) ? (int8_t) c->src.val :
+ (int16_t) c->src.val;
+ break;
+ case 0xc3: /* movnti */
+ c->dst.bytes = c->op_bytes;
+ c->dst.val = (c->op_bytes == 4) ? (uint32_t) c->src.val :
+ (uint64_t) c->src.val;
+ break;
+ case 0xc7: /* Grp9 (cmpxchg8b) */
+ rc = emulate_grp9(ctxt, ops, memop);
+ if (rc != 0)
+ goto done;
+ c->dst.type = OP_NONE;
+ break;
+ }
+ goto writeback;
+
+cannot_emulate:
+ DPRINTF("Cannot emulate %02x\n", c->b);
+ c->eip = saved_eip;
+ return -1;
+}
diff --git a/hyperv.h b/hyperv.h
new file mode 100644
index 0000000..3053b0d
--- /dev/null
+++ b/hyperv.h
@@ -0,0 +1,184 @@
+#ifndef _ASM_X86_KVM_HYPERV_H
+#define _ASM_X86_KVM_HYPERV_H
+
+/*
+ * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
+ * is set by CPUID(HvCpuIdFunctionVersionAndFeatures).
+ */
+#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000
+#define HYPERV_CPUID_INTERFACE 0x40000001
+#define HYPERV_CPUID_VERSION 0x40000002
+#define HYPERV_CPUID_FEATURES 0x40000003
+#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
+#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
+
+/*
+ * Feature identification. EAX indicates which features are available
+ * to the partition based upon the current partition privileges.
+ */
+
+/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */
+#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0)
+/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available */
+#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1)
+/*
+ * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
+ * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available
+ */
+#define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2)
+/*
+ * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through
+ * HV_X64_MSR_STIMER3_COUNT) available
+ */
+#define HV_X64_MSR_SYNTIMER_AVAILABLE (1 << 3)
+/*
+ * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR)
+ * are available
+ */
+#define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4)
+/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available */
+#define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5)
+/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available */
+#define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6)
+/* Virtual system reset MSR (HV_X64_MSR_RESET) is available */
+#define HV_X64_MSR_RESET_AVAILABLE (1 << 7)
+/*
+ * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE,
+ * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE,
+ * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available
+ */
+#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8)
+
+/*
+ * Feature identification: EBX indicates which flags were specified at
+ * partition creation. The format is the same as the partition creation
+ * flag structure defined in section Partition Creation Flags.
+ */
+#define HV_X64_CREATE_PARTITIONS (1 << 0)
+#define HV_X64_ACCESS_PARTITION_ID (1 << 1)
+#define HV_X64_ACCESS_MEMORY_POOL (1 << 2)
+#define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3)
+#define HV_X64_POST_MESSAGES (1 << 4)
+#define HV_X64_SIGNAL_EVENTS (1 << 5)
+#define HV_X64_CREATE_PORT (1 << 6)
+#define HV_X64_CONNECT_PORT (1 << 7)
+#define HV_X64_ACCESS_STATS (1 << 8)
+#define HV_X64_DEBUGGING (1 << 11)
+#define HV_X64_CPU_POWER_MANAGEMENT (1 << 12)
+#define HV_X64_CONFIGURE_PROFILER (1 << 13)
+
+/*
+ * Feature identification. EDX indicates which miscellaneous features
+ * are available to the partition.
+ */
+/* The MWAIT instruction is available (per section MONITOR / MWAIT) */
+#define HV_X64_MWAIT_AVAILABLE (1 << 0)
+/* Guest debugging support is available */
+#define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1)
+/* Performance Monitor support is available */
+#define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2)
+/* Support for physical CPU dynamic partitioning events is available */
+#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3)
+/*
+ * Support for passing hypercall input parameter block via XMM
+ * registers is available
+ */
+#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4)
+/* Support for a virtual guest idle state is available */
+#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5)
+
+/*
+ * Implementation recommendations. Indicates which behaviors the hypervisor
+ * recommends the OS implement for optimal performance.
+ */
+/*
+ * Recommend using hypercall for address space switches rather
+ * than MOV to CR3 instruction
+ */
+#define HV_X64_MWAIT_RECOMMENDED (1 << 0)
+/*
+ * Recommend using hypercall for local TLB flushes rather
+ * than INVLPG or MOV to CR3 instructions
+ */
+#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1)
+/*
+ * Recommend using hypercall for remote TLB flushes rather
+ * than inter-processor interrupts
+ */
+#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2)
+/*
+ * Recommend using MSRs for accessing APIC registers
+ * EOI, ICR and TPR rather than their memory-mapped counterparts
+ */
+#define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3)
+/* Recommend using the hypervisor-provided MSR to initiate a system RESET */
+#define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4)
+/*
+ * Recommend using relaxed timing for this partition. If used,
+ * the VM should disable any watchdog timeouts that rely on the
+ * timely delivery of external interrupts
+ */
+#define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5)
+
+/* MSR used to identify the guest OS. */
+#define HV_X64_MSR_GUEST_OS_ID 0x40000000
+
+/* MSR used to set up pages used to communicate with the hypervisor. */
+#define HV_X64_MSR_HYPERCALL 0x40000001
+
+/* MSR used to provide vcpu index */
+#define HV_X64_MSR_VP_INDEX 0x40000002
+
+/* Define the virtual APIC registers */
+#define HV_X64_MSR_EOI 0x40000070
+#define HV_X64_MSR_ICR 0x40000071
+#define HV_X64_MSR_TPR 0x40000072
+#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073
+
+/* Define synthetic interrupt controller model specific registers. */
+#define HV_X64_MSR_SCONTROL 0x40000080
+#define HV_X64_MSR_SVERSION 0x40000081
+#define HV_X64_MSR_SIEFP 0x40000082
+#define HV_X64_MSR_SIMP 0x40000083
+#define HV_X64_MSR_EOM 0x40000084
+#define HV_X64_MSR_SINT0 0x40000090
+#define HV_X64_MSR_SINT1 0x40000091
+#define HV_X64_MSR_SINT2 0x40000092
+#define HV_X64_MSR_SINT3 0x40000093
+#define HV_X64_MSR_SINT4 0x40000094
+#define HV_X64_MSR_SINT5 0x40000095
+#define HV_X64_MSR_SINT6 0x40000096
+#define HV_X64_MSR_SINT7 0x40000097
+#define HV_X64_MSR_SINT8 0x40000098
+#define HV_X64_MSR_SINT9 0x40000099
+#define HV_X64_MSR_SINT10 0x4000009A
+#define HV_X64_MSR_SINT11 0x4000009B
+#define HV_X64_MSR_SINT12 0x4000009C
+#define HV_X64_MSR_SINT13 0x4000009D
+#define HV_X64_MSR_SINT14 0x4000009E
+#define HV_X64_MSR_SINT15 0x4000009F
+
+#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001
+#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12
+#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \
+ (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
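+
+/*
+ * Worked example (address chosen for illustration): a guest enabling the
+ * hypercall page at GPA 0x12345000 writes
+ * (0x12345000 & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK) |
+ * HV_X64_MSR_HYPERCALL_ENABLE, i.e. 0x12345001, to HV_X64_MSR_HYPERCALL.
+ */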
+
+/* Declare the various hypercall operations. */
+#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT 0x0008
+
+#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001
+#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12
+#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \
+ (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
+
+#define HV_PROCESSOR_POWER_STATE_C0 0
+#define HV_PROCESSOR_POWER_STATE_C1 1
+#define HV_PROCESSOR_POWER_STATE_C2 2
+#define HV_PROCESSOR_POWER_STATE_C3 3
+
+/* hypercall status code */
+#define HV_STATUS_SUCCESS 0
+#define HV_STATUS_INVALID_HYPERCALL_CODE 2
+#define HV_STATUS_INVALID_HYPERCALL_INPUT 3
+#define HV_STATUS_INVALID_ALIGNMENT 4
+
+#endif
diff --git a/iodev.h b/iodev.h
new file mode 100644
index 0000000..0811c47
--- /dev/null
+++ b/iodev.h
@@ -0,0 +1,69 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __KVM_IODEV_H__
+#define __KVM_IODEV_H__
+
+#include "kvm_types.h"
+
+struct kvm_io_device;
+
+/**
+ * kvm_io_device_ops are called under kvm slots_lock.
+ * read and write handlers return 0 if the transaction has been handled,
+ * or non-zero to have it passed to the next device.
+ **/
+struct kvm_io_device_ops {
+ int (*read)(struct kvm_io_device *this,
+ gpa_t addr,
+ int len,
+ void *val);
+ int (*write)(struct kvm_io_device *this,
+ gpa_t addr,
+ int len,
+ const void *val);
+ void (*destructor)(struct kvm_io_device *this);
+};
+
+
+struct kvm_io_device {
+ const struct kvm_io_device_ops *ops;
+};
+
+static inline void kvm_iodevice_init(struct kvm_io_device *dev,
+ const struct kvm_io_device_ops *ops)
+{
+ dev->ops = ops;
+}
+
+static inline int kvm_iodevice_read(struct kvm_io_device *dev,
+ gpa_t addr, int l, void *v)
+{
+ return dev->ops->read ? dev->ops->read(dev, addr, l, v) : -EOPNOTSUPP;
+}
+
+static inline int kvm_iodevice_write(struct kvm_io_device *dev,
+ gpa_t addr, int l, const void *v)
+{
+ return dev->ops->write ? dev->ops->write(dev, addr, l, v) : -EOPNOTSUPP;
+}
+
+static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
+{
+ if (dev->ops->destructor)
+ dev->ops->destructor(dev);
+}
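+
+/*
+ * Minimal usage sketch (illustrative only; myreg_read, myreg_ops, mydev
+ * and MYREG_GPA are invented names). A device that services 4-byte reads
+ * of one register and returns nonzero for everything else so the bus
+ * passes the transaction to the next device:
+ *
+ *	static int myreg_read(struct kvm_io_device *this, gpa_t addr,
+ *	    int len, void *val)
+ *	{
+ *		if (addr != MYREG_GPA || len != 4)
+ *			return (1);
+ *		*(uint32_t *)val = 0;
+ *		return (0);
+ *	}
+ *
+ *	static const struct kvm_io_device_ops myreg_ops = {
+ *		.read = myreg_read,
+ *	};
+ *
+ *	struct kvm_io_device mydev;
+ *	kvm_iodevice_init(&mydev, &myreg_ops);
+ */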
+
+#endif /* __KVM_IODEV_H__ */
diff --git a/irq.h b/irq.h
new file mode 100644
index 0000000..3682235
--- /dev/null
+++ b/irq.h
@@ -0,0 +1,103 @@
+/*
+ * irq.h: in kernel interrupt controller related definitions
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ */
+
+#ifndef __IRQ_H
+#define __IRQ_H
+
+#ifdef XXX
+#include <linux/mm_types.h>
+#include <linux/hrtimer.h>
+#include <linux/kvm_host.h>
+#include <linux/spinlock.h>
+
+#include "iodev.h"
+#include "ioapic.h"
+#include "lapic.h"
+#endif /*XXX*/
+
+#define PIC_NUM_PINS 16
+#define SELECT_PIC(irq) \
+ ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE)
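+/*
+ * e.g. SELECT_PIC(3) yields KVM_IRQCHIP_PIC_MASTER and SELECT_PIC(12)
+ * yields KVM_IRQCHIP_PIC_SLAVE.
+ */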
+
+struct kvm;
+struct kvm_vcpu;
+
+typedef void irq_request_func(void *opaque, int level);
+
+struct kvm_kpic_state {
+ uint8_t last_irr; /* edge detection */
+ uint8_t irr; /* interrupt request register */
+ uint8_t imr; /* interrupt mask register */
+ uint8_t isr; /* interrupt service register */
+ uint8_t isr_ack; /* interrupt ack detection */
+ uint8_t priority_add; /* highest irq priority */
+ uint8_t irq_base;
+ uint8_t read_reg_select;
+ uint8_t poll;
+ uint8_t special_mask;
+ uint8_t init_state;
+ uint8_t auto_eoi;
+ uint8_t rotate_on_auto_eoi;
+ uint8_t special_fully_nested_mode;
+ uint8_t init4; /* true if 4 byte init */
+ uint8_t elcr; /* PIIX edge/trigger selection */
+ uint8_t elcr_mask;
+ struct kvm_pic *pics_state;
+};
+
+struct kvm_pic {
+ kmutex_t lock;
+ unsigned pending_acks;
+ struct kvm *kvm;
+ struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
+ irq_request_func *irq_request;
+ void *irq_request_opaque;
+ int output; /* intr from master PIC */
+ struct kvm_io_device dev;
+ void (*ack_notifier)(void *opaque, int irq);
+ unsigned long irq_states[16];
+};
+
+struct kvm_pic *kvm_create_pic(struct kvm *kvm);
+void kvm_destroy_pic(struct kvm *kvm);
+int kvm_pic_read_irq(struct kvm *kvm);
+void kvm_pic_update_irq(struct kvm_pic *s);
+void kvm_pic_clear_isr_ack(struct kvm *kvm);
+
+static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
+{
+ return kvm->arch.vpic;
+}
+
+
+void kvm_pic_reset(struct kvm_kpic_state *s);
+
+void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
+void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
+void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
+void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
+
+int pit_has_pending_timer(struct kvm_vcpu *vcpu);
+int apic_has_pending_timer(struct kvm_vcpu *vcpu);
+
+#endif
diff --git a/kvm_emulate.h b/kvm_emulate.h
new file mode 100644
index 0000000..be9bd75
--- /dev/null
+++ b/kvm_emulate.h
@@ -0,0 +1,198 @@
+/******************************************************************************
+ * x86_emulate.h
+ *
+ * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
+ *
+ * Copyright (c) 2005 Keir Fraser
+ *
+ * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
+ */
+
+#ifndef _ASM_X86_KVM_X86_EMULATE_H
+#define _ASM_X86_KVM_X86_EMULATE_H
+
+struct x86_emulate_ctxt;
+
+/*
+ * x86_emulate_ops:
+ *
+ * These operations represent the instruction emulator's interface to memory.
+ * There are two categories of operation: those that act on ordinary memory
+ * regions (*_std), and those that act on memory regions known to require
+ * special treatment or emulation (*_emulated).
+ *
+ * The emulator assumes that an instruction accesses only one 'emulated memory'
+ * location, that this location is the given linear faulting address (cr2), and
+ * that this is one of the instruction's data operands. Instruction fetches and
+ * stack operations are assumed never to access emulated memory. The emulator
+ * automatically deduces which operand of a string-move operation is accessing
+ * emulated memory, and assumes that the other operand accesses normal memory.
+ *
+ * NOTES:
+ * 1. The emulator isn't very smart about emulated vs. standard memory.
+ * 'Emulated memory' access addresses should be checked for sanity.
+ * 'Normal memory' accesses may fault, and the caller must arrange to
+ * detect and handle reentrancy into the emulator via recursive faults.
+ * Accesses may be unaligned and may cross page boundaries.
+ * 2. If the access fails (cannot emulate, or a standard access faults) then
+ * it is up to the memop to propagate the fault to the guest VM via
+ * some out-of-band mechanism, unknown to the emulator. The memop signals
+ * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will
+ * then immediately bail.
+ * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only
+ * cmpxchg8b_emulated need support 8-byte accesses.
+ * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
+ */
+/* Access completed successfully: continue emulation as normal. */
+#define X86EMUL_CONTINUE 0
+/* Access is unhandleable: bail from emulation and return error to caller. */
+#define X86EMUL_UNHANDLEABLE 1
+/* Terminate emulation but return success to the caller. */
+#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
+#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */
+#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */
+struct x86_emulate_ops {
+ /*
+ * read_std: Read bytes of standard (non-emulated/special) memory.
+ * Used for descriptor reading.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ */
+ int (*read_std)(unsigned long addr, void *val,
+ unsigned int bytes, struct kvm_vcpu *vcpu, uint32_t *error);
+
+ /*
+ * fetch: Read bytes of standard (non-emulated/special) memory.
+ * Used for instruction fetch.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ */
+ int (*fetch)(unsigned long addr, void *val,
+ unsigned int bytes, struct kvm_vcpu *vcpu, uint32_t *error);
+
+ /*
+ * read_emulated: Read bytes from emulated/special memory area.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ */
+ int (*read_emulated)(unsigned long addr,
+ void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu);
+
+ /*
+ * write_emulated: Write bytes to emulated/special memory area.
+ * @addr: [IN ] Linear address to which to write.
+ * @val: [IN ] Value to write to memory (low-order bytes used as
+ * required).
+ * @bytes: [IN ] Number of bytes to write to memory.
+ */
+ int (*write_emulated)(unsigned long addr,
+ const void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu);
+
+ /*
+ * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
+ * emulated/special memory area.
+ * @addr: [IN ] Linear address to access.
+ * @old: [IN ] Value expected to be current at @addr.
+ * @new: [IN ] Value to write to @addr.
+ * @bytes: [IN ] Number of bytes to access using CMPXCHG.
+ */
+ int (*cmpxchg_emulated)(unsigned long addr,
+ const void *old,
+ const void *new,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu);
+
+};
+
+/* Type, address-of, and value of an instruction's operand. */
+struct operand {
+ enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
+ unsigned int bytes;
+ unsigned long val, orig_val, *ptr;
+};
+
+struct fetch_cache {
+ uint8_t data[15];
+ unsigned long start;
+ unsigned long end;
+};
+
+struct decode_cache {
+ uint8_t twobyte;
+ uint8_t b;
+ uint8_t lock_prefix;
+ uint8_t rep_prefix;
+ uint8_t op_bytes;
+ uint8_t ad_bytes;
+ uint8_t rex_prefix;
+ struct operand src;
+ struct operand src2;
+ struct operand dst;
+ int has_seg_override;
+ uint8_t seg_override;
+ unsigned int d;
+ unsigned long regs[NR_VCPU_REGS];
+ unsigned long eip, eip_orig;
+ /* modrm */
+ uint8_t modrm;
+ uint8_t modrm_mod;
+ uint8_t modrm_reg;
+ uint8_t modrm_rm;
+ uint8_t use_modrm_ea;
+ int rip_relative;
+ unsigned long modrm_ea;
+ void *modrm_ptr;
+ unsigned long modrm_val;
+ struct fetch_cache fetch;
+};
+
+#define X86_SHADOW_INT_MOV_SS 1
+#define X86_SHADOW_INT_STI 2
+
+struct x86_emulate_ctxt {
+ /* Register state before/after emulation. */
+ struct kvm_vcpu *vcpu;
+
+ unsigned long eflags;
+ /* Emulated execution mode, represented by an X86EMUL_MODE value. */
+ int mode;
+ uint32_t cs_base;
+
+ /* interruptibility state, as a result of execution of STI or MOV SS */
+ int interruptibility;
+
+ /* decode cache */
+ struct decode_cache decode;
+};
+
+/* Repeat String Operation Prefix */
+#define REPE_PREFIX 1
+#define REPNE_PREFIX 2
+
+/* Execution mode, passed to the emulator. */
+#define X86EMUL_MODE_REAL 0 /* Real mode. */
+#define X86EMUL_MODE_VM86 1 /* Virtual 8086 mode. */
+#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
+#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
+#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
+
+/* Host execution mode. */
+#if defined(CONFIG_X86_32)
+#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
+#elif defined(CONFIG_X86_64)
+#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
+#endif
+
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops);
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops);
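+
+/*
+ * Typical call sequence (a sketch; my_ops and the my_* callbacks are
+ * hypothetical): fill in an ops table, decode, then emulate, treating a
+ * nonzero return from either phase as failure.
+ *
+ *	struct x86_emulate_ops my_ops = {
+ *		.read_std	  = my_read_std,
+ *		.fetch		  = my_fetch,
+ *		.read_emulated	  = my_read_emulated,
+ *		.write_emulated	  = my_write_emulated,
+ *		.cmpxchg_emulated = my_cmpxchg_emulated,
+ *	};
+ *
+ *	if (x86_decode_insn(ctxt, &my_ops) == 0)
+ *		rc = x86_emulate_insn(ctxt, &my_ops);
+ */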
+
+#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/kvm_para.h b/kvm_para.h
new file mode 100644
index 0000000..bc82b95
--- /dev/null
+++ b/kvm_para.h
@@ -0,0 +1,40 @@
+#ifndef __LINUX_KVM_PARA_H
+#define __LINUX_KVM_PARA_H
+
+/*
+ * This header file provides a method for making a hypercall to the host.
+ * Architectures should define:
+ * - kvm_hypercall0, kvm_hypercall1...
+ * - kvm_arch_para_features
+ * - kvm_para_available
+ */
+
+/* Return values for hypercalls */
+#define KVM_ENOSYS 1000
+#define KVM_EFAULT EFAULT
+#define KVM_E2BIG E2BIG
+#define KVM_EPERM EPERM
+
+#define KVM_HC_VAPIC_POLL_IRQ 1
+#define KVM_HC_MMU_OP 2
+
+/*
+ * Hypercalls use architecture-specific mechanisms.
+ */
+
+#ifdef _KERNEL
+#ifdef CONFIG_KVM_GUEST
+void __init kvm_guest_init(void);
+#else
+#define kvm_guest_init() do { } while (0)
+#endif
+
+static inline int kvm_para_has_feature(unsigned int feature)
+{
+ if (kvm_arch_para_features() & (1UL << feature))
+ return 1;
+ return 0;
+}
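+
+/*
+ * Illustrative use (KVM_FEATURE_CLOCKSOURCE comes from the KVM paravirt
+ * ABI and is not defined in this file; use_kvmclock() is a stand-in):
+ *
+ *	if (kvm_para_available() &&
+ *	    kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))
+ *		use_kvmclock();
+ */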
+#endif /* _KERNEL */
+#endif /* __LINUX_KVM_PARA_H */
+
diff --git a/kvm_types.h b/kvm_types.h
new file mode 100644
index 0000000..3901090
--- /dev/null
+++ b/kvm_types.h
@@ -0,0 +1,67 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#ifndef __KVM_TYPES_H__
+#define __KVM_TYPES_H__
+
+
+/*
+ * Address types:
+ *
+ * gva - guest virtual address
+ * gpa - guest physical address
+ * gfn - guest frame number
+ * hva - host virtual address
+ * hpa - host physical address
+ * hfn - host frame number
+ */
+
+typedef unsigned long gva_t;
+typedef uint64_t gpa_t;
+typedef unsigned long gfn_t;
+
+typedef unsigned long hva_t;
+typedef uint64_t hpa_t;
+typedef unsigned long hfn_t;
+
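+/*
+ * One 64-bit IOAPIC redirection-table entry, viewable either as raw bits
+ * or by field: vector occupies bits 0-7, the mask bit is bit 16, and
+ * dest_id is the top byte (bits 56-63).
+ */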
+union kvm_ioapic_redirect_entry {
+ uint64_t bits;
+ struct {
+ uint8_t vector;
+ uint8_t delivery_mode:3;
+ uint8_t dest_mode:1;
+ uint8_t delivery_status:1;
+ uint8_t polarity:1;
+ uint8_t remote_irr:1;
+ uint8_t trig_mode:1;
+ uint8_t mask:1;
+ uint8_t reserve:7;
+ uint8_t reserved[4];
+ uint8_t dest_id;
+ } fields;
+};
+
+struct kvm_lapic_irq {
+ uint32_t vector;
+ uint32_t delivery_mode;
+ uint32_t dest_mode;
+ uint32_t level;
+ uint32_t trig_mode;
+ uint32_t shorthand;
+ uint32_t dest_id;
+};
+
+#endif /* __KVM_TYPES_H__ */
diff --git a/kvm_x86host.h b/kvm_x86host.h
new file mode 100644
index 0000000..9fe6cb7
--- /dev/null
+++ b/kvm_x86host.h
@@ -0,0 +1,964 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This header defines architecture specific interfaces, x86 version
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef _ASM_X86_KVM_HOST_H
+#define _ASM_X86_KVM_HOST_H
+
+#ifdef XXX
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/mmu_notifier.h>
+#include <linux/tracepoint.h>
+
+#include <linux/kvm.h>
+#include <linux/kvm_para.h>
+#include <linux/kvm_types.h>
+
+#include <asm/pvclock-abi.h>
+#include <asm/desc.h>
+#include <asm/mtrr.h>
+#include <asm/msr-index.h>
+
+#endif
+
+#include "kvm_types.h"
+
+#ifdef XXX
+#define KVM_PIO_PAGE_OFFSET 1
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
+#endif /*XXX*/
+
+#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
+#define CR3_NONPAE_RESERVED_BITS ((PAGESIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
+#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
+ 0xFFFFFF0000000000ULL)
+
+#define INVALID_PAGE (~(hpa_t)0)
+#define UNMAPPED_GVA (~(gpa_t)0)
+
+/* KVM Hugepage definitions for x86 */
+#define KVM_NR_PAGE_SIZES 3
+#define KVM_HPAGE_SHIFT(x) (PAGESHIFT + (((x) - 1) * 9))
+#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
+#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
+#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGESIZE)
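+/*
+ * With a 4K base page (PAGESHIFT == 12) these evaluate to 4K, 2M and 1G
+ * pages for levels 1-3, i.e. 1, 512 and 262144 base pages per large page.
+ */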
+
+#define DE_VECTOR 0
+#define DB_VECTOR 1
+#define BP_VECTOR 3
+#define OF_VECTOR 4
+#define BR_VECTOR 5
+#define UD_VECTOR 6
+#define NM_VECTOR 7
+#define DF_VECTOR 8
+#define TS_VECTOR 10
+#define NP_VECTOR 11
+#define SS_VECTOR 12
+#define GP_VECTOR 13
+#define PF_VECTOR 14
+#define MF_VECTOR 16
+#define MC_VECTOR 18
+
+#define SELECTOR_TI_MASK (1 << 2)
+#define SELECTOR_RPL_MASK 0x03
+
+#define IOPL_SHIFT 12
+
+#define KVM_ALIAS_SLOTS 4
+
+#define KVM_PERMILLE_MMU_PAGES 20
+#define KVM_MIN_ALLOC_MMU_PAGES 64
+#define KVM_MMU_HASH_SHIFT 10
+#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
+#define KVM_MIN_FREE_MMU_PAGES 5
+#define KVM_REFILL_PAGES 25
+#define KVM_MAX_CPUID_ENTRIES 40
+#define KVM_NR_FIXED_MTRR_REGION 88
+#define KVM_NR_VAR_MTRR 8
+
+extern kmutex_t kvm_lock;
+extern list_t vm_list;
+
+struct kvm_vcpu;
+struct kvm;
+
+
+enum {
+ VCPU_SREG_ES,
+ VCPU_SREG_CS,
+ VCPU_SREG_SS,
+ VCPU_SREG_DS,
+ VCPU_SREG_FS,
+ VCPU_SREG_GS,
+ VCPU_SREG_TR,
+ VCPU_SREG_LDTR,
+};
+
+enum kvm_reg {
+ VCPU_REGS_RAX = 0,
+ VCPU_REGS_RCX = 1,
+ VCPU_REGS_RDX = 2,
+ VCPU_REGS_RBX = 3,
+ VCPU_REGS_RSP = 4,
+ VCPU_REGS_RBP = 5,
+ VCPU_REGS_RSI = 6,
+ VCPU_REGS_RDI = 7,
+#ifdef CONFIG_X86_64
+ VCPU_REGS_R8 = 8,
+ VCPU_REGS_R9 = 9,
+ VCPU_REGS_R10 = 10,
+ VCPU_REGS_R11 = 11,
+ VCPU_REGS_R12 = 12,
+ VCPU_REGS_R13 = 13,
+ VCPU_REGS_R14 = 14,
+ VCPU_REGS_R15 = 15,
+#endif
+ VCPU_REGS_RIP,
+ NR_VCPU_REGS
+};
+
+enum kvm_reg_ex {
+ VCPU_EXREG_PDPTR = NR_VCPU_REGS,
+};
+
+#include "kvm_emulate.h"
+
+#define KVM_NR_DB_REGS 4
+
+#define DR6_BD (1 << 13)
+#define DR6_BS (1 << 14)
+#define DR6_FIXED_1 0xffff0ff0
+#define DR6_VOLATILE 0x0000e00f
+
+#define DR7_BP_EN_MASK 0x000000ff
+#define DR7_GE (1 << 9)
+#define DR7_GD (1 << 13)
+#define DR7_FIXED_1 0x00000400
+#define DR7_VOLATILE 0xffff23ff
+
+/*
+ * We don't want allocation failures within the mmu code, so we preallocate
+ * enough memory for a single page fault in a cache.
+ */
+#define KVM_NR_MEM_OBJS 40
+
+struct kvm_mmu_memory_cache {
+ int nobjs;
+ void *objects[KVM_NR_MEM_OBJS];
+};
+
+#define NR_PTE_CHAIN_ENTRIES 5
+
+struct kvm_pte_chain {
+ uint64_t *parent_ptes[NR_PTE_CHAIN_ENTRIES];
+ list_t link;
+};
+
+/*
+ * kvm_mmu_page_role, below, is defined as:
+ *
+ * bits 0:3 - total guest paging levels (2-4, or zero for real mode)
+ * bits 4:7 - page table level for this shadow (1-4)
+ * bits 8:9 - page table quadrant for 2-level guests
+ * bit 16 - direct mapping of virtual to physical mapping at gfn
+ * used for real mode and two-dimensional paging
+ * bits 17:19 - common access permissions for all ptes in this shadow page
+ */
+union kvm_mmu_page_role {
+ unsigned word;
+ struct {
+ unsigned glevels:4;
+ unsigned level:4;
+ unsigned quadrant:2;
+ unsigned pad_for_nice_hex_output:6;
+ unsigned direct:1;
+ unsigned access:3;
+ unsigned invalid:1;
+ unsigned cr4_pge:1;
+ unsigned nxe:1;
+ };
+};
+
+struct kvm_mmu_page {
+ list_t link;
+ list_t hash_link;
+
+ list_t oos_link;
+
+ hpa_t kvm__mmu_page_hpa;
+ /*
+ * The following two entries are used to key the shadow page in the
+ * hash table.
+ */
+ gfn_t gfn;
+ union kvm_mmu_page_role role;
+
+ uint64_t *spt;
+ /* hold the gfn of each spte inside spt */
+ gfn_t *gfns;
+ /*
+ * One bit set per slot which has memory
+ * in this shadow page.
+ */
+ unsigned long slot_bitmap[BT_BITOUL(KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)];
+ int multimapped; /* More than one parent_pte? */
+ int root_count; /* Currently serving as active root */
+ int unsync;
+ unsigned int unsync_children;
+ union {
+ uint64_t *parent_pte; /* !multimapped */
+ list_t parent_ptes; /* multimapped, kvm_pte_chain */
+ };
+ unsigned long unsync_child_bitmap[BT_BITOUL(512)];
+};
+
+struct kvm_pv_mmu_op_buffer {
+ void *ptr;
+ unsigned len;
+ unsigned processed;
+ char pad[2];
+ char buf[512];
+};
+
+struct kvm_pio_request {
+ unsigned long count;
+ int cur_count;
+ gva_t guest_gva;
+ int in;
+ int port;
+ int size;
+ int string;
+ int down;
+ int rep;
+};
+
+/*
+ * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
+ * 32-bit). The kvm_mmu structure abstracts the details of the current mmu
+ * mode.
+ */
+struct kvm_mmu {
+ void (*new_cr3)(struct kvm_vcpu *vcpu);
+ int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, uint32_t err);
+ void (*free)(struct kvm_vcpu *vcpu);
+ gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, uint32_t access,
+ uint32_t *error);
+ void (*prefetch_page)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *page);
+ int (*sync_page)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp);
+ void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
+ hpa_t root_hpa;
+ int root_level;
+ int shadow_root_level;
+ union kvm_mmu_page_role base_role;
+
+ uint64_t *pae_root;
+ uint64_t rsvd_bits_mask[2][4];
+};
+
+struct kvm_cpuid_entry2 {
+ uint32_t function;
+ uint32_t index;
+ uint32_t flags;
+ uint32_t eax;
+ uint32_t ebx;
+ uint32_t ecx;
+ uint32_t edx;
+ uint32_t padding[3];
+};
+
+struct x86_emulate_ctxt;
+
+/*
+ * In the Intel processor's MTRR interface, the MTRR type is always held
+ * in an 8-bit field:
+ */
+typedef unsigned char mtrr_type;
+
+#define MTRR_NUM_FIXED_RANGES 88
+#define MTRR_MAX_VAR_RANGES 256
+
+struct mtrr_var_range {
+ uint32_t base_lo;
+ uint32_t base_hi;
+ uint32_t mask_lo;
+ uint32_t mask_hi;
+};
+
+struct mtrr_state_type {
+ struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES];
+ mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES];
+ unsigned char enabled;
+ unsigned char have_fixed;
+ mtrr_type def_type;
+};
+
+struct kvm_vcpu_arch {
+ uint64_t host_tsc;
+ /*
+ * rip and regs accesses must go through
+ * kvm_{register,rip}_{read,write} functions.
+ */
+ unsigned long regs[NR_VCPU_REGS];
+ uint32_t regs_avail;
+ uint32_t regs_dirty;
+
+ unsigned long cr0;
+ unsigned long cr0_guest_owned_bits;
+ unsigned long cr2;
+ unsigned long cr3;
+ unsigned long cr4;
+ unsigned long cr4_guest_owned_bits;
+ unsigned long cr8;
+ uint32_t hflags;
+ uint64_t pdptrs[4]; /* pae */
+ uint64_t efer;
+ uint64_t apic_base;
+ struct kvm_lapic *apic; /* kernel irqchip context */
+ int32_t apic_arb_prio;
+ int mp_state;
+ int sipi_vector;
+ uint64_t ia32_misc_enable_msr;
+ int tpr_access_reporting;
+
+ struct kvm_mmu mmu;
+ /*
+ * Only needed in kvm_pv_mmu_op() path, but it's hot so
+ * put it here to avoid allocation.
+ */
+ struct kvm_pv_mmu_op_buffer mmu_op_buffer;
+
+ struct kvm_mmu_memory_cache mmu_pte_chain_cache;
+ struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
+ struct kvm_mmu_memory_cache mmu_page_cache;
+ struct kvm_mmu_memory_cache mmu_page_header_cache;
+
+ gfn_t last_pt_write_gfn;
+ int last_pt_write_count;
+ uint64_t *last_pte_updated;
+ gfn_t last_pte_gfn;
+
+ struct {
+ gfn_t gfn; /* presumed gfn during guest pte update */
+ pfn_t pfn; /* pfn corresponding to that gfn */
+ unsigned long mmu_seq;
+ } update_pte;
+
+#ifdef XXX
+ struct i387_fxsave_struct host_fx_image;
+ struct i387_fxsave_struct guest_fx_image;
+#endif /*XXX*/
+
+ gva_t mmio_fault_cr2;
+ struct kvm_pio_request pio;
+ void *pio_data;
+
+ uint8_t event_exit_inst_len;
+
+ struct kvm_queued_exception {
+ int pending;
+ int has_error_code;
+ uint8_t nr;
+ uint32_t error_code;
+ } exception;
+
+ struct kvm_queued_interrupt {
+ int pending;
+ int soft;
+ uint8_t nr;
+ } interrupt;
+
+ int halt_request; /* real mode on Intel only */
+
+ int cpuid_nent;
+ struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
+ /* emulate context */
+
+ struct x86_emulate_ctxt emulate_ctxt;
+
+ gpa_t time;
+#ifdef XXX
+ struct pvclock_vcpu_time_info hv_clock;
+#endif /*XXX*/
+ unsigned int hv_clock_tsc_khz;
+ unsigned int time_offset;
+ struct page *time_page;
+
+ int nmi_pending;
+ int nmi_injected;
+
+ struct mtrr_state_type mtrr_state;
+ uint32_t pat;
+
+ int switch_db_regs;
+ unsigned long db[KVM_NR_DB_REGS];
+ unsigned long dr6;
+ unsigned long dr7;
+ unsigned long eff_db[KVM_NR_DB_REGS];
+
+ uint64_t mcg_cap;
+ uint64_t mcg_status;
+ uint64_t mcg_ctl;
+ uint64_t *mce_banks;
+
+ /* used for guest single stepping over the given code position */
+ uint16_t singlestep_cs;
+ unsigned long singlestep_rip;
+ /* fields used by HYPER-V emulation */
+ uint64_t hv_vapic;
+};
+
+struct kvm_mem_alias {
+ gfn_t base_gfn;
+ unsigned long npages;
+ gfn_t target_gfn;
+#define KVM_ALIAS_INVALID 1UL
+ unsigned long flags;
+};
+
+#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION
+
+struct kvm_mem_aliases {
+ struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
+ int naliases;
+};
+
+struct kvm_xen_hvm_config {
+ uint32_t flags;
+ uint32_t msr;
+ uint64_t blob_addr_32;
+ uint64_t blob_addr_64;
+ unsigned char blob_size_32;
+ unsigned char blob_size_64;
+ unsigned char pad2[30];
+};
+
+struct kvm_arch {
+ struct kvm_mem_aliases *aliases;
+
+ unsigned int n_free_mmu_pages;
+ unsigned int n_requested_mmu_pages;
+ unsigned int n_alloc_mmu_pages;
+ /*
+ * Hash table of struct kvm_mmu_page.
+ */
+ list_t mmu_page_hash[KVM_NUM_MMU_PAGES];
+ list_t active_mmu_pages;
+ list_t assigned_dev_head;
+ struct iommu_domain *iommu_domain;
+ int iommu_flags;
+ struct kvm_pic *vpic;
+ struct kvm_ioapic *vioapic;
+ struct kvm_pit *vpit;
+ int vapics_in_nmi_mode;
+
+ unsigned int tss_addr;
+ struct page *apic_access_page;
+
+ gpa_t wall_clock;
+
+ struct page *ept_identity_pagetable;
+ int ept_identity_pagetable_done;
+ gpa_t ept_identity_map_addr;
+
+ unsigned long irq_sources_bitmap;
+ uint64_t vm_init_tsc;
+ int64_t kvmclock_offset;
+
+ struct kvm_xen_hvm_config xen_hvm_config;
+
+ /* fields used by HYPER-V emulation */
+ uint64_t hv_guest_os_id;
+ uint64_t hv_hypercall;
+};
+
+struct kvm_vm_stat {
+ uint32_t mmu_shadow_zapped;
+ uint32_t mmu_pte_write;
+ uint32_t mmu_pte_updated;
+ uint32_t mmu_pde_zapped;
+ uint32_t mmu_flooded;
+ uint32_t mmu_recycled;
+ uint32_t mmu_cache_miss;
+ uint32_t mmu_unsync;
+ uint32_t remote_tlb_flush;
+ uint32_t lpages;
+};
+
+struct kvm_vcpu_stat {
+ uint32_t pf_fixed;
+ uint32_t pf_guest;
+ uint32_t tlb_flush;
+ uint32_t invlpg;
+
+ uint32_t exits;
+ uint32_t io_exits;
+ uint32_t mmio_exits;
+ uint32_t signal_exits;
+ uint32_t irq_window_exits;
+ uint32_t nmi_window_exits;
+ uint32_t halt_exits;
+ uint32_t halt_wakeup;
+ uint32_t request_irq_exits;
+ uint32_t irq_exits;
+ uint32_t host_state_reload;
+ uint32_t efer_reload;
+ uint32_t fpu_reload;
+ uint32_t insn_emulation;
+ uint32_t insn_emulation_fail;
+ uint32_t hypercalls;
+ uint32_t irq_injections;
+ uint32_t nmi_injections;
+};
+
+#define KVM_GUESTDBG_USE_SW_BP 0x00010000
+#define KVM_GUESTDBG_USE_HW_BP 0x00020000
+#define KVM_GUESTDBG_INJECT_DB 0x00040000
+#define KVM_GUESTDBG_INJECT_BP 0x00080000
+
+/* for KVM_SET_GUEST_DEBUG */
+struct kvm_guest_debug_arch {
+ uint64_t debugreg[8];
+};
+
+/* for KVM_SET_GUEST_DEBUG */
+
+#define KVM_GUESTDBG_ENABLE 0x00000001
+#define KVM_GUESTDBG_SINGLESTEP 0x00000002
+
+struct kvm_guest_debug {
+ uint32_t control;
+ uint32_t pad;
+ struct kvm_guest_debug_arch arch;
+};
+
+struct kvm_segment {
+ uint64_t base;
+ uint32_t limit;
+ unsigned short selector;
+ unsigned char type;
+ unsigned char present, dpl, db, s, l, g, avl;
+ unsigned char unusable;
+ unsigned char padding;
+};
+
+struct descriptor_table {
+ unsigned short limit;
+ unsigned long base;
+} __attribute__((packed));
+
+struct kvm_x86_ops {
+ int (*cpu_has_kvm_support)(void); /* __init */
+ int (*disabled_by_bios)(void); /* __init */
+ int (*hardware_enable)(void *dummy);
+ void (*hardware_disable)(void *dummy);
+ void (*check_processor_compatibility)(void *rtn);
+ int (*hardware_setup)(void); /* __init */
+ void (*hardware_unsetup)(void); /* __exit */
+ int (*cpu_has_accelerated_tpr)(void);
+ void (*cpuid_update)(struct kvm_vcpu *vcpu);
+
+ /* Create, but do not attach this VCPU */
+ struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
+ void (*vcpu_free)(struct kvm_vcpu *vcpu);
+ int (*vcpu_reset)(struct kvm_vcpu *vcpu);
+
+ void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
+ void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
+ void (*vcpu_put)(struct kvm_vcpu *vcpu);
+
+ void (*set_guest_debug)(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug *dbg);
+ int (*get_msr)(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t *pdata);
+ int (*set_msr)(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t data);
+ uint64_t (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
+ void (*get_segment)(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg);
+ int (*get_cpl)(struct kvm_vcpu *vcpu);
+ void (*set_segment)(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg);
+ void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
+ void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
+ void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
+ void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
+ void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
+ void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
+ void (*set_efer)(struct kvm_vcpu *vcpu, uint64_t efer);
+ void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+ void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+ void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+ void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+ int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest);
+ int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value);
+ void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
+ unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
+ void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+ void (*fpu_activate)(struct kvm_vcpu *vcpu);
+ void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
+
+ void (*tlb_flush)(struct kvm_vcpu *vcpu);
+
+ void (*run)(struct kvm_vcpu *vcpu);
+ int (*handle_exit)(struct kvm_vcpu *vcpu);
+ void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+ void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
+ uint32_t (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
+ void (*patch_hypercall)(struct kvm_vcpu *vcpu,
+ unsigned char *hypercall_addr);
+ void (*set_irq)(struct kvm_vcpu *vcpu);
+ void (*set_nmi)(struct kvm_vcpu *vcpu);
+ void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
+ int has_error_code, uint32_t error_code);
+ int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
+ int (*nmi_allowed)(struct kvm_vcpu *vcpu);
+ int (*get_nmi_mask)(struct kvm_vcpu *vcpu);
+ void (*set_nmi_mask)(struct kvm_vcpu *vcpu, int masked);
+ void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
+ void (*enable_irq_window)(struct kvm_vcpu *vcpu);
+ void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
+ int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
+ int (*get_tdp_level)(void);
+ uint64_t (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, int is_mmio);
+ int (*get_lpage_level)(void);
+ int (*rdtscp_supported)(void);
+
+ const struct trace_print_flags *exit_reasons_str;
+};
+
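+/*
+ * Exactly one instance of kvm_x86_ops is supplied by the hardware-specific
+ * backend (Intel VMX or AMD SVM) and selected when the module loads.
+ */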
+extern struct kvm_x86_ops *kvm_x86_ops;
+
+int kvm_mmu_module_init(void);
+void kvm_mmu_module_exit(void);
+
+void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
+int kvm_mmu_create(struct kvm_vcpu *vcpu);
+int kvm_mmu_setup(struct kvm_vcpu *vcpu);
+void kvm_mmu_set_nonpresent_ptes(uint64_t trap_pte, uint64_t notrap_pte);
+void kvm_mmu_set_base_ptes(uint64_t base_pte);
+void kvm_mmu_set_mask_ptes(uint64_t user_mask, uint64_t accessed_mask,
+ uint64_t dirty_mask, uint64_t nx_mask, uint64_t x_mask);
+
+int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
+void kvm_mmu_zap_all(struct kvm *kvm);
+unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
+#ifdef XXX
+int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const void *val, int bytes);
+int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
+ gpa_t addr, unsigned long *ret);
+uint8_t kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
+
+extern int tdp_enabled;
+#endif /*XXX*/
+
+enum emulation_result {
+ EMULATE_DONE, /* no further processing */
+ EMULATE_DO_MMIO, /* kvm_run filled with mmio request */
+ EMULATE_FAIL, /* can't emulate this instruction */
+};
+
+#define EMULTYPE_NO_DECODE (1 << 0)
+#define EMULTYPE_TRAP_UD (1 << 1)
+#define EMULTYPE_SKIP (1 << 2)
+
+#ifdef XXX
+int emulate_instruction(struct kvm_vcpu *vcpu,
+ unsigned long cr2, uint16_t error_code, int emulation_type);
+void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context);
+void realmode_lgdt(struct kvm_vcpu *vcpu, uint16_t size, unsigned long address);
+void realmode_lidt(struct kvm_vcpu *vcpu, uint16_t size, unsigned long address);
+void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
+ unsigned long *rflags);
+
+unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
+void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
+ unsigned long *rflags);
+void kvm_enable_efer_bits(uint64_t);
+int kvm_get_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t *data);
+int kvm_set_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t data);
+
+
+int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in,
+ int size, unsigned port);
+int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
+ int size, unsigned long count, int down,
+ gva_t address, int rep, unsigned port);
+void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
+int kvm_emulate_halt(struct kvm_vcpu *vcpu);
+int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
+int emulate_clts(struct kvm_vcpu *vcpu);
+int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
+ unsigned long *dest);
+int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
+ unsigned long value);
+
+void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector, int seg);
+
+int kvm_task_switch(struct kvm_vcpu *vcpu, uint16_t tss_selector, int reason);
+
+void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
+void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
+unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
+void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
+void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata);
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data);
+
+unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
+void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+
+#ifdef XXX
+void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
+void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, uint32_t error_code);
+#endif /*XXX*/
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
+ uint32_t error_code);
+int kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
+
+int kvm_pic_set_irq(void *opaque, int irq, int level);
+
+void kvm_inject_nmi(struct kvm_vcpu *vcpu);
+
+void fx_init(struct kvm_vcpu *vcpu);
+
+int emulator_write_emulated(unsigned long addr,
+ const void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu);
+
+
+void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
+void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const uint8_t *new, int bytes,
+ int guest_initiated);
+int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
+void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
+int kvm_mmu_load(struct kvm_vcpu *vcpu);
+void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, uint32_t *error);
+gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, uint32_t *error);
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, uint32_t *error);
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, uint32_t *error);
+
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
+
+int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
+
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, uint32_t error_code);
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
+
+void kvm_enable_tdp(void);
+void kvm_disable_tdp(void);
+
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
+int complete_pio(struct kvm_vcpu *vcpu);
+int kvm_check_iopl(struct kvm_vcpu *vcpu);
+
+struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
+
+static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
+{
+ struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
+
+ return (struct kvm_mmu_page *)page_private(page);
+}
+
+#endif /*XXX*/
+
+static inline unsigned short kvm_read_fs(void)
+{
+ unsigned short seg;
+ asm("mov %%fs, %0" : "=g"(seg));
+ return seg;
+}
+
+static inline unsigned short kvm_read_gs(void)
+{
+ unsigned short seg;
+ asm("mov %%gs, %0" : "=g"(seg));
+ return seg;
+}
+
+static inline unsigned short kvm_read_ldt(void)
+{
+ unsigned short ldt;
+ asm("sldt %0" : "=g"(ldt));
+ return ldt;
+}
+
+static inline void kvm_load_fs(unsigned short sel)
+{
+ asm("mov %0, %%fs" : : "rm"(sel));
+}
+
+static inline void kvm_load_gs(unsigned short sel)
+{
+ asm("mov %0, %%gs" : : "rm"(sel));
+}
+
+static inline void kvm_load_ldt(unsigned short sel)
+{
+ asm("lldt %0" : : "rm"(sel));
+}
+
+
+static inline void kvm_get_idt(struct descriptor_table *table)
+{
+ asm("sidt %0" : "=m"(*table));
+}
+
+static inline void kvm_get_gdt(struct descriptor_table *table)
+{
+ asm("sgdt %0" : "=m"(*table));
+}
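+
+/*
+ * Illustrative usage (not from the original source): snapshot a host
+ * table before switching the CPU into guest context, e.g.
+ *
+ *    struct descriptor_table dt;
+ *    kvm_get_gdt(&dt);
+ *
+ * after which dt.base and dt.limit describe the live host GDT.
+ */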
+
+/*
+ * FIXME: Accessing the desc_struct through its fields is more elegant,
+ * and should be the only valid way to do it. However, a lot of open code
+ * still touches the a and b accessors, and doing this allows us to convert
+ * it incrementally. We keep the signature as a struct, rather than a union,
+ * so we can get rid of it transparently in the future -- glommer
+ */
+/* 8 byte segment descriptor */
+struct desc_struct {
+ union {
+ struct {
+ unsigned int a;
+ unsigned int b;
+ } a;
+ struct {
+ unsigned short limit0;
+ unsigned short base0;
+ unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1;
+ unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;
+ } b;
+ } c;
+} __attribute__((packed));
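+
+/*
+ * Example (illustrative): the two views alias the same 8 bytes, so a
+ * descriptor whose raw words are a == 0x0000ffff and b == 0x00cf9a00
+ * reads back through the bitfields as limit0 == 0xffff, type == 0xa,
+ * s == 1, dpl == 0, p == 1, limit == 0xf, g == 1 -- a flat ring-0 code
+ * segment.
+ */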
+
+static inline unsigned long get_desc_base(const struct desc_struct *desc)
+{
+ return (unsigned)(desc->c.b.base0 | ((desc->c.b.base1) << 16) | ((desc->c.b.base2) << 24));
+}
+
+extern unsigned long segment_base(uint16_t selector);
+
+static inline unsigned long kvm_read_tr_base(void)
+{
+ unsigned short tr;
+ asm("str %0" : "=g"(tr));
+ return segment_base(tr);
+}
+
+#ifdef CONFIG_X86_64
+static inline unsigned long read_msr(unsigned long msr)
+{
+ uint64_t value;
+
+ rdmsrl(msr, value);
+ return value;
+}
+#endif
+
+#ifdef XXX
+static inline void kvm_fx_save(struct i387_fxsave_struct *image)
+{
+ asm("fxsave (%0)":: "r" (image));
+}
+
+static inline void kvm_fx_restore(struct i387_fxsave_struct *image)
+{
+ asm("fxrstor (%0)":: "r" (image));
+}
+
+static inline void kvm_fx_finit(void)
+{
+ asm("finit");
+}
+#endif /*XXX*/
+static inline uint32_t get_rdx_init_val(void)
+{
+ return 0x600; /* P6 family */
+}
+
+static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, uint32_t error_code)
+{
+#ifdef XXX
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+#endif /*XXX*/
+}
+
+#define TSS_IOPB_BASE_OFFSET 0x66
+#define TSS_BASE_SIZE 0x68
+#define TSS_IOPB_SIZE (65536 / 8)
+#define TSS_REDIRECTION_SIZE (256 / 8)
+#define RMODE_TSS_SIZE \
+ (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
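+
+/*
+ * That works out to 0x68 + 0x20 + 0x2000 + 1 = 0x2089 bytes; the final
+ * byte holds the 0xff terminator the hardware expects after the I/O
+ * permission bitmap.
+ */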
+
+enum {
+ TASK_SWITCH_CALL = 0,
+ TASK_SWITCH_IRET = 1,
+ TASK_SWITCH_JMP = 2,
+ TASK_SWITCH_GATE = 3,
+};
+
+#define HF_GIF_MASK (1 << 0)
+#define HF_HIF_MASK (1 << 1)
+#define HF_VINTR_MASK (1 << 2)
+#define HF_NMI_MASK (1 << 3)
+#define HF_IRET_MASK (1 << 4)
+
+/*
+ * Hardware virtualization extension instructions may fault if a
+ * reboot turns off virtualization while processes are running.
+ * Trap the fault and ignore the instruction if that happens.
+ */
+
+#ifdef XXX
+#include "linkage.h"
+
+asmlinkage void kvm_handle_fault_on_reboot(void);
+
+#endif
+
+
+#define __kvm_handle_fault_on_reboot(insn) \
+ "666: " insn "\n\t" \
+ ".pushsection .fixup, \"ax\" \n" \
+ "667: \n\t" \
+ __ASM_SIZE(push) " $666b \n\t" \
+ "jmp kvm_handle_fault_on_reboot \n\t" \
+ ".popsection \n\t" \
+ ".pushsection __ex_table, \"a\" \n\t" \
+ _ASM_PTR " 666b, 667b \n\t" \
+ ".popsection \n\t"
+
+#define KVM_ARCH_WANT_MMU_NOTIFIER
+
+#ifdef XXX
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
+int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
+int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
+
+void kvm_define_shared_msr(unsigned index, uint32_t msr);
+void kvm_set_shared_msr(unsigned index, uint64_t val, uint64_t mask);
+#endif /*XXX*/
+#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/segment.h b/segment.h
new file mode 100644
index 0000000..a941bd6
--- /dev/null
+++ b/segment.h
@@ -0,0 +1,217 @@
+#ifndef _ASM_X86_SEGMENT_H
+#define _ASM_X86_SEGMENT_H
+
+/* Constructor for a conventional segment GDT (or LDT) entry */
+/* This is a macro so it can be used in initializers */
+#define GDT_ENTRY(flags, base, limit) \
+ ((((base) & 0xff000000ULL) << (56-24)) | \
+ (((flags) & 0x0000f0ffULL) << 40) | \
+ (((limit) & 0x000f0000ULL) << (48-16)) | \
+ (((base) & 0x00ffffffULL) << 16) | \
+ (((limit) & 0x0000ffffULL)))
+
+/* Simple and small GDT entries for booting only */
+
+#define GDT_ENTRY_BOOT_CS 2
+#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
+
+#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
+#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
+
+#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2)
+#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8)
+
+#ifdef CONFIG_X86_32
+/*
+ * The layout of the per-CPU GDT under Linux:
+ *
+ * 0 - null
+ * 1 - reserved
+ * 2 - reserved
+ * 3 - reserved
+ *
+ * 4 - unused <==== new cacheline
+ * 5 - unused
+ *
+ * ------- start of TLS (Thread-Local Storage) segments:
+ *
+ * 6 - TLS segment #1 [ glibc's TLS segment ]
+ * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
+ * 8 - TLS segment #3
+ * 9 - reserved
+ * 10 - reserved
+ * 11 - reserved
+ *
+ * ------- start of kernel segments:
+ *
+ * 12 - kernel code segment <==== new cacheline
+ * 13 - kernel data segment
+ * 14 - default user CS
+ * 15 - default user DS
+ * 16 - TSS
+ * 17 - LDT
+ * 18 - PNPBIOS support (16->32 gate)
+ * 19 - PNPBIOS support
+ * 20 - PNPBIOS support
+ * 21 - PNPBIOS support
+ * 22 - PNPBIOS support
+ * 23 - APM BIOS support
+ * 24 - APM BIOS support
+ * 25 - APM BIOS support
+ *
+ * 26 - ESPFIX small SS
+ * 27 - per-cpu [ offset to per-cpu data area ]
+ * 28 - stack_canary-20 [ for stack protector ]
+ * 29 - unused
+ * 30 - unused
+ * 31 - TSS for double fault handler
+ */
+#define GDT_ENTRY_TLS_MIN 6
+#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
+
+#define GDT_ENTRY_DEFAULT_USER_CS 14
+
+#define GDT_ENTRY_DEFAULT_USER_DS 15
+
+#define GDT_ENTRY_KERNEL_BASE 12
+
+#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
+
+#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
+
+#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
+#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
+
+#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
+#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
+
+#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
+#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
+
+#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
+#ifdef CONFIG_SMP
+#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
+#else
+#define __KERNEL_PERCPU 0
+#endif
+
+#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE + 16)
+#ifdef CONFIG_CC_STACKPROTECTOR
+#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY * 8)
+#else
+#define __KERNEL_STACK_CANARY 0
+#endif
+
+#define GDT_ENTRY_DOUBLEFAULT_TSS 31
+
+/*
+ * The GDT has 32 entries
+ */
+#define GDT_ENTRIES 32
+
+/* The PnP BIOS entries in the GDT */
+#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
+#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
+#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
+#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
+#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
+
+/* The PnP BIOS selectors */
+#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
+#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
+#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
+#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
+#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
+
+/* Bottom two bits of selector give the ring privilege level */
+#define SEGMENT_RPL_MASK 0x3
+/* Bit 2 is table indicator (LDT/GDT) */
+#define SEGMENT_TI_MASK 0x4
+
+/* User mode is privilege level 3 */
+#define USER_RPL 0x3
+/* LDT segment has TI set, GDT has it cleared */
+#define SEGMENT_LDT 0x4
+#define SEGMENT_GDT 0x0
+
+/*
+ * Matching rules for certain types of segments.
+ */
+
+/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
+#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
+
+
+#else
+#ifdef XXX
+#include <asm/cache.h>
+#endif /*XXX*/
+#define GDT_ENTRY_KERNEL32_CS 1
+#define GDT_ENTRY_KERNEL_CS 2
+#define GDT_ENTRY_KERNEL_DS 3
+
+#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8)
+
+/*
+ * We cannot use the same code segment descriptor for user and kernel
+ * -- not even in long flat mode, because of the different DPL /kkeil
+ * The segment offset needs to contain an RPL. Grr. -AK
+ * GDT layout to get 64-bit syscall right (sysret hardcodes gdt offsets)
+ */
+#define GDT_ENTRY_DEFAULT_USER32_CS 4
+#define GDT_ENTRY_DEFAULT_USER_DS 5
+#define GDT_ENTRY_DEFAULT_USER_CS 6
+#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3)
+#define __USER32_DS __USER_DS
+
+#define GDT_ENTRY_TSS 8 /* needs two entries */
+#define GDT_ENTRY_LDT 10 /* needs two entries */
+#define GDT_ENTRY_TLS_MIN 12
+#define GDT_ENTRY_TLS_MAX 14
+
+#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */
+#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3)
+
+/* TLS indexes for 64bit - hardcoded in arch_prctl */
+#define FS_TLS 0
+#define GS_TLS 1
+
+#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
+#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
+
+#define GDT_ENTRIES 16
+
+#endif
+
+#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
+#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
+#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
+#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
+#ifndef CONFIG_PARAVIRT
+#define get_kernel_rpl() 0
+#endif
+
+/* User mode is privilege level 3 */
+#define USER_RPL 0x3
+/* LDT segment has TI set, GDT has it cleared */
+#define SEGMENT_LDT 0x4
+#define SEGMENT_GDT 0x0
+
+/* Bottom two bits of selector give the ring privilege level */
+#define SEGMENT_RPL_MASK 0x3
+/* Bit 2 is table indicator (LDT/GDT) */
+#define SEGMENT_TI_MASK 0x4
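+
+/*
+ * Example (illustrative): in the 64-bit layout above, __USER_DS is
+ * 5*8 + 3 == 0x2b: GDT descriptor index 5 (the SEGMENT_TI_MASK bit is
+ * clear) with an RPL of USER_RPL (3) in the low two bits.
+ */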
+
+#define IDT_ENTRIES 256
+#define NUM_EXCEPTION_VECTORS 32
+#define GDT_SIZE (GDT_ENTRIES * 8)
+#define GDT_ENTRY_TLS_ENTRIES 3
+#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
+
+#ifdef __KERNEL__
+#ifndef __ASSEMBLY__
+extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][10];
+#endif
+#endif
+
+#endif /* _ASM_X86_SEGMENT_H */
diff --git a/tss.h b/tss.h
new file mode 100644
index 0000000..4d95c26
--- /dev/null
+++ b/tss.h
@@ -0,0 +1,59 @@
+#ifndef __TSS_SEGMENT_H
+#define __TSS_SEGMENT_H
+
+struct tss_segment_32 {
+ uint32_t prev_task_link;
+ uint32_t esp0;
+ uint32_t ss0;
+ uint32_t esp1;
+ uint32_t ss1;
+ uint32_t esp2;
+ uint32_t ss2;
+ uint32_t cr3;
+ uint32_t eip;
+ uint32_t eflags;
+ uint32_t eax;
+ uint32_t ecx;
+ uint32_t edx;
+ uint32_t ebx;
+ uint32_t esp;
+ uint32_t ebp;
+ uint32_t esi;
+ uint32_t edi;
+ uint32_t es;
+ uint32_t cs;
+ uint32_t ss;
+ uint32_t ds;
+ uint32_t fs;
+ uint32_t gs;
+ uint32_t ldt_selector;
+ uint16_t t;
+ uint16_t io_map;
+};
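+
+/*
+ * Layout check (illustrative): the 25 leading dword fields put t at
+ * offset 0x64 and io_map at offset 0x66, and sizeof (struct
+ * tss_segment_32) == 0x68 -- matching TSS_IOPB_BASE_OFFSET and
+ * TSS_BASE_SIZE in kvm_x86host.h.
+ */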
+
+struct tss_segment_16 {
+ uint16_t prev_task_link;
+ uint16_t sp0;
+ uint16_t ss0;
+ uint16_t sp1;
+ uint16_t ss1;
+ uint16_t sp2;
+ uint16_t ss2;
+ uint16_t ip;
+ uint16_t flag;
+ uint16_t ax;
+ uint16_t cx;
+ uint16_t dx;
+ uint16_t bx;
+ uint16_t sp;
+ uint16_t bp;
+ uint16_t si;
+ uint16_t di;
+ uint16_t es;
+ uint16_t cs;
+ uint16_t ss;
+ uint16_t ds;
+ uint16_t ldt;
+};
+
+#endif