summaryrefslogtreecommitdiff
path: root/usr/src/uts/i86pc/sys
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/uts/i86pc/sys')
-rw-r--r--usr/src/uts/i86pc/sys/Makefile15
-rw-r--r--usr/src/uts/i86pc/sys/apic.h2
-rw-r--r--usr/src/uts/i86pc/sys/comm_page.h1
-rw-r--r--usr/src/uts/i86pc/sys/gipt.h92
-rw-r--r--usr/src/uts/i86pc/sys/hma.h39
-rw-r--r--usr/src/uts/i86pc/sys/machcpuvar.h8
-rw-r--r--usr/src/uts/i86pc/sys/machparam.h190
-rw-r--r--usr/src/uts/i86pc/sys/machsystm.h2
-rw-r--r--usr/src/uts/i86pc/sys/ppt_dev.h56
-rw-r--r--usr/src/uts/i86pc/sys/viona_io.h63
-rw-r--r--usr/src/uts/i86pc/sys/vm_machparam.h4
-rw-r--r--usr/src/uts/i86pc/sys/vmm.h748
-rw-r--r--usr/src/uts/i86pc/sys/vmm_dev.h520
-rw-r--r--usr/src/uts/i86pc/sys/vmm_drv.h53
-rw-r--r--usr/src/uts/i86pc/sys/vmm_impl.h89
-rw-r--r--usr/src/uts/i86pc/sys/vmm_instruction_emul.h137
16 files changed, 1877 insertions, 142 deletions
diff --git a/usr/src/uts/i86pc/sys/Makefile b/usr/src/uts/i86pc/sys/Makefile
index 292cd04c2b..3d8332a930 100644
--- a/usr/src/uts/i86pc/sys/Makefile
+++ b/usr/src/uts/i86pc/sys/Makefile
@@ -37,7 +37,7 @@ include ../Makefile.i86pc
#
FILEMODE = 644
-HDRS= \
+CHKHDRS= \
acpidev.h \
amd_iommu.h \
asm_misc.h \
@@ -46,6 +46,7 @@ HDRS= \
ddi_subrdefs.h \
debug_info.h \
fastboot.h \
+ hma.h \
mach_mmu.h \
machclock.h \
machcpuvar.h \
@@ -68,6 +69,16 @@ HDRS= \
xc_levels.h \
xsvc.h
+NOCHKHDRS= \
+ vmm.h \
+ vmm_dev.h \
+ vmm_impl.h \
+ vmm_instruction_emul.h
+
+HDRS= \
+ $(CHKHDRS) \
+ $(NOCHKHDRS)
+
ROOTHDRS= $(HDRS:%=$(USR_PSM_ISYS_DIR)/%)
ROOTDIR= $(ROOT)/usr/share/src
@@ -76,7 +87,7 @@ ROOTDIRS= $(ROOTDIR)/uts $(ROOTDIR)/uts/$(PLATFORM)
ROOTLINK= $(ROOTDIR)/uts/$(PLATFORM)/sys
LINKDEST= ../../../../platform/$(PLATFORM)/include/sys
-CHECKHDRS= $(HDRS:%.h=%.check)
+CHECKHDRS= $(CHKHDRS:%.h=%.check)
.KEEP_STATE:
diff --git a/usr/src/uts/i86pc/sys/apic.h b/usr/src/uts/i86pc/sys/apic.h
index 26626ec5a4..f2528a632f 100644
--- a/usr/src/uts/i86pc/sys/apic.h
+++ b/usr/src/uts/i86pc/sys/apic.h
@@ -386,7 +386,7 @@ struct apic_io_intr {
/* special or reserve vectors */
#define APIC_CHECK_RESERVE_VECTORS(v) \
(((v) == T_FASTTRAP) || ((v) == APIC_SPUR_INTR) || \
- ((v) == T_SYSCALLINT) || ((v) == T_DTRACE_RET))
+ ((v) == T_SYSCALLINT) || ((v) == T_DTRACE_RET) || ((v) == 0x80))
/* cmos shutdown code for BIOS */
#define BIOS_SHUTDOWN 0x0a
diff --git a/usr/src/uts/i86pc/sys/comm_page.h b/usr/src/uts/i86pc/sys/comm_page.h
index 520ad9001d..ea19c856a8 100644
--- a/usr/src/uts/i86pc/sys/comm_page.h
+++ b/usr/src/uts/i86pc/sys/comm_page.h
@@ -27,6 +27,7 @@ extern "C" {
#endif
#define COMM_PAGE_SIZE PAGESIZE
+#define COMM_PAGE_ALIGN 0x4000
#ifndef _ASM
diff --git a/usr/src/uts/i86pc/sys/gipt.h b/usr/src/uts/i86pc/sys/gipt.h
new file mode 100644
index 0000000000..4d7d523726
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/gipt.h
@@ -0,0 +1,92 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#ifndef _GIPT_H_
+#define _GIPT_H_
+
+#include <sys/types.h>
+#include <sys/mutex.h>
+#include <sys/param.h>
+#include <sys/list.h>
+
+struct gipt { /* one table of 64-bit PTEs tracked by a gipt_map */
+ list_node_t gipt_node;
+ uint64_t gipt_vaddr; /* base VA; GIPT_VA2IDX() subtracts it from va */
+ uint64_t gipt_pfn;
+ uint16_t gipt_level; /* index into gipt_level_shift[] */
+ uint16_t gipt_valid_cnt;
+ uint32_t _gipt_pad;
+ struct gipt *gipt_parent;
+ uint64_t *gipt_kva; /* PTE array, indexed via GIPT_VA2IDX() */
+ uint64_t _gipt_pad2;
+};
+typedef struct gipt gipt_t;
+
+typedef enum { /* classification returned by a gipt_pte_type_cb_t */
+ PTET_EMPTY = 0,
+ PTET_PAGE = 1, /* presumably a leaf page mapping -- confirm */
+ PTET_LINK = 2, /* presumably a reference to a child table -- confirm */
+} gipt_pte_type_t;
+
+/* Given a PTE and its level, determine the type of that PTE */
+typedef gipt_pte_type_t (*gipt_pte_type_cb_t)(uint64_t, uint_t); /* (pte, level) */
+/* Given the PFN of a child table, emit a PTE that references it */
+typedef uint64_t (*gipt_pte_map_cb_t)(uint64_t); /* (child table PFN) */
+
+struct gipt_cbs { /* callbacks handed to gipt_map_init(); kept in gipt_map */
+ gipt_pte_type_cb_t giptc_pte_type;
+ gipt_pte_map_cb_t giptc_pte_map;
+};
+
+struct gipt_map { /* a set of gipt_t tables, operated on by gipt_map_*() */
+ kmutex_t giptm_lock;
+ gipt_t *giptm_root;
+ list_t *giptm_hash; /* presumably an array of hash buckets -- confirm */
+ struct gipt_cbs giptm_cbs; /* copy of the PTE type/map callbacks */
+ size_t giptm_table_cnt;
+ uint_t giptm_levels;
+};
+typedef struct gipt_map gipt_map_t;
+
+#define GIPT_HASH_SIZE_DEFAULT 0x2000
+#define GIPT_MAX_LEVELS 4 /* gipt_level_*[] hold GIPT_MAX_LEVELS+1 entries */
+
+#define GIPT_VA2IDX(pt, va) \
+ (((va) - (pt)->gipt_vaddr) >> \
+ gipt_level_shift[(pt)->gipt_level]) /* entry index of va within table pt; evaluates pt twice */
+
+#define GIPT_VA2PTE(pt, va) ((pt)->gipt_kva[GIPT_VA2IDX(pt, va)]) /* the PTE itself (an lvalue) */
+#define GIPT_VA2PTEP(pt, va) (&(pt)->gipt_kva[GIPT_VA2IDX(pt, va)]) /* address of that PTE */
+
+extern const uint_t gipt_level_shift[GIPT_MAX_LEVELS+1]; /* indexed by gipt_level; used by GIPT_VA2IDX() */
+extern const uint64_t gipt_level_mask[GIPT_MAX_LEVELS+1]; /* same dimension as gipt_level_shift[] */
+extern const uint64_t gipt_level_size[GIPT_MAX_LEVELS+1];
+extern const uint64_t gipt_level_count[GIPT_MAX_LEVELS+1];
+
+extern gipt_t *gipt_alloc(void);
+extern void gipt_free(gipt_t *);
+extern void gipt_map_init(gipt_map_t *, uint_t, uint_t,
+ const struct gipt_cbs *, gipt_t *);
+extern void gipt_map_fini(gipt_map_t *);
+extern gipt_t *gipt_map_lookup(gipt_map_t *, uint64_t, uint_t);
+extern gipt_t *gipt_map_lookup_deepest(gipt_map_t *, uint64_t);
+extern uint64_t gipt_map_next_page(gipt_map_t *, uint64_t, uint64_t,
+ gipt_t **);
+extern void gipt_map_insert(gipt_map_t *, gipt_t *);
+extern void gipt_map_remove(gipt_map_t *, gipt_t *);
+extern gipt_t *gipt_map_create_parents(gipt_map_t *, uint64_t, uint_t);
+extern void gipt_map_clean_parents(gipt_map_t *, gipt_t *);
+
+#endif /* _GIPT_H_ */
diff --git a/usr/src/uts/i86pc/sys/hma.h b/usr/src/uts/i86pc/sys/hma.h
index 00009cf439..16ab708896 100644
--- a/usr/src/uts/i86pc/sys/hma.h
+++ b/usr/src/uts/i86pc/sys/hma.h
@@ -10,7 +10,7 @@
*/
/*
- * Copyright (c) 2018, Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
#ifndef _SYS_HMA_H
@@ -30,6 +30,40 @@
extern "C" {
#endif
+
+/*
+ * Register a hypervisor with HMA. On success, a pointer to the opaque
+ * registration token will be returned, indicating that proper host setup has
+ * occurred for further hypervisor actions.
+ */
+typedef struct hma_reg hma_reg_t;
+extern hma_reg_t *hma_register(const char *);
+extern hma_reg_t *hma_register_exclusive(const char *);
+extern void hma_unregister(hma_reg_t *);
+
+/*
+ * Allocate or free a VPID for use with VMX.
+ *
+ * This must not be performed by a hypervisor until it has successfully
+ * registered via hma_register().
+ */
+extern uint16_t hma_vmx_vpid_alloc(void);
+extern void hma_vmx_vpid_free(uint16_t);
+
+/*
+ * On all active CPUs, perform a single-context INVEPT on the given EPTP.
+ */
+extern void hma_vmx_invept_allcpus(uintptr_t);
+
+struct hma_svm_asid {
+ uint64_t hsa_gen;
+ uint32_t hsa_asid;
+};
+typedef struct hma_svm_asid hma_svm_asid_t;
+
+extern void hma_svm_asid_init(hma_svm_asid_t *);
+extern uint8_t hma_svm_asid_update(hma_svm_asid_t *, boolean_t, boolean_t);
+
/*
* FPU related management. These functions provide a set of APIs to manage the
* FPU state and switch between host and guest management of this state.
@@ -96,6 +130,9 @@ extern void hma_fpu_stop_guest(hma_fpu_t *);
extern void hma_fpu_get_fxsave_state(const hma_fpu_t *, struct fxsave_state *);
extern int hma_fpu_set_fxsave_state(hma_fpu_t *, const struct fxsave_state *);
+/* Perform HMA initialization steps during boot-up. */
+extern void hma_init(void);
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/i86pc/sys/machcpuvar.h b/usr/src/uts/i86pc/sys/machcpuvar.h
index f4e38dec98..772f3112cb 100644
--- a/usr/src/uts/i86pc/sys/machcpuvar.h
+++ b/usr/src/uts/i86pc/sys/machcpuvar.h
@@ -81,6 +81,12 @@ struct xen_evt_data {
ulong_t evt_affinity[sizeof (ulong_t) * 8]; /* service on cpu */
};
+enum fast_syscall_state {
+ FSS_DISABLED = 0,
+ FSS_ASYSC_ENABLED = (1 << 0),
+ FSS_SEP_ENABLED = (1 << 1)
+};
+
struct kpti_frame {
uint64_t kf_lower_redzone;
@@ -214,6 +220,8 @@ struct machcpu {
uint16_t mcpu_idle_type; /* CPU next idle type */
uint16_t max_cstates; /* supported max cstates */
+ enum fast_syscall_state mcpu_fast_syscall_state;
+
struct cpu_ucode_info *mcpu_ucode_info;
void *mcpu_pm_mach_state;
diff --git a/usr/src/uts/i86pc/sys/machparam.h b/usr/src/uts/i86pc/sys/machparam.h
index 51d7559483..f79b582df4 100644
--- a/usr/src/uts/i86pc/sys/machparam.h
+++ b/usr/src/uts/i86pc/sys/machparam.h
@@ -31,14 +31,15 @@
#ifndef _SYS_MACHPARAM_H
#define _SYS_MACHPARAM_H
-#if !defined(_ASM)
+#ifndef _ASM
+
#include <sys/types.h>
#if defined(__xpv)
#include <sys/xpv_impl.h>
#endif
-#endif
+#endif /* !_ASM */
#ifdef __cplusplus
extern "C" {
@@ -54,17 +55,12 @@ extern "C" {
* Machine dependent parameters and limits.
*/
-#if defined(__amd64)
/*
* If NCPU grows beyond 256, sizing for the x86 comm page will require
* adjustment.
*/
#define NCPU 256
#define NCPU_LOG2 8
-#elif defined(__i386)
-#define NCPU 32
-#define NCPU_LOG2 5
-#endif
/* NCPU_P2 is NCPU rounded to a power of 2 */
#define NCPU_P2 (1 << NCPU_LOG2)
@@ -116,11 +112,7 @@ extern "C" {
/*
* DEFAULT KERNEL THREAD stack size (in pages).
*/
-#if defined(__amd64)
#define DEFAULTSTKSZ_NPGS 5
-#elif defined(__i386)
-#define DEFAULTSTKSZ_NPGS 3
-#endif
#if !defined(_ASM)
#define DEFAULTSTKSZ (DEFAULTSTKSZ_NPGS * PAGESIZE)
@@ -129,43 +121,42 @@ extern "C" {
#endif /* !_ASM */
/*
- * KERNELBASE is the virtual address at which the kernel segments start in
- * all contexts.
- *
- * KERNELBASE is not fixed. The value of KERNELBASE can change with
- * installed memory or on 32 bit systems the eprom variable 'eprom_kernelbase'.
- *
- * common/conf/param.c requires a compile time defined value for KERNELBASE.
- * This value is save in the variable _kernelbase. _kernelbase may then be
- * modified with to a different value in i86pc/os/startup.c.
- *
- * Most code should be using kernelbase, which resolves to a reference to
- * _kernelbase.
+ * During initial boot we limit heap to the top 4Gig.
*/
-#define KERNEL_TEXT_amd64 UINT64_C(0xfffffffffb800000)
-
-#ifdef __i386
-
-#define KERNEL_TEXT_i386 ADDRESS_C(0xfe800000)
+#define BOOT_KERNELHEAP_BASE ADDRESS_C(0xffffffff00000000)
/*
- * We don't use HYPERVISOR_VIRT_START, as we need both the PAE and non-PAE
- * versions in our code. We always compile based on the lower PAE address.
+ * VMWare works best if we don't use the top 64Meg of memory for amd64.
+ * Set KERNEL_TEXT to top_o_memory - 64Meg - 8 Meg for 8Meg of nucleus pages.
*/
-#define KERNEL_TEXT_i386_xpv \
- (HYPERVISOR_VIRT_START_PAE - 3 * ADDRESS_C(0x400000))
-
-#endif /* __i386 */
+#define PROMSTART ADDRESS_C(0xffc00000)
-#if defined(__amd64)
+/*
+ * Virtual address range available to the debugger
+ */
+#define SEGDEBUGBASE ADDRESS_C(0xffffffffff800000)
+#define SEGDEBUGSIZE ADDRESS_C(0x400000)
-#define KERNELBASE ADDRESS_C(0xfffffd8000000000)
+#define KERNEL_TEXT UINT64_C(0xfffffffffb800000)
/*
- * Size of the unmapped "red zone" at the very bottom of the kernel's
- * address space. Corresponds to 1 slot in the toplevel pagetable.
+ * Reserve pages just below KERNEL_TEXT for the GDT, IDT, LDT, TSS and debug
+ * info.
+ *
+ * For now, DEBUG_INFO_VA must be first in this list for "xm" initiated dumps
+ * of solaris domUs to be usable with mdb. Relying on a fixed VA is not viable
+ * long term, but it's the best we've got for now.
*/
-#define KERNEL_REDZONE_SIZE ((uintptr_t)1 << 39)
+#if !defined(_ASM)
+#define DEBUG_INFO_VA (KERNEL_TEXT - MMU_PAGESIZE)
+#define GDT_VA (DEBUG_INFO_VA - MMU_PAGESIZE)
+#define IDT_VA (GDT_VA - MMU_PAGESIZE)
+#define LDT_VA (IDT_VA - (16 * MMU_PAGESIZE))
+#define KTSS_VA (LDT_VA - MMU_PAGESIZE)
+#define DFTSS_VA (KTSS_VA - MMU_PAGESIZE)
+#define MISC_VA_BASE (DFTSS_VA)
+#define MISC_VA_SIZE (KERNEL_TEXT - MISC_VA_BASE)
+#endif /* !_ASM */
/*
* Base of 'core' heap area, which is used for kernel and module text/data
@@ -174,52 +165,47 @@ extern "C" {
#define COREHEAP_BASE ADDRESS_C(0xffffffffc0000000)
/*
- * Beginning of the segkpm window. A lower value than this is used if
- * physical addresses exceed 1TB. See i86pc/os/startup.c
- */
-#define SEGKPM_BASE ADDRESS_C(0xfffffe0000000000)
-
-/*
* This is valloc_base, above seg_kpm, but below everything else.
* A lower value than this may be used if SEGKPM_BASE is adjusted.
* See i86pc/os/startup.c
*/
-#define VALLOC_BASE ADDRESS_C(0xffffff0000000000)
+#define VALLOC_BASE ADDRESS_C(0xfffffe0000000000)
+
+#define SEGZIOMINSIZE (400L * 1024 * 1024L) /* 400M */
+#define SEGVMMMINSIZE (4096L * 1024 * 1024L) /* 4G */
-/*
- * default and boundary sizes for segkp
- */
#define SEGKPDEFSIZE (2L * 1024L * 1024L * 1024L) /* 2G */
#define SEGKPMAXSIZE (8L * 1024L * 1024L * 1024L) /* 8G */
#define SEGKPMINSIZE (200L * 1024 * 1024L) /* 200M */
-/*
- * minimum size for segzio
- */
-#define SEGZIOMINSIZE (400L * 1024 * 1024L) /* 400M */
-
-/*
- * During intial boot we limit heap to the top 4Gig.
- */
-#define BOOT_KERNELHEAP_BASE ADDRESS_C(0xffffffff00000000)
+#define SEGKPM_BASE ADDRESS_C(0xfffffd0000000000)
/*
- * VMWare works best if we don't use the top 64Meg of memory for amd64.
- * Set KERNEL_TEXT to top_o_memory - 64Meg - 8 Meg for 8Meg of nucleus pages.
+ * KERNELBASE is the virtual address at which the kernel segments start in
+ * all contexts.
+ *
+ * KERNELBASE is not fixed. The value of KERNELBASE can change with
+ * installed memory size.
+ *
+ * common/conf/param.c requires a compile time defined value for KERNELBASE.
+ * This value is saved in the variable _kernelbase. _kernelbase may then be
+ * modified to a different value in i86pc/os/startup.c.
+ *
+ * Most code should be using kernelbase, which resolves to a reference to
+ * _kernelbase.
*/
-#define PROMSTART ADDRESS_C(0xffc00000)
-#define KERNEL_TEXT KERNEL_TEXT_amd64
+#define KERNELBASE ADDRESS_C(0xfffffc8000000000)
/*
- * Virtual address range available to the debugger
+ * Size of the unmapped "red zone" at the very bottom of the kernel's
+ * address space. Corresponds to 1 slot in the toplevel pagetable.
*/
-#define SEGDEBUGBASE ADDRESS_C(0xffffffffff800000)
-#define SEGDEBUGSIZE ADDRESS_C(0x400000)
+#define KERNEL_REDZONE_SIZE ((uintptr_t)1 << 39)
/*
* Define upper limit on user address space
*
- * In amd64, the upper limit on a 64-bit user address space is 1 large page
+ * The upper limit on a 64-bit user address space is 1 large page
* (2MB) below kernelbase. The upper limit for a 32-bit user address space
* is 1 small page (4KB) below the top of the 32-bit range. The 64-bit
* limit give dtrace the red zone it needs below kernelbase. The 32-bit
@@ -232,7 +218,7 @@ extern "C" {
#if defined(__xpv)
#define USERLIMIT ADDRESS_C(0x00007fffffe00000)
#else
-#define USERLIMIT ADDRESS_C(0xfffffd7fffe00000)
+#define USERLIMIT ADDRESS_C(0xfffffc7fffe00000)
#endif
#ifdef bug_5074717_is_fixed
@@ -241,76 +227,6 @@ extern "C" {
#define USERLIMIT32 ADDRESS_C(0xfefff000)
#endif
-#elif defined(__i386)
-
-#ifdef DEBUG
-#define KERNELBASE ADDRESS_C(0xc8000000)
-#else
-#define KERNELBASE ADDRESS_C(0xd4000000)
-#endif
-
-#define KERNELBASE_MAX ADDRESS_C(0xe0000000)
-
-/*
- * The i386 ABI requires that the user address space be at least 3Gb
- * in size. KERNELBASE_ABI_MIN is used as the default KERNELBASE for
- * physical memory configurations > 4gb.
- */
-#define KERNELBASE_ABI_MIN ADDRESS_C(0xc0000000)
-
-/*
- * Size of the unmapped "red zone" at the very bottom of the kernel's
- * address space. Since segmap start immediately above the red zone, this
- * needs to be MAXBSIZE aligned.
- */
-#define KERNEL_REDZONE_SIZE MAXBSIZE
-
-/*
- * This is the last 4MB of the 4G address space. Some psm modules
- * need this region of virtual address space mapped 1-1
- * The top 64MB of the address space is reserved for the hypervisor.
- */
-#define PROMSTART ADDRESS_C(0xffc00000)
-#ifdef __xpv
-#define KERNEL_TEXT KERNEL_TEXT_i386_xpv
-#else
-#define KERNEL_TEXT KERNEL_TEXT_i386
-#endif
-
-/*
- * Virtual address range available to the debugger
- * We place it just above the kernel text (4M) and kernel data (4M).
- */
-#define SEGDEBUGBASE (KERNEL_TEXT + ADDRESS_C(0x800000))
-#define SEGDEBUGSIZE ADDRESS_C(0x400000)
-
-/*
- * Define upper limit on user address space
- */
-#define USERLIMIT KERNELBASE
-#define USERLIMIT32 USERLIMIT
-
-#endif /* __i386 */
-
-/*
- * Reserve pages just below KERNEL_TEXT for the GDT, IDT, LDT, TSS and debug
- * info.
- *
- * For now, DEBUG_INFO_VA must be first in this list for "xm" initiated dumps
- * of solaris domUs to be usable with mdb. Relying on a fixed VA is not viable
- * long term, but it's the best we've got for now.
- */
-#if !defined(_ASM)
-#define DEBUG_INFO_VA (KERNEL_TEXT - MMU_PAGESIZE)
-#define GDT_VA (DEBUG_INFO_VA - MMU_PAGESIZE)
-#define IDT_VA (GDT_VA - MMU_PAGESIZE)
-#define LDT_VA (IDT_VA - (16 * MMU_PAGESIZE))
-#define KTSS_VA (LDT_VA - MMU_PAGESIZE)
-#define DFTSS_VA (KTSS_VA - MMU_PAGESIZE)
-#define MISC_VA_BASE (DFTSS_VA)
-#define MISC_VA_SIZE (KERNEL_TEXT - MISC_VA_BASE)
-#endif /* !_ASM */
-
#if !defined(_ASM) && !defined(_KMDB)
extern uintptr_t kernelbase, segmap_start, segmapsize;
#endif
diff --git a/usr/src/uts/i86pc/sys/machsystm.h b/usr/src/uts/i86pc/sys/machsystm.h
index 7409c5af4a..5f286ca4c6 100644
--- a/usr/src/uts/i86pc/sys/machsystm.h
+++ b/usr/src/uts/i86pc/sys/machsystm.h
@@ -25,6 +25,7 @@
/*
* Copyright (c) 2010, Intel Corporation.
* All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _SYS_MACHSYSTM_H
@@ -231,6 +232,7 @@ extern page_t *page_get_high_mfn(mfn_t);
#endif
extern hrtime_t tsc_gethrtime_tick_delta(void);
+extern hrtime_t tsc_gethrtime_params(uint64_t *, uint32_t *, uint8_t *);
#endif /* _KERNEL */
diff --git a/usr/src/uts/i86pc/sys/ppt_dev.h b/usr/src/uts/i86pc/sys/ppt_dev.h
new file mode 100644
index 0000000000..e25f941f14
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/ppt_dev.h
@@ -0,0 +1,56 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc
+ */
+
+#ifndef _PPT_DEV_H
+#define _PPT_DEV_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define PPT_IOC (('P' << 16)|('T' << 8)) /* command base: 'P' in bits 16-23, 'T' in bits 8-15 */
+
+#define PPT_CFG_READ (PPT_IOC | 0x01) /* command number goes in the low byte */
+#define PPT_CFG_WRITE (PPT_IOC | 0x02)
+#define PPT_BAR_QUERY (PPT_IOC | 0x03)
+#define PPT_BAR_READ (PPT_IOC | 0x04)
+#define PPT_BAR_WRITE (PPT_IOC | 0x05)
+
+#define PPT_MAXNAMELEN 32
+
+struct ppt_cfg_io { /* presumably the PPT_CFG_READ/PPT_CFG_WRITE argument -- confirm */
+ uint64_t pci_off;
+ uint32_t pci_width;
+ uint32_t pci_data;
+};
+struct ppt_bar_io { /* presumably the PPT_BAR_READ/PPT_BAR_WRITE argument -- confirm */
+ uint32_t pbi_bar;
+ uint32_t pbi_off;
+ uint32_t pbi_width;
+ uint32_t pbi_data;
+};
+
+struct ppt_bar_query { /* presumably the PPT_BAR_QUERY argument -- confirm */
+ uint32_t pbq_baridx;
+ uint32_t pbq_type;
+ uint64_t pbq_base;
+ uint64_t pbq_size;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _PPT_DEV_H */
diff --git a/usr/src/uts/i86pc/sys/viona_io.h b/usr/src/uts/i86pc/sys/viona_io.h
new file mode 100644
index 0000000000..46cc72eb06
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/viona_io.h
@@ -0,0 +1,63 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#ifndef _VIONA_IO_H_
+#define _VIONA_IO_H_
+
+#define VNA_IOC (('V' << 16)|('C' << 8)) /* command base: 'V' in bits 16-23, 'C' in bits 8-15 */
+#define VNA_IOC_CREATE (VNA_IOC | 0x01)
+#define VNA_IOC_DELETE (VNA_IOC | 0x02)
+
+#define VNA_IOC_RING_INIT (VNA_IOC | 0x10) /* 0x1x: VNA_IOC_RING_* commands */
+#define VNA_IOC_RING_RESET (VNA_IOC | 0x11)
+#define VNA_IOC_RING_KICK (VNA_IOC | 0x12)
+#define VNA_IOC_RING_SET_MSI (VNA_IOC | 0x13)
+#define VNA_IOC_RING_INTR_CLR (VNA_IOC | 0x14)
+
+#define VNA_IOC_INTR_POLL (VNA_IOC | 0x20) /* 0x2x: interrupt poll, features, notify */
+#define VNA_IOC_SET_FEATURES (VNA_IOC | 0x21)
+#define VNA_IOC_GET_FEATURES (VNA_IOC | 0x22)
+#define VNA_IOC_SET_NOTIFY_IOP (VNA_IOC | 0x23)
+
+typedef struct vioc_create { /* presumably the VNA_IOC_CREATE argument -- confirm */
+ datalink_id_t c_linkid;
+ int c_vmfd;
+} vioc_create_t;
+
+typedef struct vioc_ring_init { /* presumably the VNA_IOC_RING_INIT argument -- confirm */
+ uint16_t ri_index;
+ uint16_t ri_qsize;
+ uint64_t ri_qaddr; /* NOTE(review): 8-byte alignment implies hidden padding before this on LP64 -- confirm no ILP32 callers */
+} vioc_ring_init_t;
+
+typedef struct vioc_ring_msi { /* presumably the VNA_IOC_RING_SET_MSI argument -- confirm */
+ uint16_t rm_index;
+ uint64_t rm_addr; /* NOTE(review): 8-byte alignment implies hidden padding before this on LP64 -- confirm no ILP32 callers */
+ uint64_t rm_msg;
+} vioc_ring_msi_t;
+
+enum viona_vq_id { /* virtqueue indices */
+ VIONA_VQ_RX = 0,
+ VIONA_VQ_TX = 1,
+ VIONA_VQ_MAX = 2 /* queue count; dimensions vioc_intr_poll.vip_status[] */
+};
+
+typedef struct vioc_intr_poll { /* presumably the VNA_IOC_INTR_POLL argument -- confirm */
+ uint32_t vip_status[VIONA_VQ_MAX]; /* one entry per enum viona_vq_id value */
+} vioc_intr_poll_t;
+
+
+#endif /* _VIONA_IO_H_ */
diff --git a/usr/src/uts/i86pc/sys/vm_machparam.h b/usr/src/uts/i86pc/sys/vm_machparam.h
index 90a5245217..fde81e59ed 100644
--- a/usr/src/uts/i86pc/sys/vm_machparam.h
+++ b/usr/src/uts/i86pc/sys/vm_machparam.h
@@ -23,6 +23,7 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc.
*/
#ifndef _SYS_VM_MACHPARAM_H
@@ -133,7 +134,8 @@ extern "C" {
/*
* The maximum value for handspreadpages which is the the distance
- * between the two clock hands in pages.
+ * between the two clock hands in pages. This is only used when the page
+ * scanner is first started.
*/
#define MAXHANDSPREADPAGES ((64 * 1024 * 1024) / PAGESIZE)
diff --git a/usr/src/uts/i86pc/sys/vmm.h b/usr/src/uts/i86pc/sys/vmm.h
new file mode 100644
index 0000000000..ac8f14b042
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/vmm.h
@@ -0,0 +1,748 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#ifndef _VMM_H_
+#define _VMM_H_
+
+#include <sys/sdt.h>
+#include <x86/segments.h>
+
+#ifdef _KERNEL
+SDT_PROVIDER_DECLARE(vmm);
+#endif
+
+enum vm_suspend_how { /* the 'how' argument of vm_suspend() */
+ VM_SUSPEND_NONE,
+ VM_SUSPEND_RESET,
+ VM_SUSPEND_POWEROFF,
+ VM_SUSPEND_HALT,
+ VM_SUSPEND_TRIPLEFAULT,
+ VM_SUSPEND_LAST /* one past the last real value */
+};
+
+/*
+ * Identifiers for architecturally defined registers.
+ */
+enum vm_reg_name { /* values start at 0 and are assigned in declaration order */
+ VM_REG_GUEST_RAX,
+ VM_REG_GUEST_RBX,
+ VM_REG_GUEST_RCX,
+ VM_REG_GUEST_RDX,
+ VM_REG_GUEST_RSI,
+ VM_REG_GUEST_RDI,
+ VM_REG_GUEST_RBP,
+ VM_REG_GUEST_R8,
+ VM_REG_GUEST_R9,
+ VM_REG_GUEST_R10,
+ VM_REG_GUEST_R11,
+ VM_REG_GUEST_R12,
+ VM_REG_GUEST_R13,
+ VM_REG_GUEST_R14,
+ VM_REG_GUEST_R15,
+ VM_REG_GUEST_CR0,
+ VM_REG_GUEST_CR3,
+ VM_REG_GUEST_CR4,
+ VM_REG_GUEST_DR7,
+ VM_REG_GUEST_RSP,
+ VM_REG_GUEST_RIP,
+ VM_REG_GUEST_RFLAGS,
+ VM_REG_GUEST_ES,
+ VM_REG_GUEST_CS,
+ VM_REG_GUEST_SS,
+ VM_REG_GUEST_DS,
+ VM_REG_GUEST_FS,
+ VM_REG_GUEST_GS,
+ VM_REG_GUEST_LDTR,
+ VM_REG_GUEST_TR,
+ VM_REG_GUEST_IDTR,
+ VM_REG_GUEST_GDTR,
+ VM_REG_GUEST_EFER,
+ VM_REG_GUEST_CR2,
+ VM_REG_GUEST_PDPTE0,
+ VM_REG_GUEST_PDPTE1,
+ VM_REG_GUEST_PDPTE2,
+ VM_REG_GUEST_PDPTE3,
+ VM_REG_GUEST_INTR_SHADOW,
+ VM_REG_GUEST_DR0,
+ VM_REG_GUEST_DR1,
+ VM_REG_GUEST_DR2,
+ VM_REG_GUEST_DR3,
+ VM_REG_GUEST_DR6,
+ VM_REG_LAST /* equals the number of identifiers above */
+};
+
+enum x2apic_state { /* exchanged via vm_get_x2apic_state()/vm_set_x2apic_state() */
+ X2APIC_DISABLED,
+ X2APIC_ENABLED,
+ X2APIC_STATE_LAST /* one past the last real value */
+};
+
+#define VM_INTINFO_VECTOR(info) ((info) & 0xff) /* bits 0-7 */
+#define VM_INTINFO_DEL_ERRCODE 0x800 /* bit 11 */
+#define VM_INTINFO_RSVD 0x7ffff000 /* bits 12-30 */
+#define VM_INTINFO_VALID 0x80000000 /* bit 31 */
+#define VM_INTINFO_TYPE 0x700 /* bits 8-10: mask for the type values below */
+#define VM_INTINFO_HWINTR (0 << 8)
+#define VM_INTINFO_NMI (2 << 8)
+#define VM_INTINFO_HWEXCEPTION (3 << 8)
+#define VM_INTINFO_SWINTR (4 << 8)
+
+
+#define VM_MAX_NAMELEN 32
+
+#ifdef _KERNEL
+
+struct vm;
+struct vm_exception;
+struct seg_desc;
+struct vm_exit;
+struct vm_run;
+struct vhpet;
+struct vioapic;
+struct vlapic;
+struct vmspace;
+struct vm_object;
+struct vm_guest_paging;
+struct pmap;
+
+struct vm_eventinfo { /* read by vcpu_runblocked/vcpu_suspended/vcpu_reqidle */
+ u_int *rptr; /* runblock cookie */
+ int *sptr; /* suspend cookie */
+ int *iptr; /* reqidle cookie */
+};
+
+typedef int (*vmm_init_func_t)(int ipinum);
+typedef int (*vmm_cleanup_func_t)(void);
+typedef void (*vmm_resume_func_t)(void);
+typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap);
+typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip,
+ struct pmap *pmap, struct vm_eventinfo *info);
+typedef void (*vmi_cleanup_func_t)(void *vmi);
+typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num,
+ uint64_t *retval);
+typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num,
+ uint64_t val);
+typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num,
+ struct seg_desc *desc);
+typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num,
+ struct seg_desc *desc);
+typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
+typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
+typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max);
+typedef void (*vmi_vmspace_free)(struct vmspace *vmspace);
+typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu);
+typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic);
+#ifndef __FreeBSD__
+typedef void (*vmi_savectx)(void *vmi, int vcpu);
+typedef void (*vmi_restorectx)(void *vmi, int vcpu);
+#endif
+
+struct vmm_ops { /* function-pointer table; see vmm_ops_intel and vmm_ops_amd */
+ vmm_init_func_t init; /* module wide initialization */
+ vmm_cleanup_func_t cleanup;
+ vmm_resume_func_t resume;
+
+ vmi_init_func_t vminit; /* vm-specific initialization */
+ vmi_run_func_t vmrun; /* args: vmi, vcpu, rip, pmap, vm_eventinfo */
+ vmi_cleanup_func_t vmcleanup; /* takes the same void *vmi the other vmi_* hooks take */
+ vmi_get_register_t vmgetreg;
+ vmi_set_register_t vmsetreg;
+ vmi_get_desc_t vmgetdesc;
+ vmi_set_desc_t vmsetdesc;
+ vmi_get_cap_t vmgetcap;
+ vmi_set_cap_t vmsetcap;
+ vmi_vmspace_alloc vmspace_alloc;
+ vmi_vmspace_free vmspace_free;
+ vmi_vlapic_init vlapic_init;
+ vmi_vlapic_cleanup vlapic_cleanup;
+
+#ifndef __FreeBSD__
+ vmi_savectx vmsavectx; /* these two hooks exist only in non-FreeBSD builds */
+ vmi_restorectx vmrestorectx;
+#endif
+};
+
+extern struct vmm_ops vmm_ops_intel;
+extern struct vmm_ops vmm_ops_amd;
+
+int vm_create(const char *name, struct vm **retvm);
+void vm_destroy(struct vm *vm);
+int vm_reinit(struct vm *vm);
+const char *vm_name(struct vm *vm);
+uint16_t vm_get_maxcpus(struct vm *vm);
+void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
+ uint16_t *threads, uint16_t *maxcpus);
+int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
+ uint16_t threads, uint16_t maxcpus);
+
+/*
+ * APIs that modify the guest memory map require all vcpus to be frozen.
+ */
+int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off,
+ size_t len, int prot, int flags);
+int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem);
+void vm_free_memseg(struct vm *vm, int ident);
+int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
+int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
+#ifdef __FreeBSD__
+int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func);
+int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);
+#else
+int vm_assign_pptdev(struct vm *vm, int pptfd);
+int vm_unassign_pptdev(struct vm *vm, int pptfd);
+#endif /* __FreeBSD__ */
+
+/*
+ * APIs that inspect the guest memory map require only a *single* vcpu to
+ * be frozen. This acts like a read lock on the guest memory map since any
+ * modification requires *all* vcpus to be frozen.
+ */
+int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
+ vm_ooffset_t *segoff, size_t *len, int *prot, int *flags);
+int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
+ struct vm_object **objptr);
+vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm);
+void *vm_gpa_hold(struct vm *, int vcpuid, vm_paddr_t gpa, size_t len,
+ int prot, void **cookie);
+void vm_gpa_release(void *cookie);
+bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa);
+
+int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
+int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
+int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *ret_desc);
+int vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *desc);
+int vm_run(struct vm *vm, struct vm_run *vmrun);
+int vm_suspend(struct vm *vm, enum vm_suspend_how how);
+int vm_inject_nmi(struct vm *vm, int vcpu);
+int vm_nmi_pending(struct vm *vm, int vcpuid);
+void vm_nmi_clear(struct vm *vm, int vcpuid);
+int vm_inject_extint(struct vm *vm, int vcpu);
+int vm_extint_pending(struct vm *vm, int vcpuid);
+void vm_extint_clear(struct vm *vm, int vcpuid);
+struct vlapic *vm_lapic(struct vm *vm, int cpu);
+struct vioapic *vm_ioapic(struct vm *vm);
+struct vhpet *vm_hpet(struct vm *vm);
+int vm_get_capability(struct vm *vm, int vcpu, int type, int *val);
+int vm_set_capability(struct vm *vm, int vcpu, int type, int val);
+int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state);
+int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state);
+int vm_apicid2vcpuid(struct vm *vm, int apicid);
+int vm_activate_cpu(struct vm *vm, int vcpu);
+int vm_suspend_cpu(struct vm *vm, int vcpu);
+int vm_resume_cpu(struct vm *vm, int vcpu);
+struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid);
+void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip);
+void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip);
+void vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip);
+void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip);
+void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip);
+
+#ifdef _SYS__CPUSET_H_
+cpuset_t vm_active_cpus(struct vm *vm);
+cpuset_t vm_debug_cpus(struct vm *vm);
+cpuset_t vm_suspended_cpus(struct vm *vm);
+#endif /* _SYS__CPUSET_H_ */
+
+static __inline int
+vcpu_runblocked(struct vm_eventinfo *info)
+{
+	/* Report 1 when the runblock cookie holds a non-zero value, else 0. */
+	const u_int *runblock = info->rptr;
+
+	return ((*runblock != 0) ? 1 : 0);
+}
+
+static __inline int
+vcpu_suspended(struct vm_eventinfo *info)
+{
+	/* Hand back the current value stored behind the suspend cookie. */
+	const int *suspend = info->sptr;
+
+	return (*suspend);
+}
+
+static __inline int
+vcpu_reqidle(struct vm_eventinfo *info)
+{
+	/* Hand back the current value stored behind the reqidle cookie. */
+	const int *reqidle = info->iptr;
+
+	return (*reqidle);
+}
+
+int vcpu_debugged(struct vm *vm, int vcpuid);
+
+/*
+ * Return 1 if device indicated by bus/slot/func is supposed to be a
+ * pci passthrough device.
+ *
+ * Return 0 otherwise.
+ */
+int vmm_is_pptdev(int bus, int slot, int func);
+
+void *vm_iommu_domain(struct vm *vm);
+
+enum vcpu_state {
+ VCPU_IDLE,
+ VCPU_FROZEN,
+ VCPU_RUNNING,
+ VCPU_SLEEPING,
+};
+
+int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state,
+ bool from_idle);
+enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu);
+void vcpu_block_run(struct vm *, int);
+void vcpu_unblock_run(struct vm *, int);
+
+#ifndef __FreeBSD__
+uint64_t vcpu_tsc_offset(struct vm *vm, int vcpuid);
+#endif /* !__FreeBSD__ */
+
+static __inline int
+vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)
+{
+ return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING);
+}
+
+#ifdef _SYS_THREAD_H
+static __inline int
+vcpu_should_yield(struct vm *vm, int vcpu)
+{
+	/*
+	 * Answer 1 when curthread has t_astflag set or, failing that, when
+	 * CPU has cpu_runrun set; 'vm' and 'vcpu' are not consulted.
+	 */
+	if (curthread->t_astflag || CPU->cpu_runrun)
+		return (1);
+	return (0);
+}
+#endif /* _SYS_THREAD_H */
+
+void *vcpu_stats(struct vm *vm, int vcpu);
+void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr);
+struct vmspace *vm_get_vmspace(struct vm *vm);
+struct vatpic *vm_atpic(struct vm *vm);
+struct vatpit *vm_atpit(struct vm *vm);
+struct vpmtmr *vm_pmtmr(struct vm *vm);
+struct vrtc *vm_rtc(struct vm *vm);
+
+/*
+ * Inject exception 'vector' into the guest vcpu. This function returns 0 on
+ * success and non-zero on failure.
+ *
+ * Wrapper functions like 'vm_inject_gp()' should be preferred to calling
+ * this function directly because they enforce the trap-like or fault-like
+ * behavior of an exception.
+ *
+ * This function should only be called in the context of the thread that is
+ * executing this vcpu.
+ */
+int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int err_valid,
+ uint32_t errcode, int restart_instruction);
+
+/*
+ * This function is called after a VM-exit that occurred during exception or
+ * interrupt delivery through the IDT. The format of 'intinfo' is described
+ * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2.
+ *
+ * If a VM-exit handler completes the event delivery successfully then it
+ * should call vm_exit_intinfo() to extinguish the pending event. For example,
+ * if the task switch emulation is triggered via a task gate then it should
+ * call this function with 'intinfo=0' to indicate that the external event
+ * is not pending anymore.
+ *
+ * Return value is 0 on success and non-zero on failure.
+ */
+int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo);
+
+/*
+ * This function is called before every VM-entry to retrieve a pending
+ * event that should be injected into the guest. This function combines
+ * nested events into a double or triple fault.
+ *
+ * Returns 0 if there are no events that need to be injected into the guest
+ * and non-zero otherwise.
+ */
+int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info);
+
+int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2);
+
+enum vm_reg_name vm_segment_name(int seg_encoding);
+
+struct vm_copyinfo {
+ uint64_t gpa;
+ size_t len;
+ void *hva;
+ void *cookie;
+};
+
+/*
+ * Set up 'copyinfo[]' to copy to/from guest linear address space starting
+ * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for
+ * a copyin or PROT_WRITE for a copyout.
+ *
+ * retval is_fault Interpretation
+ * 0 0 Success
+ * 0 1 An exception was injected into the guest
+ * EFAULT N/A Unrecoverable error
+ *
+ * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if
+ * the return value is 0. The 'copyinfo[]' resources should be freed by calling
+ * 'vm_copy_teardown()' after the copy is done.
+ */
+int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
+ uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
+ int num_copyinfo, int *is_fault);
+void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
+ int num_copyinfo);
+void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
+ void *kaddr, size_t len);
+void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
+ struct vm_copyinfo *copyinfo, size_t len);
+
+int vcpu_trace_exceptions(struct vm *vm, int vcpuid);
+#endif /* KERNEL */
+
+#define VM_MAXCPU 32 /* maximum virtual cpus */
+
+/*
+ * Identifiers for optional vmm capabilities
+ */
+enum vm_cap_type {
+ VM_CAP_HALT_EXIT,
+ VM_CAP_MTRAP_EXIT,
+ VM_CAP_PAUSE_EXIT,
+ VM_CAP_UNRESTRICTED_GUEST,
+ VM_CAP_ENABLE_INVPCID,
+ VM_CAP_MAX
+};
+
+enum vm_intr_trigger {
+ EDGE_TRIGGER,
+ LEVEL_TRIGGER
+};
+
+/*
+ * The 'access' field has the format specified in Table 21-2 of the Intel
+ * Architecture Manual vol 3b.
+ *
+ * XXX The contents of the 'access' field are architecturally defined except
+ * bit 16 - Segment Unusable.
+ */
+struct seg_desc {
+ uint64_t base;
+ uint32_t limit;
+ uint32_t access;
+};
+#define SEG_DESC_TYPE(access) ((access) & 0x001f) /* bits 4:0 */
+#define SEG_DESC_DPL(access) (((access) >> 5) & 0x3) /* bits 6:5 */
+#define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0) /* bit 7 */
+#define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0) /* bit 14 */
+#define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0) /* bit 15 */
+#define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0) /* bit 16 */
+
+enum vm_cpu_mode {
+ CPU_MODE_REAL,
+ CPU_MODE_PROTECTED,
+ CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */
+ CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */
+};
+
+enum vm_paging_mode {
+ PAGING_MODE_FLAT,
+ PAGING_MODE_32,
+ PAGING_MODE_PAE,
+ PAGING_MODE_64,
+};
+
+struct vm_guest_paging {
+ uint64_t cr3;
+ int cpl;
+ enum vm_cpu_mode cpu_mode;
+ enum vm_paging_mode paging_mode;
+};
+
+/*
+ * The data structures 'vie' and 'vie_op' are meant to be opaque to the
+ * consumers of instruction decoding. The only reason why their contents
+ * need to be exposed is because they are part of the 'vm_exit' structure.
+ */
+struct vie_op {
+ uint8_t op_byte; /* actual opcode byte */
+ uint8_t op_type; /* type of operation (e.g. MOV) */
+ uint16_t op_flags;
+};
+
+#define VIE_INST_SIZE 15
+struct vie {
+ uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */
+ uint8_t num_valid; /* size of the instruction */
+ uint8_t num_processed;
+
+ uint8_t addrsize:4, opsize:4; /* address and operand sizes */
+ uint8_t rex_w:1, /* REX prefix */
+ rex_r:1,
+ rex_x:1,
+ rex_b:1,
+ rex_present:1,
+ repz_present:1, /* REP/REPE/REPZ prefix */
+ repnz_present:1, /* REPNE/REPNZ prefix */
+ opsize_override:1, /* Operand size override */
+ addrsize_override:1, /* Address size override */
+ segment_override:1; /* Segment override */
+
+ uint8_t mod:2, /* ModRM byte */
+ reg:4,
+ rm:4;
+
+ uint8_t ss:2, /* SIB byte */
+ index:4,
+ base:4;
+
+ uint8_t disp_bytes;
+ uint8_t imm_bytes;
+
+ uint8_t scale;
+ int base_register; /* VM_REG_GUEST_xyz */
+ int index_register; /* VM_REG_GUEST_xyz */
+ int segment_register; /* VM_REG_GUEST_xyz */
+
+ int64_t displacement; /* optional addr displacement */
+ int64_t immediate; /* optional immediate operand */
+
+ uint8_t decoded; /* set to 1 if successfully decoded */
+
+ struct vie_op op; /* opcode description */
+};
+
+enum vm_exitcode {
+ VM_EXITCODE_INOUT,
+ VM_EXITCODE_VMX,
+ VM_EXITCODE_BOGUS,
+ VM_EXITCODE_RDMSR,
+ VM_EXITCODE_WRMSR,
+ VM_EXITCODE_HLT,
+ VM_EXITCODE_MTRAP,
+ VM_EXITCODE_PAUSE,
+ VM_EXITCODE_PAGING,
+ VM_EXITCODE_INST_EMUL,
+ VM_EXITCODE_SPINUP_AP,
+ VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */
+ VM_EXITCODE_RUNBLOCK,
+ VM_EXITCODE_IOAPIC_EOI,
+ VM_EXITCODE_SUSPENDED,
+ VM_EXITCODE_INOUT_STR,
+ VM_EXITCODE_TASK_SWITCH,
+ VM_EXITCODE_MONITOR,
+ VM_EXITCODE_MWAIT,
+ VM_EXITCODE_SVM,
+ VM_EXITCODE_REQIDLE,
+ VM_EXITCODE_DEBUG,
+ VM_EXITCODE_VMINSN,
+#ifndef __FreeBSD__
+ VM_EXITCODE_HT,
+#endif /* !__FreeBSD__ */
+ VM_EXITCODE_MAX
+};
+
+struct vm_inout {
+ uint16_t bytes:3; /* 1 or 2 or 4 */
+ uint16_t in:1;
+ uint16_t string:1;
+ uint16_t rep:1;
+ uint16_t port;
+ uint32_t eax; /* valid for out */
+};
+
+struct vm_inout_str {
+ struct vm_inout inout; /* must be the first element */
+ struct vm_guest_paging paging;
+ uint64_t rflags;
+ uint64_t cr0;
+ uint64_t index;
+ uint64_t count; /* rep=1 (%rcx), rep=0 (1) */
+ int addrsize;
+ enum vm_reg_name seg_name;
+ struct seg_desc seg_desc;
+};
+
+enum task_switch_reason {
+ TSR_CALL,
+ TSR_IRET,
+ TSR_JMP,
+ TSR_IDT_GATE, /* task gate in IDT */
+};
+
+struct vm_task_switch {
+ uint16_t tsssel; /* new TSS selector */
+ int ext; /* task switch due to external event */
+ uint32_t errcode;
+ int errcode_valid; /* push 'errcode' on the new stack */
+ enum task_switch_reason reason;
+ struct vm_guest_paging paging;
+};
+
+struct vm_exit {
+ enum vm_exitcode exitcode;
+ int inst_length; /* 0 means unknown */
+ uint64_t rip;
+ union {
+ struct vm_inout inout;
+ struct vm_inout_str inout_str;
+ struct {
+ uint64_t gpa;
+ int fault_type;
+ } paging;
+ struct {
+ uint64_t gpa;
+ uint64_t gla;
+ uint64_t cs_base;
+ int cs_d; /* CS.D */
+ struct vm_guest_paging paging;
+ struct vie vie;
+ } inst_emul;
+ /*
+ * VMX specific payload. Used when there is no "better"
+ * exitcode to represent the VM-exit.
+ */
+ struct {
+ int status; /* vmx inst status */
+ /*
+ * 'exit_reason' and 'exit_qualification' are valid
+ * only if 'status' is zero.
+ */
+ uint32_t exit_reason;
+ uint64_t exit_qualification;
+ /*
+ * 'inst_error' and 'inst_type' are valid
+ * only if 'status' is non-zero.
+ */
+ int inst_type;
+ int inst_error;
+ } vmx;
+ /*
+ * SVM specific payload.
+ */
+ struct {
+ uint64_t exitcode;
+ uint64_t exitinfo1;
+ uint64_t exitinfo2;
+ } svm;
+ struct {
+ uint32_t code; /* ecx value */
+ uint64_t wval;
+ } msr;
+ struct {
+ int vcpu;
+ uint64_t rip;
+ } spinup_ap;
+ struct {
+ uint64_t rflags;
+ uint64_t intr_status;
+ } hlt;
+ struct {
+ int vector;
+ } ioapic_eoi;
+ struct {
+ enum vm_suspend_how how;
+ } suspended;
+ struct vm_task_switch task_switch;
+ } u;
+};
+
+/* APIs to inject faults into the guest */
+void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid,
+ int errcode);
+
+static __inline void
+vm_inject_ud(void *vm, int vcpuid)
+{
+ vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0); /* errcode_valid = 0 */
+}
+
+static __inline void
+vm_inject_gp(void *vm, int vcpuid)
+{
+ vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0); /* errcode fixed at 0 */
+}
+
+static __inline void
+vm_inject_ac(void *vm, int vcpuid, int errcode)
+{
+ vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode); /* caller's errcode */
+}
+
+static __inline void
+vm_inject_ss(void *vm, int vcpuid, int errcode)
+{
+ vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode); /* caller's errcode */
+}
+
+void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2);
+
+int vm_restart_instruction(void *vm, int vcpuid);
+
+#ifndef __FreeBSD__
+#ifdef _KERNEL
+
+void vmm_sol_glue_init(void);
+void vmm_sol_glue_cleanup(void);
+
+int vmm_mod_load(void);
+int vmm_mod_unload(void);
+
+void vmm_call_trap(uint64_t);
+
+/*
+ * Because of tangled headers, these are mirrored by vmm_drv.h to present the
+ * interface to driver consumers.
+ */
+typedef int (*vmm_rmem_cb_t)(void *, uintptr_t, uint_t, uint64_t *);
+typedef int (*vmm_wmem_cb_t)(void *, uintptr_t, uint_t, uint64_t);
+
+int vm_ioport_hook(struct vm *, uint_t, vmm_rmem_cb_t, vmm_wmem_cb_t, void *,
+ void **);
+void vm_ioport_unhook(struct vm *, void **);
+int vm_ioport_handle_hook(struct vm *, int, bool, int, int, uint32_t *);
+
+#endif /* _KERNEL */
+#endif /* !__FreeBSD__ */
+
+#endif /* _VMM_H_ */
diff --git a/usr/src/uts/i86pc/sys/vmm_dev.h b/usr/src/uts/i86pc/sys/vmm_dev.h
new file mode 100644
index 0000000000..dd87dcb0a6
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/vmm_dev.h
@@ -0,0 +1,520 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#ifndef _VMM_DEV_H_
+#define _VMM_DEV_H_
+
+#include <machine/vmm.h>
+
+struct vm_memmap {
+ vm_paddr_t gpa;
+ int segid; /* memory segment */
+ vm_ooffset_t segoff; /* offset into memory segment */
+ size_t len; /* mmap length */
+ int prot; /* RWX */
+ int flags;
+};
+#define VM_MEMMAP_F_WIRED 0x01
+#define VM_MEMMAP_F_IOMMU 0x02
+
+#define VM_MEMSEG_NAME(m) ((m)->name[0] != '\0' ? (m)->name : NULL)
+struct vm_memseg {
+ int segid;
+ size_t len;
+ char name[SPECNAMELEN + 1];
+};
+
+struct vm_register {
+ int cpuid;
+ int regnum; /* enum vm_reg_name */
+ uint64_t regval;
+};
+
+struct vm_seg_desc { /* data or code segment */
+ int cpuid;
+ int regnum; /* enum vm_reg_name */
+ struct seg_desc desc;
+};
+
+struct vm_register_set {
+ int cpuid;
+ unsigned int count;
+ const int *regnums; /* enum vm_reg_name */
+ uint64_t *regvals;
+};
+
+struct vm_run {
+ int cpuid;
+ struct vm_exit vm_exit;
+};
+
+struct vm_exception {
+ int cpuid;
+ int vector;
+ uint32_t error_code;
+ int error_code_valid;
+ int restart_instruction;
+};
+
+struct vm_lapic_msi {
+ uint64_t msg;
+ uint64_t addr;
+};
+
+struct vm_lapic_irq {
+ int cpuid;
+ int vector;
+};
+
+struct vm_ioapic_irq {
+ int irq;
+};
+
+struct vm_isa_irq {
+ int atpic_irq;
+ int ioapic_irq;
+};
+
+struct vm_isa_irq_trigger {
+ int atpic_irq;
+ enum vm_intr_trigger trigger;
+};
+
+struct vm_capability {
+ int cpuid;
+ enum vm_cap_type captype;
+ int capval;
+ int allcpus;
+};
+
+#ifdef __FreeBSD__
+struct vm_pptdev {
+ int bus;
+ int slot;
+ int func;
+};
+
+struct vm_pptdev_mmio {
+ int bus;
+ int slot;
+ int func;
+ vm_paddr_t gpa;
+ vm_paddr_t hpa;
+ size_t len;
+};
+
+struct vm_pptdev_msi {
+ int vcpu;
+ int bus;
+ int slot;
+ int func;
+ int numvec; /* 0 means disabled */
+ uint64_t msg;
+ uint64_t addr;
+};
+
+struct vm_pptdev_msix {
+ int vcpu;
+ int bus;
+ int slot;
+ int func;
+ int idx;
+ uint64_t msg;
+ uint32_t vector_control;
+ uint64_t addr;
+};
+
+struct vm_pptdev_limits {
+ int bus;
+ int slot;
+ int func;
+ int msi_limit;
+ int msix_limit;
+};
+#else /* __FreeBSD__ */
+struct vm_pptdev {
+ int pptfd;
+};
+
+struct vm_pptdev_mmio {
+ int pptfd;
+ vm_paddr_t gpa;
+ vm_paddr_t hpa;
+ size_t len;
+};
+
+struct vm_pptdev_msi {
+ int vcpu;
+ int pptfd;
+ int numvec; /* 0 means disabled */
+ uint64_t msg;
+ uint64_t addr;
+};
+
+struct vm_pptdev_msix {
+ int vcpu;
+ int pptfd;
+ int idx;
+ uint64_t msg;
+ uint32_t vector_control;
+ uint64_t addr;
+};
+
+struct vm_pptdev_limits {
+ int pptfd;
+ int msi_limit;
+ int msix_limit;
+};
+#endif /* __FreeBSD__ */
+
+struct vm_nmi {
+ int cpuid;
+};
+
+#ifdef __FreeBSD__
+#define MAX_VM_STATS 64
+#else /* __FreeBSD__ */
+#define MAX_VM_STATS (64 + VM_MAXCPU)
+#endif /* __FreeBSD__ */
+
+struct vm_stats {
+ int cpuid; /* in */
+ int num_entries; /* out */
+ struct timeval tv;
+ uint64_t statbuf[MAX_VM_STATS];
+};
+
+struct vm_stat_desc {
+ int index; /* in */
+ char desc[128]; /* out */
+};
+
+struct vm_x2apic {
+ int cpuid;
+ enum x2apic_state state;
+};
+
+struct vm_gpa_pte {
+ uint64_t gpa; /* in */
+ uint64_t pte[4]; /* out */
+ int ptenum;
+};
+
+struct vm_hpet_cap {
+ uint32_t capabilities; /* lower 32 bits of HPET capabilities */
+};
+
+struct vm_suspend {
+ enum vm_suspend_how how;
+};
+
+struct vm_gla2gpa {
+ int vcpuid; /* inputs */
+ int prot; /* PROT_READ or PROT_WRITE */
+ uint64_t gla;
+ struct vm_guest_paging paging;
+ int fault; /* outputs */
+ uint64_t gpa;
+};
+
+struct vm_activate_cpu {
+ int vcpuid;
+};
+
+struct vm_cpuset {
+ int which; /* presumably one of VM_*_CPUS below -- confirm */
+ int cpusetsize;
+#ifndef _KERNEL
+ cpuset_t *cpus;
+#else /* _KERNEL */
+ void *cpus;
+#endif /* _KERNEL */
+};
+#define VM_ACTIVE_CPUS 0
+#define VM_SUSPENDED_CPUS 1
+#define VM_DEBUG_CPUS 2
+
+struct vm_intinfo {
+ int vcpuid;
+ uint64_t info1;
+ uint64_t info2;
+};
+
+struct vm_rtc_time {
+ time_t secs;
+};
+
+struct vm_rtc_data {
+ int offset;
+ uint8_t value;
+};
+
+#ifndef __FreeBSD__
+struct vm_devmem_offset {
+ int segid;
+ off_t offset;
+};
+#endif /* !__FreeBSD__ */
+
+struct vm_cpu_topology {
+ uint16_t sockets;
+ uint16_t cores;
+ uint16_t threads;
+ uint16_t maxcpus;
+};
+
+enum {
+ /* general routines */
+ IOCNUM_ABIVERS = 0,
+ IOCNUM_RUN = 1,
+ IOCNUM_SET_CAPABILITY = 2,
+ IOCNUM_GET_CAPABILITY = 3,
+ IOCNUM_SUSPEND = 4,
+ IOCNUM_REINIT = 5,
+
+ /* memory apis */
+ IOCNUM_MAP_MEMORY = 10, /* deprecated */
+ IOCNUM_GET_MEMORY_SEG = 11, /* deprecated */
+ IOCNUM_GET_GPA_PMAP = 12,
+ IOCNUM_GLA2GPA = 13,
+ IOCNUM_ALLOC_MEMSEG = 14,
+ IOCNUM_GET_MEMSEG = 15,
+ IOCNUM_MMAP_MEMSEG = 16,
+ IOCNUM_MMAP_GETNEXT = 17,
+ IOCNUM_GLA2GPA_NOFAULT = 18,
+
+ /* register/state accessors */
+ IOCNUM_SET_REGISTER = 20,
+ IOCNUM_GET_REGISTER = 21,
+ IOCNUM_SET_SEGMENT_DESCRIPTOR = 22,
+ IOCNUM_GET_SEGMENT_DESCRIPTOR = 23,
+ IOCNUM_SET_REGISTER_SET = 24,
+ IOCNUM_GET_REGISTER_SET = 25,
+
+ /* interrupt injection */
+ IOCNUM_GET_INTINFO = 28,
+ IOCNUM_SET_INTINFO = 29,
+ IOCNUM_INJECT_EXCEPTION = 30,
+ IOCNUM_LAPIC_IRQ = 31,
+ IOCNUM_INJECT_NMI = 32,
+ IOCNUM_IOAPIC_ASSERT_IRQ = 33,
+ IOCNUM_IOAPIC_DEASSERT_IRQ = 34,
+ IOCNUM_IOAPIC_PULSE_IRQ = 35,
+ IOCNUM_LAPIC_MSI = 36,
+ IOCNUM_LAPIC_LOCAL_IRQ = 37,
+ IOCNUM_IOAPIC_PINCOUNT = 38,
+ IOCNUM_RESTART_INSTRUCTION = 39,
+
+ /* PCI pass-thru */
+ IOCNUM_BIND_PPTDEV = 40,
+ IOCNUM_UNBIND_PPTDEV = 41,
+ IOCNUM_MAP_PPTDEV_MMIO = 42,
+ IOCNUM_PPTDEV_MSI = 43,
+ IOCNUM_PPTDEV_MSIX = 44,
+ IOCNUM_GET_PPTDEV_LIMITS = 45,
+
+ /* statistics */
+ IOCNUM_VM_STATS = 50,
+ IOCNUM_VM_STAT_DESC = 51,
+
+ /* kernel device state */
+ IOCNUM_SET_X2APIC_STATE = 60,
+ IOCNUM_GET_X2APIC_STATE = 61,
+ IOCNUM_GET_HPET_CAPABILITIES = 62,
+
+ /* CPU Topology */
+ IOCNUM_SET_TOPOLOGY = 63,
+ IOCNUM_GET_TOPOLOGY = 64,
+
+ /* legacy interrupt injection */
+ IOCNUM_ISA_ASSERT_IRQ = 80,
+ IOCNUM_ISA_DEASSERT_IRQ = 81,
+ IOCNUM_ISA_PULSE_IRQ = 82,
+ IOCNUM_ISA_SET_IRQ_TRIGGER = 83,
+
+ /* vm_cpuset */
+ IOCNUM_ACTIVATE_CPU = 90,
+ IOCNUM_GET_CPUSET = 91,
+ IOCNUM_SUSPEND_CPU = 92,
+ IOCNUM_RESUME_CPU = 93,
+
+ /* RTC */
+ IOCNUM_RTC_READ = 100,
+ IOCNUM_RTC_WRITE = 101,
+ IOCNUM_RTC_SETTIME = 102,
+ IOCNUM_RTC_GETTIME = 103,
+
+#ifndef __FreeBSD__
+ /* illumos-custom ioctls */
+ IOCNUM_DEVMEM_GETOFFSET = 256,
+ IOCNUM_WRLOCK_CYCLE = 257,
+#endif
+};
+
+#define VM_RUN \
+ _IOWR('v', IOCNUM_RUN, struct vm_run)
+#define VM_SUSPEND \
+ _IOW('v', IOCNUM_SUSPEND, struct vm_suspend)
+#define VM_REINIT \
+ _IO('v', IOCNUM_REINIT)
+#define VM_ALLOC_MEMSEG \
+ _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg)
+#define VM_GET_MEMSEG \
+ _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg)
+#define VM_MMAP_MEMSEG \
+ _IOW('v', IOCNUM_MMAP_MEMSEG, struct vm_memmap)
+#define VM_MMAP_GETNEXT \
+ _IOWR('v', IOCNUM_MMAP_GETNEXT, struct vm_memmap)
+#define VM_SET_REGISTER \
+ _IOW('v', IOCNUM_SET_REGISTER, struct vm_register)
+#define VM_GET_REGISTER \
+ _IOWR('v', IOCNUM_GET_REGISTER, struct vm_register)
+#define VM_SET_SEGMENT_DESCRIPTOR \
+ _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
+#define VM_GET_SEGMENT_DESCRIPTOR \
+ _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
+#define VM_SET_REGISTER_SET \
+ _IOW('v', IOCNUM_SET_REGISTER_SET, struct vm_register_set)
+#define VM_GET_REGISTER_SET \
+ _IOWR('v', IOCNUM_GET_REGISTER_SET, struct vm_register_set)
+#define VM_INJECT_EXCEPTION \
+ _IOW('v', IOCNUM_INJECT_EXCEPTION, struct vm_exception)
+#define VM_LAPIC_IRQ \
+ _IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq)
+#define VM_LAPIC_LOCAL_IRQ \
+ _IOW('v', IOCNUM_LAPIC_LOCAL_IRQ, struct vm_lapic_irq)
+#define VM_LAPIC_MSI \
+ _IOW('v', IOCNUM_LAPIC_MSI, struct vm_lapic_msi)
+#define VM_IOAPIC_ASSERT_IRQ \
+ _IOW('v', IOCNUM_IOAPIC_ASSERT_IRQ, struct vm_ioapic_irq)
+#define VM_IOAPIC_DEASSERT_IRQ \
+ _IOW('v', IOCNUM_IOAPIC_DEASSERT_IRQ, struct vm_ioapic_irq)
+#define VM_IOAPIC_PULSE_IRQ \
+ _IOW('v', IOCNUM_IOAPIC_PULSE_IRQ, struct vm_ioapic_irq)
+#define VM_IOAPIC_PINCOUNT \
+ _IOR('v', IOCNUM_IOAPIC_PINCOUNT, int)
+#define VM_ISA_ASSERT_IRQ \
+ _IOW('v', IOCNUM_ISA_ASSERT_IRQ, struct vm_isa_irq)
+#define VM_ISA_DEASSERT_IRQ \
+ _IOW('v', IOCNUM_ISA_DEASSERT_IRQ, struct vm_isa_irq)
+#define VM_ISA_PULSE_IRQ \
+ _IOW('v', IOCNUM_ISA_PULSE_IRQ, struct vm_isa_irq)
+#define VM_ISA_SET_IRQ_TRIGGER \
+ _IOW('v', IOCNUM_ISA_SET_IRQ_TRIGGER, struct vm_isa_irq_trigger)
+#define VM_SET_CAPABILITY \
+ _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability)
+#define VM_GET_CAPABILITY \
+ _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability)
+#define VM_BIND_PPTDEV \
+ _IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev)
+#define VM_UNBIND_PPTDEV \
+ _IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev)
+#define VM_MAP_PPTDEV_MMIO \
+ _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio)
+#define VM_PPTDEV_MSI \
+ _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi)
+#define VM_PPTDEV_MSIX \
+ _IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix)
+#define VM_GET_PPTDEV_LIMITS \
+ _IOR('v', IOCNUM_GET_PPTDEV_LIMITS, struct vm_pptdev_limits)
+#define VM_INJECT_NMI \
+ _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi)
+#define VM_STATS_IOC \
+ _IOWR('v', IOCNUM_VM_STATS, struct vm_stats)
+#define VM_STAT_DESC \
+ _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc)
+#define VM_SET_X2APIC_STATE \
+ _IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic)
+#define VM_GET_X2APIC_STATE \
+ _IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic)
+#define VM_GET_HPET_CAPABILITIES \
+ _IOR('v', IOCNUM_GET_HPET_CAPABILITIES, struct vm_hpet_cap)
+#define VM_SET_TOPOLOGY \
+ _IOW('v', IOCNUM_SET_TOPOLOGY, struct vm_cpu_topology)
+#define VM_GET_TOPOLOGY \
+ _IOR('v', IOCNUM_GET_TOPOLOGY, struct vm_cpu_topology)
+#define VM_GET_GPA_PMAP \
+ _IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte)
+#define VM_GLA2GPA \
+ _IOWR('v', IOCNUM_GLA2GPA, struct vm_gla2gpa)
+#define VM_GLA2GPA_NOFAULT \
+ _IOWR('v', IOCNUM_GLA2GPA_NOFAULT, struct vm_gla2gpa)
+#define VM_ACTIVATE_CPU \
+ _IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu)
+#define VM_GET_CPUS \
+ _IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset)
+#define VM_SUSPEND_CPU \
+ _IOW('v', IOCNUM_SUSPEND_CPU, struct vm_activate_cpu)
+#define VM_RESUME_CPU \
+ _IOW('v', IOCNUM_RESUME_CPU, struct vm_activate_cpu)
+#define VM_SET_INTINFO \
+ _IOW('v', IOCNUM_SET_INTINFO, struct vm_intinfo)
+#define VM_GET_INTINFO \
+ _IOWR('v', IOCNUM_GET_INTINFO, struct vm_intinfo)
+#define VM_RTC_WRITE \
+ _IOW('v', IOCNUM_RTC_WRITE, struct vm_rtc_data)
+#define VM_RTC_READ \
+ _IOWR('v', IOCNUM_RTC_READ, struct vm_rtc_data)
+#define VM_RTC_SETTIME \
+ _IOW('v', IOCNUM_RTC_SETTIME, struct vm_rtc_time)
+#define VM_RTC_GETTIME \
+ _IOR('v', IOCNUM_RTC_GETTIME, struct vm_rtc_time)
+#define VM_RESTART_INSTRUCTION \
+ _IOW('v', IOCNUM_RESTART_INSTRUCTION, int)
+
+#ifndef __FreeBSD__
+#define VM_DEVMEM_GETOFFSET \
+ _IOW('v', IOCNUM_DEVMEM_GETOFFSET, struct vm_devmem_offset)
+#define VM_WRLOCK_CYCLE _IO('v', IOCNUM_WRLOCK_CYCLE)
+
+/* ioctls used against ctl device for vm create/destroy */
+#define VMM_IOC_BASE (('V' << 16) | ('M' << 8))
+#define VMM_CREATE_VM (VMM_IOC_BASE | 0x01)
+#define VMM_DESTROY_VM (VMM_IOC_BASE | 0x02)
+#define VMM_VM_SUPPORTED (VMM_IOC_BASE | 0x03)
+
+#define VMM_CTL_DEV "/dev/vmmctl"
+
+#endif /* !__FreeBSD__ */
+
+#endif /* _VMM_DEV_H_ */
diff --git a/usr/src/uts/i86pc/sys/vmm_drv.h b/usr/src/uts/i86pc/sys/vmm_drv.h
new file mode 100644
index 0000000000..856b75e5cc
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/vmm_drv.h
@@ -0,0 +1,53 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#ifndef _VMM_DRV_H_
+#define _VMM_DRV_H_
+
+#ifdef _KERNEL
+
+#include <sys/file.h>
+
+struct vmm_hold;
+typedef struct vmm_hold vmm_hold_t;
+
+struct vmm_lease;
+typedef struct vmm_lease vmm_lease_t;
+
+/*
+ * Because of tangled headers, these definitions mirror their vmm_[rw]mem_cb_t
+ * counterparts in vmm.h.
+ */
+typedef int (*vmm_drv_rmem_cb_t)(void *, uintptr_t, uint_t, uint64_t *);
+typedef int (*vmm_drv_wmem_cb_t)(void *, uintptr_t, uint_t, uint64_t);
+
+extern int vmm_drv_hold(file_t *, cred_t *, vmm_hold_t **);
+extern void vmm_drv_rele(vmm_hold_t *);
+extern boolean_t vmm_drv_release_reqd(vmm_hold_t *);
+
+extern vmm_lease_t *vmm_drv_lease_sign(vmm_hold_t *, boolean_t (*)(void *),
+ void *);
+extern void vmm_drv_lease_break(vmm_hold_t *, vmm_lease_t *);
+extern boolean_t vmm_drv_lease_expired(vmm_lease_t *);
+
+extern void *vmm_drv_gpa2kva(vmm_lease_t *, uintptr_t, size_t);
+extern int vmm_drv_msi(vmm_lease_t *, uint64_t, uint64_t);
+
+extern int vmm_drv_ioport_hook(vmm_hold_t *, uint_t, vmm_drv_rmem_cb_t,
+ vmm_drv_wmem_cb_t, void *, void **);
+extern void vmm_drv_ioport_unhook(vmm_hold_t *, void **);
+#endif /* _KERNEL */
+
+#endif /* _VMM_DRV_H_ */
diff --git a/usr/src/uts/i86pc/sys/vmm_impl.h b/usr/src/uts/i86pc/sys/vmm_impl.h
new file mode 100644
index 0000000000..cdc56cc464
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/vmm_impl.h
@@ -0,0 +1,89 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#ifndef _VMM_IMPL_H_
+#define _VMM_IMPL_H_
+
+#include <sys/mutex.h>
+#include <sys/queue.h>
+#include <sys/varargs.h>
+#include <sys/zone.h>
+
+#ifdef _KERNEL
+
+#define VMM_CTL_MINOR 0
+
+/*
+ * Rather than creating whole character devices for devmem mappings, they are
+ * available by mmap(2)ing the vmm handle at a specific offset. These offsets
+ * begin just above the maximum allowed guest physical address.
+ */
+#include <vm/vm_param.h>
+#define VM_DEVMEM_START (VM_MAXUSER_ADDRESS + 1)
+
+struct vmm_devmem_entry {
+ list_node_t vde_node;
+ int vde_segid;
+ char vde_name[SPECNAMELEN + 1];
+ size_t vde_len;
+ off_t vde_off;
+};
+typedef struct vmm_devmem_entry vmm_devmem_entry_t;
+
+typedef struct vmm_zsd vmm_zsd_t;
+
+enum vmm_softc_state {
+ VMM_HELD = 1, /* external driver(s) possess hold on the VM */
+ VMM_CLEANUP = 2, /* request that holds are released */
+ VMM_PURGED = 4, /* all holds have been released */
+ VMM_BLOCK_HOOK = 8, /* mem hook install temporarily blocked */
+ VMM_DESTROY = 16 /* VM is destroyed, softc still around */
+};
+
+struct vmm_softc {
+ list_node_t vmm_node;
+ struct vm *vmm_vm;
+ minor_t vmm_minor;
+ char vmm_name[VM_MAX_NAMELEN];
+ list_t vmm_devmem_list;
+
+ kcondvar_t vmm_cv;
+ list_t vmm_holds;
+ uint_t vmm_flags;
+ boolean_t vmm_is_open;
+
+ kmutex_t vmm_lease_lock;
+ list_t vmm_lease_list;
+ uint_t vmm_lease_blocker;
+ kcondvar_t vmm_lease_cv;
+ krwlock_t vmm_rwlock;
+
+ /* For zone specific data */
+ list_node_t vmm_zsd_linkage;
+ zone_t *vmm_zone;
+ vmm_zsd_t *vmm_zsd;
+};
+typedef struct vmm_softc vmm_softc_t;
+
+void vmm_zsd_init(void);
+void vmm_zsd_fini(void);
+int vmm_zsd_add_vm(vmm_softc_t *sc);
+void vmm_zsd_rem_vm(vmm_softc_t *sc);
+int vmm_do_vm_destroy(vmm_softc_t *, boolean_t);
+
+#endif /* _KERNEL */
+
+#endif /* _VMM_IMPL_H_ */
diff --git a/usr/src/uts/i86pc/sys/vmm_instruction_emul.h b/usr/src/uts/i86pc/sys/vmm_instruction_emul.h
new file mode 100644
index 0000000000..f10f407164
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/vmm_instruction_emul.h
@@ -0,0 +1,137 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ */
+
+#ifndef _VMM_INSTRUCTION_EMUL_H_
+#define _VMM_INSTRUCTION_EMUL_H_
+
+#include <sys/mman.h>
+
+/*
+ * Callback functions to read and write memory regions.
+ */
+typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa,
+ uint64_t *rval, int rsize, void *arg);
+
+typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa,
+ uint64_t wval, int wsize, void *arg);
+
+/*
+ * Emulate the decoded 'vie' instruction.
+ *
+ * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region
+ * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the
+ * callback functions.
+ *
+ * The 'void *vm' argument should be a 'struct vm *' when called from
+ * kernel context and a 'struct vmctx *' when called from user
+ * context.
+ */
+int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
+ struct vm_guest_paging *paging, mem_region_read_t mrr,
+ mem_region_write_t mrw, void *mrarg);
+
+int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
+ uint64_t val, int size);
+
+/*
+ * Returns 1 if an alignment check exception should be injected and 0 otherwise.
+ */
+int vie_alignment_check(int cpl, int operand_size, uint64_t cr0,
+ uint64_t rflags, uint64_t gla);
+
+/* Returns 1 if the 'gla' is not canonical and 0 otherwise. */
+int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla);
+
+uint64_t vie_size2mask(int size);
+
+int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
+ struct seg_desc *desc, uint64_t off, int length, int addrsize, int prot,
+ uint64_t *gla);
+
+#ifdef _KERNEL
+/*
+ * APIs to fetch and decode the instruction from nested page fault handler.
+ *
+ * 'vie' must be initialized before calling 'vmm_fetch_instruction()'
+ */
+int vmm_fetch_instruction(struct vm *vm, int cpuid,
+ struct vm_guest_paging *guest_paging,
+ uint64_t rip, int inst_length, struct vie *vie,
+ int *is_fault);
+
+/*
+ * Translate the guest linear address 'gla' to a guest physical address.
+ *
+ * retval   is_fault   Interpretation
+ *   0         0       'gpa' contains result of the translation
+ *   0         1       An exception was injected into the guest
+ * EFAULT     N/A      An unrecoverable hypervisor error occurred
+ */
+int vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
+ uint64_t gla, int prot, uint64_t *gpa, int *is_fault);
+
+/*
+ * Like vm_gla2gpa, but no exceptions are injected into the guest and
+ * PTEs are not changed.
+ */
+int vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
+ uint64_t gla, int prot, uint64_t *gpa, int *is_fault);
+
+void vie_init(struct vie *vie, const char *inst_bytes, int inst_length);
+
+/*
+ * Decode the instruction fetched into 'vie' so it can be emulated.
+ *
+ * 'gla' is the guest linear address provided by the hardware assist
+ * that caused the nested page table fault. It is used to verify that
+ * the software instruction decoding is in agreement with the hardware.
+ *
+ * Some hardware assists do not provide the 'gla' to the hypervisor.
+ * To skip the 'gla' verification for this or any other reason pass
+ * in VIE_INVALID_GLA instead.
+ */
+#define VIE_INVALID_GLA (1UL << 63) /* a non-canonical address */
+int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
+ enum vm_cpu_mode cpu_mode, int csd, struct vie *vie);
+#endif /* _KERNEL */
+
+#endif /* _VMM_INSTRUCTION_EMUL_H_ */