diff options
Diffstat (limited to 'usr/src/uts/i86pc/sys')
| -rw-r--r-- | usr/src/uts/i86pc/sys/Makefile | 15 | ||||
| -rw-r--r-- | usr/src/uts/i86pc/sys/apic.h | 2 | ||||
| -rw-r--r-- | usr/src/uts/i86pc/sys/comm_page.h | 1 | ||||
| -rw-r--r-- | usr/src/uts/i86pc/sys/gipt.h | 92 | ||||
| -rw-r--r-- | usr/src/uts/i86pc/sys/hma.h | 39 | ||||
| -rw-r--r-- | usr/src/uts/i86pc/sys/machcpuvar.h | 8 | ||||
| -rw-r--r-- | usr/src/uts/i86pc/sys/machparam.h | 190 | ||||
| -rw-r--r-- | usr/src/uts/i86pc/sys/machsystm.h | 2 | ||||
| -rw-r--r-- | usr/src/uts/i86pc/sys/ppt_dev.h | 56 | ||||
| -rw-r--r-- | usr/src/uts/i86pc/sys/viona_io.h | 63 | ||||
| -rw-r--r-- | usr/src/uts/i86pc/sys/vm_machparam.h | 4 | ||||
| -rw-r--r-- | usr/src/uts/i86pc/sys/vmm.h | 748 | ||||
| -rw-r--r-- | usr/src/uts/i86pc/sys/vmm_dev.h | 520 | ||||
| -rw-r--r-- | usr/src/uts/i86pc/sys/vmm_drv.h | 53 | ||||
| -rw-r--r-- | usr/src/uts/i86pc/sys/vmm_impl.h | 89 | ||||
| -rw-r--r-- | usr/src/uts/i86pc/sys/vmm_instruction_emul.h | 137 |
16 files changed, 1877 insertions, 142 deletions
diff --git a/usr/src/uts/i86pc/sys/Makefile b/usr/src/uts/i86pc/sys/Makefile index 292cd04c2b..3d8332a930 100644 --- a/usr/src/uts/i86pc/sys/Makefile +++ b/usr/src/uts/i86pc/sys/Makefile @@ -37,7 +37,7 @@ include ../Makefile.i86pc # FILEMODE = 644 -HDRS= \ +CHKHDRS= \ acpidev.h \ amd_iommu.h \ asm_misc.h \ @@ -46,6 +46,7 @@ HDRS= \ ddi_subrdefs.h \ debug_info.h \ fastboot.h \ + hma.h \ mach_mmu.h \ machclock.h \ machcpuvar.h \ @@ -68,6 +69,16 @@ HDRS= \ xc_levels.h \ xsvc.h +NOCHKHDRS= \ + vmm.h \ + vmm_dev.h \ + vmm_impl.h \ + vmm_instruction_emul.h + +HDRS= \ + $(CHKHDRS) \ + $(NOCHKHDRS) + ROOTHDRS= $(HDRS:%=$(USR_PSM_ISYS_DIR)/%) ROOTDIR= $(ROOT)/usr/share/src @@ -76,7 +87,7 @@ ROOTDIRS= $(ROOTDIR)/uts $(ROOTDIR)/uts/$(PLATFORM) ROOTLINK= $(ROOTDIR)/uts/$(PLATFORM)/sys LINKDEST= ../../../../platform/$(PLATFORM)/include/sys -CHECKHDRS= $(HDRS:%.h=%.check) +CHECKHDRS= $(CHKHDRS:%.h=%.check) .KEEP_STATE: diff --git a/usr/src/uts/i86pc/sys/apic.h b/usr/src/uts/i86pc/sys/apic.h index 26626ec5a4..f2528a632f 100644 --- a/usr/src/uts/i86pc/sys/apic.h +++ b/usr/src/uts/i86pc/sys/apic.h @@ -386,7 +386,7 @@ struct apic_io_intr { /* special or reserve vectors */ #define APIC_CHECK_RESERVE_VECTORS(v) \ (((v) == T_FASTTRAP) || ((v) == APIC_SPUR_INTR) || \ - ((v) == T_SYSCALLINT) || ((v) == T_DTRACE_RET)) + ((v) == T_SYSCALLINT) || ((v) == T_DTRACE_RET) || ((v) == 0x80)) /* cmos shutdown code for BIOS */ #define BIOS_SHUTDOWN 0x0a diff --git a/usr/src/uts/i86pc/sys/comm_page.h b/usr/src/uts/i86pc/sys/comm_page.h index 520ad9001d..ea19c856a8 100644 --- a/usr/src/uts/i86pc/sys/comm_page.h +++ b/usr/src/uts/i86pc/sys/comm_page.h @@ -27,6 +27,7 @@ extern "C" { #endif #define COMM_PAGE_SIZE PAGESIZE +#define COMM_PAGE_ALIGN 0x4000 #ifndef _ASM diff --git a/usr/src/uts/i86pc/sys/gipt.h b/usr/src/uts/i86pc/sys/gipt.h new file mode 100644 index 0000000000..4d7d523726 --- /dev/null +++ b/usr/src/uts/i86pc/sys/gipt.h @@ -0,0 +1,92 @@ +/* + * This file and its contents are supplied 
under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _GIPT_H_ +#define _GIPT_H_ + +#include <sys/types.h> +#include <sys/mutex.h> +#include <sys/param.h> +#include <sys/list.h> + +struct gipt { + list_node_t gipt_node; + uint64_t gipt_vaddr; + uint64_t gipt_pfn; + uint16_t gipt_level; + uint16_t gipt_valid_cnt; + uint32_t _gipt_pad; + struct gipt *gipt_parent; + uint64_t *gipt_kva; + uint64_t _gipt_pad2; +}; +typedef struct gipt gipt_t; + +typedef enum { + PTET_EMPTY = 0, + PTET_PAGE = 1, + PTET_LINK = 2, +} gipt_pte_type_t; + +/* Given a PTE and its level, determine the type of that PTE */ +typedef gipt_pte_type_t (*gipt_pte_type_cb_t)(uint64_t, uint_t); +/* Given the PFN of a child table, emit a PTE that references it */ +typedef uint64_t (*gipt_pte_map_cb_t)(uint64_t); + +struct gipt_cbs { + gipt_pte_type_cb_t giptc_pte_type; + gipt_pte_map_cb_t giptc_pte_map; +}; + +struct gipt_map { + kmutex_t giptm_lock; + gipt_t *giptm_root; + list_t *giptm_hash; + struct gipt_cbs giptm_cbs; + size_t giptm_table_cnt; + uint_t giptm_levels; +}; +typedef struct gipt_map gipt_map_t; + +#define GIPT_HASH_SIZE_DEFAULT 0x2000 +#define GIPT_MAX_LEVELS 4 + +#define GIPT_VA2IDX(pt, va) \ + (((va) - (pt)->gipt_vaddr) >> \ + gipt_level_shift[(pt)->gipt_level]) + +#define GIPT_VA2PTE(pt, va) ((pt)->gipt_kva[GIPT_VA2IDX(pt, va)]) +#define GIPT_VA2PTEP(pt, va) (&(pt)->gipt_kva[GIPT_VA2IDX(pt, va)]) + +extern const uint_t gipt_level_shift[GIPT_MAX_LEVELS+1]; +extern const uint64_t gipt_level_mask[GIPT_MAX_LEVELS+1]; +extern const uint64_t gipt_level_size[GIPT_MAX_LEVELS+1]; +extern const uint64_t 
gipt_level_count[GIPT_MAX_LEVELS+1]; + +extern gipt_t *gipt_alloc(void); +extern void gipt_free(gipt_t *); +extern void gipt_map_init(gipt_map_t *, uint_t, uint_t, + const struct gipt_cbs *, gipt_t *); +extern void gipt_map_fini(gipt_map_t *); +extern gipt_t *gipt_map_lookup(gipt_map_t *, uint64_t, uint_t); +extern gipt_t *gipt_map_lookup_deepest(gipt_map_t *, uint64_t); +extern uint64_t gipt_map_next_page(gipt_map_t *, uint64_t, uint64_t, + gipt_t **); +extern void gipt_map_insert(gipt_map_t *, gipt_t *); +extern void gipt_map_remove(gipt_map_t *, gipt_t *); +extern gipt_t *gipt_map_create_parents(gipt_map_t *, uint64_t, uint_t); +extern void gipt_map_clean_parents(gipt_map_t *, gipt_t *); + +#endif /* _GIPT_H_ */ diff --git a/usr/src/uts/i86pc/sys/hma.h b/usr/src/uts/i86pc/sys/hma.h index 00009cf439..16ab708896 100644 --- a/usr/src/uts/i86pc/sys/hma.h +++ b/usr/src/uts/i86pc/sys/hma.h @@ -10,7 +10,7 @@ */ /* - * Copyright (c) 2018, Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #ifndef _SYS_HMA_H @@ -30,6 +30,40 @@ extern "C" { #endif + +/* + * Register a hypervisor with HMA. On success, a pointer to the opaque + * registration token will be returned, indicating that proper host setup has + * occurred for further hypervisor actions. + */ +typedef struct hma_reg hma_reg_t; +extern hma_reg_t *hma_register(const char *); +extern hma_reg_t *hma_register_exclusive(const char *); +extern void hma_unregister(hma_reg_t *); + +/* + * Allocate or free a VPID for use with VMX. + * + * This must not be performed by a hypervisor until it has successfully + * registered via hma_register(). + */ +extern uint16_t hma_vmx_vpid_alloc(void); +extern void hma_vmx_vpid_free(uint16_t); + +/* + * On all active CPUs, perform a single-context INVEPT on the given EPTP. 
+ */ +extern void hma_vmx_invept_allcpus(uintptr_t); + +struct hma_svm_asid { + uint64_t hsa_gen; + uint32_t hsa_asid; +}; +typedef struct hma_svm_asid hma_svm_asid_t; + +extern void hma_svm_asid_init(hma_svm_asid_t *); +extern uint8_t hma_svm_asid_update(hma_svm_asid_t *, boolean_t, boolean_t); + /* * FPU related management. These functions provide a set of APIs to manage the * FPU state and switch between host and guest management of this state. @@ -96,6 +130,9 @@ extern void hma_fpu_stop_guest(hma_fpu_t *); extern void hma_fpu_get_fxsave_state(const hma_fpu_t *, struct fxsave_state *); extern int hma_fpu_set_fxsave_state(hma_fpu_t *, const struct fxsave_state *); +/* Perform HMA initialization steps during boot-up. */ +extern void hma_init(void); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/i86pc/sys/machcpuvar.h b/usr/src/uts/i86pc/sys/machcpuvar.h index f4e38dec98..772f3112cb 100644 --- a/usr/src/uts/i86pc/sys/machcpuvar.h +++ b/usr/src/uts/i86pc/sys/machcpuvar.h @@ -81,6 +81,12 @@ struct xen_evt_data { ulong_t evt_affinity[sizeof (ulong_t) * 8]; /* service on cpu */ }; +enum fast_syscall_state { + FSS_DISABLED = 0, + FSS_ASYSC_ENABLED = (1 << 0), + FSS_SEP_ENABLED = (1 << 1) +}; + struct kpti_frame { uint64_t kf_lower_redzone; @@ -214,6 +220,8 @@ struct machcpu { uint16_t mcpu_idle_type; /* CPU next idle type */ uint16_t max_cstates; /* supported max cstates */ + enum fast_syscall_state mcpu_fast_syscall_state; + struct cpu_ucode_info *mcpu_ucode_info; void *mcpu_pm_mach_state; diff --git a/usr/src/uts/i86pc/sys/machparam.h b/usr/src/uts/i86pc/sys/machparam.h index 51d7559483..f79b582df4 100644 --- a/usr/src/uts/i86pc/sys/machparam.h +++ b/usr/src/uts/i86pc/sys/machparam.h @@ -31,14 +31,15 @@ #ifndef _SYS_MACHPARAM_H #define _SYS_MACHPARAM_H -#if !defined(_ASM) +#ifndef _ASM + #include <sys/types.h> #if defined(__xpv) #include <sys/xpv_impl.h> #endif -#endif +#endif /* !_ASM */ #ifdef __cplusplus extern "C" { @@ -54,17 +55,12 @@ extern "C" { * 
Machine dependent parameters and limits. */ -#if defined(__amd64) /* * If NCPU grows beyond 256, sizing for the x86 comm page will require * adjustment. */ #define NCPU 256 #define NCPU_LOG2 8 -#elif defined(__i386) -#define NCPU 32 -#define NCPU_LOG2 5 -#endif /* NCPU_P2 is NCPU rounded to a power of 2 */ #define NCPU_P2 (1 << NCPU_LOG2) @@ -116,11 +112,7 @@ extern "C" { /* * DEFAULT KERNEL THREAD stack size (in pages). */ -#if defined(__amd64) #define DEFAULTSTKSZ_NPGS 5 -#elif defined(__i386) -#define DEFAULTSTKSZ_NPGS 3 -#endif #if !defined(_ASM) #define DEFAULTSTKSZ (DEFAULTSTKSZ_NPGS * PAGESIZE) @@ -129,43 +121,42 @@ extern "C" { #endif /* !_ASM */ /* - * KERNELBASE is the virtual address at which the kernel segments start in - * all contexts. - * - * KERNELBASE is not fixed. The value of KERNELBASE can change with - * installed memory or on 32 bit systems the eprom variable 'eprom_kernelbase'. - * - * common/conf/param.c requires a compile time defined value for KERNELBASE. - * This value is save in the variable _kernelbase. _kernelbase may then be - * modified with to a different value in i86pc/os/startup.c. - * - * Most code should be using kernelbase, which resolves to a reference to - * _kernelbase. + * During intial boot we limit heap to the top 4Gig. */ -#define KERNEL_TEXT_amd64 UINT64_C(0xfffffffffb800000) - -#ifdef __i386 - -#define KERNEL_TEXT_i386 ADDRESS_C(0xfe800000) +#define BOOT_KERNELHEAP_BASE ADDRESS_C(0xffffffff00000000) /* - * We don't use HYPERVISOR_VIRT_START, as we need both the PAE and non-PAE - * versions in our code. We always compile based on the lower PAE address. + * VMWare works best if we don't use the top 64Meg of memory for amd64. + * Set KERNEL_TEXT to top_o_memory - 64Meg - 8 Meg for 8Meg of nucleus pages. 
*/ -#define KERNEL_TEXT_i386_xpv \ - (HYPERVISOR_VIRT_START_PAE - 3 * ADDRESS_C(0x400000)) - -#endif /* __i386 */ +#define PROMSTART ADDRESS_C(0xffc00000) -#if defined(__amd64) +/* + * Virtual address range available to the debugger + */ +#define SEGDEBUGBASE ADDRESS_C(0xffffffffff800000) +#define SEGDEBUGSIZE ADDRESS_C(0x400000) -#define KERNELBASE ADDRESS_C(0xfffffd8000000000) +#define KERNEL_TEXT UINT64_C(0xfffffffffb800000) /* - * Size of the unmapped "red zone" at the very bottom of the kernel's - * address space. Corresponds to 1 slot in the toplevel pagetable. + * Reserve pages just below KERNEL_TEXT for the GDT, IDT, LDT, TSS and debug + * info. + * + * For now, DEBUG_INFO_VA must be first in this list for "xm" initiated dumps + * of solaris domUs to be usable with mdb. Relying on a fixed VA is not viable + * long term, but it's the best we've got for now. */ -#define KERNEL_REDZONE_SIZE ((uintptr_t)1 << 39) +#if !defined(_ASM) +#define DEBUG_INFO_VA (KERNEL_TEXT - MMU_PAGESIZE) +#define GDT_VA (DEBUG_INFO_VA - MMU_PAGESIZE) +#define IDT_VA (GDT_VA - MMU_PAGESIZE) +#define LDT_VA (IDT_VA - (16 * MMU_PAGESIZE)) +#define KTSS_VA (LDT_VA - MMU_PAGESIZE) +#define DFTSS_VA (KTSS_VA - MMU_PAGESIZE) +#define MISC_VA_BASE (DFTSS_VA) +#define MISC_VA_SIZE (KERNEL_TEXT - MISC_VA_BASE) +#endif /* !_ASM */ /* * Base of 'core' heap area, which is used for kernel and module text/data @@ -174,52 +165,47 @@ extern "C" { #define COREHEAP_BASE ADDRESS_C(0xffffffffc0000000) /* - * Beginning of the segkpm window. A lower value than this is used if - * physical addresses exceed 1TB. See i86pc/os/startup.c - */ -#define SEGKPM_BASE ADDRESS_C(0xfffffe0000000000) - -/* * This is valloc_base, above seg_kpm, but below everything else. * A lower value than this may be used if SEGKPM_BASE is adjusted. 
* See i86pc/os/startup.c */ -#define VALLOC_BASE ADDRESS_C(0xffffff0000000000) +#define VALLOC_BASE ADDRESS_C(0xfffffe0000000000) + +#define SEGZIOMINSIZE (400L * 1024 * 1024L) /* 400M */ +#define SEGVMMMINSIZE (4096L * 1024 * 1024L) /* 4G */ -/* - * default and boundary sizes for segkp - */ #define SEGKPDEFSIZE (2L * 1024L * 1024L * 1024L) /* 2G */ #define SEGKPMAXSIZE (8L * 1024L * 1024L * 1024L) /* 8G */ #define SEGKPMINSIZE (200L * 1024 * 1024L) /* 200M */ -/* - * minimum size for segzio - */ -#define SEGZIOMINSIZE (400L * 1024 * 1024L) /* 400M */ - -/* - * During intial boot we limit heap to the top 4Gig. - */ -#define BOOT_KERNELHEAP_BASE ADDRESS_C(0xffffffff00000000) +#define SEGKPM_BASE ADDRESS_C(0xfffffd0000000000) /* - * VMWare works best if we don't use the top 64Meg of memory for amd64. - * Set KERNEL_TEXT to top_o_memory - 64Meg - 8 Meg for 8Meg of nucleus pages. + * KERNELBASE is the virtual address at which the kernel segments start in + * all contexts. + * + * KERNELBASE is not fixed. The value of KERNELBASE can change with + * installed memory size. + * + * common/conf/param.c requires a compile time defined value for KERNELBASE. + * This value is save in the variable _kernelbase. _kernelbase may then be + * modified with to a different value in i86pc/os/startup.c. + * + * Most code should be using kernelbase, which resolves to a reference to + * _kernelbase. */ -#define PROMSTART ADDRESS_C(0xffc00000) -#define KERNEL_TEXT KERNEL_TEXT_amd64 +#define KERNELBASE ADDRESS_C(0xfffffc8000000000) /* - * Virtual address range available to the debugger + * Size of the unmapped "red zone" at the very bottom of the kernel's + * address space. Corresponds to 1 slot in the toplevel pagetable. 
*/ -#define SEGDEBUGBASE ADDRESS_C(0xffffffffff800000) -#define SEGDEBUGSIZE ADDRESS_C(0x400000) +#define KERNEL_REDZONE_SIZE ((uintptr_t)1 << 39) /* * Define upper limit on user address space * - * In amd64, the upper limit on a 64-bit user address space is 1 large page + * The upper limit on a 64-bit user address space is 1 large page * (2MB) below kernelbase. The upper limit for a 32-bit user address space * is 1 small page (4KB) below the top of the 32-bit range. The 64-bit * limit give dtrace the red zone it needs below kernelbase. The 32-bit @@ -232,7 +218,7 @@ extern "C" { #if defined(__xpv) #define USERLIMIT ADDRESS_C(0x00007fffffe00000) #else -#define USERLIMIT ADDRESS_C(0xfffffd7fffe00000) +#define USERLIMIT ADDRESS_C(0xfffffc7fffe00000) #endif #ifdef bug_5074717_is_fixed @@ -241,76 +227,6 @@ extern "C" { #define USERLIMIT32 ADDRESS_C(0xfefff000) #endif -#elif defined(__i386) - -#ifdef DEBUG -#define KERNELBASE ADDRESS_C(0xc8000000) -#else -#define KERNELBASE ADDRESS_C(0xd4000000) -#endif - -#define KERNELBASE_MAX ADDRESS_C(0xe0000000) - -/* - * The i386 ABI requires that the user address space be at least 3Gb - * in size. KERNELBASE_ABI_MIN is used as the default KERNELBASE for - * physical memory configurations > 4gb. - */ -#define KERNELBASE_ABI_MIN ADDRESS_C(0xc0000000) - -/* - * Size of the unmapped "red zone" at the very bottom of the kernel's - * address space. Since segmap start immediately above the red zone, this - * needs to be MAXBSIZE aligned. - */ -#define KERNEL_REDZONE_SIZE MAXBSIZE - -/* - * This is the last 4MB of the 4G address space. Some psm modules - * need this region of virtual address space mapped 1-1 - * The top 64MB of the address space is reserved for the hypervisor. 
- */ -#define PROMSTART ADDRESS_C(0xffc00000) -#ifdef __xpv -#define KERNEL_TEXT KERNEL_TEXT_i386_xpv -#else -#define KERNEL_TEXT KERNEL_TEXT_i386 -#endif - -/* - * Virtual address range available to the debugger - * We place it just above the kernel text (4M) and kernel data (4M). - */ -#define SEGDEBUGBASE (KERNEL_TEXT + ADDRESS_C(0x800000)) -#define SEGDEBUGSIZE ADDRESS_C(0x400000) - -/* - * Define upper limit on user address space - */ -#define USERLIMIT KERNELBASE -#define USERLIMIT32 USERLIMIT - -#endif /* __i386 */ - -/* - * Reserve pages just below KERNEL_TEXT for the GDT, IDT, LDT, TSS and debug - * info. - * - * For now, DEBUG_INFO_VA must be first in this list for "xm" initiated dumps - * of solaris domUs to be usable with mdb. Relying on a fixed VA is not viable - * long term, but it's the best we've got for now. - */ -#if !defined(_ASM) -#define DEBUG_INFO_VA (KERNEL_TEXT - MMU_PAGESIZE) -#define GDT_VA (DEBUG_INFO_VA - MMU_PAGESIZE) -#define IDT_VA (GDT_VA - MMU_PAGESIZE) -#define LDT_VA (IDT_VA - (16 * MMU_PAGESIZE)) -#define KTSS_VA (LDT_VA - MMU_PAGESIZE) -#define DFTSS_VA (KTSS_VA - MMU_PAGESIZE) -#define MISC_VA_BASE (DFTSS_VA) -#define MISC_VA_SIZE (KERNEL_TEXT - MISC_VA_BASE) -#endif /* !_ASM */ - #if !defined(_ASM) && !defined(_KMDB) extern uintptr_t kernelbase, segmap_start, segmapsize; #endif diff --git a/usr/src/uts/i86pc/sys/machsystm.h b/usr/src/uts/i86pc/sys/machsystm.h index 7409c5af4a..5f286ca4c6 100644 --- a/usr/src/uts/i86pc/sys/machsystm.h +++ b/usr/src/uts/i86pc/sys/machsystm.h @@ -25,6 +25,7 @@ /* * Copyright (c) 2010, Intel Corporation. * All rights reserved. + * Copyright 2018 Joyent, Inc. 
*/ #ifndef _SYS_MACHSYSTM_H @@ -231,6 +232,7 @@ extern page_t *page_get_high_mfn(mfn_t); #endif extern hrtime_t tsc_gethrtime_tick_delta(void); +extern hrtime_t tsc_gethrtime_params(uint64_t *, uint32_t *, uint8_t *); #endif /* _KERNEL */ diff --git a/usr/src/uts/i86pc/sys/ppt_dev.h b/usr/src/uts/i86pc/sys/ppt_dev.h new file mode 100644 index 0000000000..e25f941f14 --- /dev/null +++ b/usr/src/uts/i86pc/sys/ppt_dev.h @@ -0,0 +1,56 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc + */ + +#ifndef _PPT_DEV_H +#define _PPT_DEV_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define PPT_IOC (('P' << 16)|('T' << 8)) + +#define PPT_CFG_READ (PPT_IOC | 0x01) +#define PPT_CFG_WRITE (PPT_IOC | 0x02) +#define PPT_BAR_QUERY (PPT_IOC | 0x03) +#define PPT_BAR_READ (PPT_IOC | 0x04) +#define PPT_BAR_WRITE (PPT_IOC | 0x05) + +#define PPT_MAXNAMELEN 32 + +struct ppt_cfg_io { + uint64_t pci_off; + uint32_t pci_width; + uint32_t pci_data; +}; +struct ppt_bar_io { + uint32_t pbi_bar; + uint32_t pbi_off; + uint32_t pbi_width; + uint32_t pbi_data; +}; + +struct ppt_bar_query { + uint32_t pbq_baridx; + uint32_t pbq_type; + uint64_t pbq_base; + uint64_t pbq_size; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _PPT_DEV_H */ diff --git a/usr/src/uts/i86pc/sys/viona_io.h b/usr/src/uts/i86pc/sys/viona_io.h new file mode 100644 index 0000000000..46cc72eb06 --- /dev/null +++ b/usr/src/uts/i86pc/sys/viona_io.h @@ -0,0 +1,63 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _VIONA_IO_H_ +#define _VIONA_IO_H_ + +#define VNA_IOC (('V' << 16)|('C' << 8)) +#define VNA_IOC_CREATE (VNA_IOC | 0x01) +#define VNA_IOC_DELETE (VNA_IOC | 0x02) + +#define VNA_IOC_RING_INIT (VNA_IOC | 0x10) +#define VNA_IOC_RING_RESET (VNA_IOC | 0x11) +#define VNA_IOC_RING_KICK (VNA_IOC | 0x12) +#define VNA_IOC_RING_SET_MSI (VNA_IOC | 0x13) +#define VNA_IOC_RING_INTR_CLR (VNA_IOC | 0x14) + +#define VNA_IOC_INTR_POLL (VNA_IOC | 0x20) +#define VNA_IOC_SET_FEATURES (VNA_IOC | 0x21) +#define VNA_IOC_GET_FEATURES (VNA_IOC | 0x22) +#define VNA_IOC_SET_NOTIFY_IOP (VNA_IOC | 0x23) + +typedef struct vioc_create { + datalink_id_t c_linkid; + int c_vmfd; +} vioc_create_t; + +typedef struct vioc_ring_init { + uint16_t ri_index; + uint16_t ri_qsize; + uint64_t ri_qaddr; +} vioc_ring_init_t; + +typedef struct vioc_ring_msi { + uint16_t rm_index; + uint64_t rm_addr; + uint64_t rm_msg; +} vioc_ring_msi_t; + +enum viona_vq_id { + VIONA_VQ_RX = 0, + VIONA_VQ_TX = 1, + VIONA_VQ_MAX = 2 +}; + +typedef struct vioc_intr_poll { + uint32_t vip_status[VIONA_VQ_MAX]; +} vioc_intr_poll_t; + + +#endif /* _VIONA_IO_H_ */ diff --git a/usr/src/uts/i86pc/sys/vm_machparam.h b/usr/src/uts/i86pc/sys/vm_machparam.h index 90a5245217..fde81e59ed 100644 --- a/usr/src/uts/i86pc/sys/vm_machparam.h +++ b/usr/src/uts/i86pc/sys/vm_machparam.h @@ -23,6 +23,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. 
*/ #ifndef _SYS_VM_MACHPARAM_H @@ -133,7 +134,8 @@ extern "C" { /* * The maximum value for handspreadpages which is the the distance - * between the two clock hands in pages. + * between the two clock hands in pages. This is only used when the page + * scanner is first started. */ #define MAXHANDSPREADPAGES ((64 * 1024 * 1024) / PAGESIZE) diff --git a/usr/src/uts/i86pc/sys/vmm.h b/usr/src/uts/i86pc/sys/vmm.h new file mode 100644 index 0000000000..ac8f14b042 --- /dev/null +++ b/usr/src/uts/i86pc/sys/vmm.h @@ -0,0 +1,748 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _VMM_H_ +#define _VMM_H_ + +#include <sys/sdt.h> +#include <x86/segments.h> + +#ifdef _KERNEL +SDT_PROVIDER_DECLARE(vmm); +#endif + +enum vm_suspend_how { + VM_SUSPEND_NONE, + VM_SUSPEND_RESET, + VM_SUSPEND_POWEROFF, + VM_SUSPEND_HALT, + VM_SUSPEND_TRIPLEFAULT, + VM_SUSPEND_LAST +}; + +/* + * Identifiers for architecturally defined registers. + */ +enum vm_reg_name { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_RBP, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15, + VM_REG_GUEST_CR0, + VM_REG_GUEST_CR3, + VM_REG_GUEST_CR4, + VM_REG_GUEST_DR7, + VM_REG_GUEST_RSP, + VM_REG_GUEST_RIP, + VM_REG_GUEST_RFLAGS, + VM_REG_GUEST_ES, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS, + VM_REG_GUEST_LDTR, + VM_REG_GUEST_TR, + VM_REG_GUEST_IDTR, + VM_REG_GUEST_GDTR, + VM_REG_GUEST_EFER, + VM_REG_GUEST_CR2, + VM_REG_GUEST_PDPTE0, + VM_REG_GUEST_PDPTE1, + VM_REG_GUEST_PDPTE2, + VM_REG_GUEST_PDPTE3, + VM_REG_GUEST_INTR_SHADOW, + VM_REG_GUEST_DR0, + VM_REG_GUEST_DR1, + VM_REG_GUEST_DR2, + VM_REG_GUEST_DR3, + VM_REG_GUEST_DR6, + VM_REG_LAST +}; + +enum x2apic_state { + X2APIC_DISABLED, + X2APIC_ENABLED, + X2APIC_STATE_LAST +}; + +#define VM_INTINFO_VECTOR(info) ((info) & 0xff) +#define VM_INTINFO_DEL_ERRCODE 0x800 
+#define VM_INTINFO_RSVD 0x7ffff000 +#define VM_INTINFO_VALID 0x80000000 +#define VM_INTINFO_TYPE 0x700 +#define VM_INTINFO_HWINTR (0 << 8) +#define VM_INTINFO_NMI (2 << 8) +#define VM_INTINFO_HWEXCEPTION (3 << 8) +#define VM_INTINFO_SWINTR (4 << 8) + + +#define VM_MAX_NAMELEN 32 + +#ifdef _KERNEL + +struct vm; +struct vm_exception; +struct seg_desc; +struct vm_exit; +struct vm_run; +struct vhpet; +struct vioapic; +struct vlapic; +struct vmspace; +struct vm_object; +struct vm_guest_paging; +struct pmap; + +struct vm_eventinfo { + u_int *rptr; /* runblock cookie */ + int *sptr; /* suspend cookie */ + int *iptr; /* reqidle cookie */ +}; + +typedef int (*vmm_init_func_t)(int ipinum); +typedef int (*vmm_cleanup_func_t)(void); +typedef void (*vmm_resume_func_t)(void); +typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); +typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, + struct pmap *pmap, struct vm_eventinfo *info); +typedef void (*vmi_cleanup_func_t)(void *vmi); +typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, + uint64_t *retval); +typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, + uint64_t val); +typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num, + struct seg_desc *desc); +typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, + struct seg_desc *desc); +typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); +typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); +typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); +typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); +typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu); +typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic); +#ifndef __FreeBSD__ +typedef void (*vmi_savectx)(void *vmi, int vcpu); +typedef void (*vmi_restorectx)(void *vmi, int vcpu); +#endif + +struct vmm_ops { + vmm_init_func_t init; /* module wide initialization */ + 
vmm_cleanup_func_t cleanup; + vmm_resume_func_t resume; + + vmi_init_func_t vminit; /* vm-specific initialization */ + vmi_run_func_t vmrun; + vmi_cleanup_func_t vmcleanup; + vmi_get_register_t vmgetreg; + vmi_set_register_t vmsetreg; + vmi_get_desc_t vmgetdesc; + vmi_set_desc_t vmsetdesc; + vmi_get_cap_t vmgetcap; + vmi_set_cap_t vmsetcap; + vmi_vmspace_alloc vmspace_alloc; + vmi_vmspace_free vmspace_free; + vmi_vlapic_init vlapic_init; + vmi_vlapic_cleanup vlapic_cleanup; + +#ifndef __FreeBSD__ + vmi_savectx vmsavectx; + vmi_restorectx vmrestorectx; +#endif +}; + +extern struct vmm_ops vmm_ops_intel; +extern struct vmm_ops vmm_ops_amd; + +int vm_create(const char *name, struct vm **retvm); +void vm_destroy(struct vm *vm); +int vm_reinit(struct vm *vm); +const char *vm_name(struct vm *vm); +uint16_t vm_get_maxcpus(struct vm *vm); +void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, + uint16_t *threads, uint16_t *maxcpus); +int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, + uint16_t threads, uint16_t maxcpus); + +/* + * APIs that modify the guest memory map require all vcpus to be frozen. + */ +int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off, + size_t len, int prot, int flags); +int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem); +void vm_free_memseg(struct vm *vm, int ident); +int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); +#ifdef __FreeBSD__ +int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func); +int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func); +#else +int vm_assign_pptdev(struct vm *vm, int pptfd); +int vm_unassign_pptdev(struct vm *vm, int pptfd); +#endif /* __FreeBSD__ */ + +/* + * APIs that inspect the guest memory map require only a *single* vcpu to + * be frozen. 
This acts like a read lock on the guest memory map since any + * modification requires *all* vcpus to be frozen. + */ +int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); +int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, + struct vm_object **objptr); +vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm); +void *vm_gpa_hold(struct vm *, int vcpuid, vm_paddr_t gpa, size_t len, + int prot, void **cookie); +void vm_gpa_release(void *cookie); +bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa); + +int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); +int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); +int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *ret_desc); +int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc); +int vm_run(struct vm *vm, struct vm_run *vmrun); +int vm_suspend(struct vm *vm, enum vm_suspend_how how); +int vm_inject_nmi(struct vm *vm, int vcpu); +int vm_nmi_pending(struct vm *vm, int vcpuid); +void vm_nmi_clear(struct vm *vm, int vcpuid); +int vm_inject_extint(struct vm *vm, int vcpu); +int vm_extint_pending(struct vm *vm, int vcpuid); +void vm_extint_clear(struct vm *vm, int vcpuid); +struct vlapic *vm_lapic(struct vm *vm, int cpu); +struct vioapic *vm_ioapic(struct vm *vm); +struct vhpet *vm_hpet(struct vm *vm); +int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); +int vm_set_capability(struct vm *vm, int vcpu, int type, int val); +int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state); +int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state); +int vm_apicid2vcpuid(struct vm *vm, int apicid); +int vm_activate_cpu(struct vm *vm, int vcpu); +int vm_suspend_cpu(struct vm *vm, int vcpu); +int vm_resume_cpu(struct vm *vm, int vcpu); +struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); +void 
vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip); + +#ifdef _SYS__CPUSET_H_ +cpuset_t vm_active_cpus(struct vm *vm); +cpuset_t vm_debug_cpus(struct vm *vm); +cpuset_t vm_suspended_cpus(struct vm *vm); +#endif /* _SYS__CPUSET_H_ */ + +static __inline int +vcpu_runblocked(struct vm_eventinfo *info) +{ + + return (*info->rptr != 0); +} + +static __inline int +vcpu_suspended(struct vm_eventinfo *info) +{ + + return (*info->sptr); +} + +static __inline int +vcpu_reqidle(struct vm_eventinfo *info) +{ + + return (*info->iptr); +} + +int vcpu_debugged(struct vm *vm, int vcpuid); + +/* + * Return 1 if device indicated by bus/slot/func is supposed to be a + * pci passthrough device. + * + * Return 0 otherwise. + */ +int vmm_is_pptdev(int bus, int slot, int func); + +void *vm_iommu_domain(struct vm *vm); + +enum vcpu_state { + VCPU_IDLE, + VCPU_FROZEN, + VCPU_RUNNING, + VCPU_SLEEPING, +}; + +int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state, + bool from_idle); +enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu); +void vcpu_block_run(struct vm *, int); +void vcpu_unblock_run(struct vm *, int); + +#ifndef __FreeBSD__ +uint64_t vcpu_tsc_offset(struct vm *vm, int vcpuid); +#endif + +static __inline int +vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) +{ + return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING); +} + +#ifdef _SYS_THREAD_H +static __inline int +vcpu_should_yield(struct vm *vm, int vcpu) +{ + + if (curthread->t_astflag) + return (1); + else if (CPU->cpu_runrun) + return (1); + else + return (0); +} +#endif /* _SYS_THREAD_H */ + +void *vcpu_stats(struct vm *vm, int vcpu); +void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); +struct vmspace 
*vm_get_vmspace(struct vm *vm); +struct vatpic *vm_atpic(struct vm *vm); +struct vatpit *vm_atpit(struct vm *vm); +struct vpmtmr *vm_pmtmr(struct vm *vm); +struct vrtc *vm_rtc(struct vm *vm); + +/* + * Inject exception 'vector' into the guest vcpu. This function returns 0 on + * success and non-zero on failure. + * + * Wrapper functions like 'vm_inject_gp()' should be preferred to calling + * this function directly because they enforce the trap-like or fault-like + * behavior of an exception. + * + * This function should only be called in the context of the thread that is + * executing this vcpu. + */ +int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int err_valid, + uint32_t errcode, int restart_instruction); + +/* + * This function is called after a VM-exit that occurred during exception or + * interrupt delivery through the IDT. The format of 'intinfo' is described + * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2. + * + * If a VM-exit handler completes the event delivery successfully then it + * should call vm_exit_intinfo() to extinguish the pending event. For example, + * if the task switch emulation is triggered via a task gate then it should + * call this function with 'intinfo=0' to indicate that the external event + * is not pending anymore. + * + * Return value is 0 on success and non-zero on failure. + */ +int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo); + +/* + * This function is called before every VM-entry to retrieve a pending + * event that should be injected into the guest. This function combines + * nested events into a double or triple fault. + * + * Returns 0 if there are no events that need to be injected into the guest + * and non-zero otherwise. 
+ */ +int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info); + +int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2); + +enum vm_reg_name vm_segment_name(int seg_encoding); + +struct vm_copyinfo { + uint64_t gpa; + size_t len; + void *hva; + void *cookie; +}; + +/* + * Set up 'copyinfo[]' to copy to/from guest linear address space starting + * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for + * a copyin or PROT_WRITE for a copyout. + * + * retval is_fault Interpretation + * 0 0 Success + * 0 1 An exception was injected into the guest + * EFAULT N/A Unrecoverable error + * + * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if + * the return value is 0. The 'copyinfo[]' resources should be freed by calling + * 'vm_copy_teardown()' after the copy is done. + */ +int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, + int num_copyinfo, int *is_fault); +void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, + int num_copyinfo); +void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, + void *kaddr, size_t len); +void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, + struct vm_copyinfo *copyinfo, size_t len); + +int vcpu_trace_exceptions(struct vm *vm, int vcpuid); +#endif /* KERNEL */ + +#define VM_MAXCPU 32 /* maximum virtual cpus */ + +/* + * Identifiers for optional vmm capabilities + */ +enum vm_cap_type { + VM_CAP_HALT_EXIT, + VM_CAP_MTRAP_EXIT, + VM_CAP_PAUSE_EXIT, + VM_CAP_UNRESTRICTED_GUEST, + VM_CAP_ENABLE_INVPCID, + VM_CAP_MAX +}; + +enum vm_intr_trigger { + EDGE_TRIGGER, + LEVEL_TRIGGER +}; + +/* + * The 'access' field has the format specified in Table 21-2 of the Intel + * Architecture Manual vol 3b. + * + * XXX The contents of the 'access' field are architecturally defined except + * bit 16 - Segment Unusable. 
+ */ +struct seg_desc { + uint64_t base; + uint32_t limit; + uint32_t access; +}; +#define SEG_DESC_TYPE(access) ((access) & 0x001f) +#define SEG_DESC_DPL(access) (((access) >> 5) & 0x3) +#define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0) +#define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0) +#define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0) +#define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0) + +enum vm_cpu_mode { + CPU_MODE_REAL, + CPU_MODE_PROTECTED, + CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */ + CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */ +}; + +enum vm_paging_mode { + PAGING_MODE_FLAT, + PAGING_MODE_32, + PAGING_MODE_PAE, + PAGING_MODE_64, +}; + +struct vm_guest_paging { + uint64_t cr3; + int cpl; + enum vm_cpu_mode cpu_mode; + enum vm_paging_mode paging_mode; +}; + +/* + * The data structures 'vie' and 'vie_op' are meant to be opaque to the + * consumers of instruction decoding. The only reason why their contents + * need to be exposed is because they are part of the 'vm_exit' structure. + */ +struct vie_op { + uint8_t op_byte; /* actual opcode byte */ + uint8_t op_type; /* type of operation (e.g. 
MOV) */ + uint16_t op_flags; +}; + +#define VIE_INST_SIZE 15 +struct vie { + uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ + uint8_t num_valid; /* size of the instruction */ + uint8_t num_processed; + + uint8_t addrsize:4, opsize:4; /* address and operand sizes */ + uint8_t rex_w:1, /* REX prefix */ + rex_r:1, + rex_x:1, + rex_b:1, + rex_present:1, + repz_present:1, /* REP/REPE/REPZ prefix */ + repnz_present:1, /* REPNE/REPNZ prefix */ + opsize_override:1, /* Operand size override */ + addrsize_override:1, /* Address size override */ + segment_override:1; /* Segment override */ + + uint8_t mod:2, /* ModRM byte */ + reg:4, + rm:4; + + uint8_t ss:2, /* SIB byte */ + index:4, + base:4; + + uint8_t disp_bytes; + uint8_t imm_bytes; + + uint8_t scale; + int base_register; /* VM_REG_GUEST_xyz */ + int index_register; /* VM_REG_GUEST_xyz */ + int segment_register; /* VM_REG_GUEST_xyz */ + + int64_t displacement; /* optional addr displacement */ + int64_t immediate; /* optional immediate operand */ + + uint8_t decoded; /* set to 1 if successfully decoded */ + + struct vie_op op; /* opcode description */ +}; + +enum vm_exitcode { + VM_EXITCODE_INOUT, + VM_EXITCODE_VMX, + VM_EXITCODE_BOGUS, + VM_EXITCODE_RDMSR, + VM_EXITCODE_WRMSR, + VM_EXITCODE_HLT, + VM_EXITCODE_MTRAP, + VM_EXITCODE_PAUSE, + VM_EXITCODE_PAGING, + VM_EXITCODE_INST_EMUL, + VM_EXITCODE_SPINUP_AP, + VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */ + VM_EXITCODE_RUNBLOCK, + VM_EXITCODE_IOAPIC_EOI, + VM_EXITCODE_SUSPENDED, + VM_EXITCODE_INOUT_STR, + VM_EXITCODE_TASK_SWITCH, + VM_EXITCODE_MONITOR, + VM_EXITCODE_MWAIT, + VM_EXITCODE_SVM, + VM_EXITCODE_REQIDLE, + VM_EXITCODE_DEBUG, + VM_EXITCODE_VMINSN, +#ifndef __FreeBSD__ + VM_EXITCODE_HT, +#endif + VM_EXITCODE_MAX +}; + +struct vm_inout { + uint16_t bytes:3; /* 1 or 2 or 4 */ + uint16_t in:1; + uint16_t string:1; + uint16_t rep:1; + uint16_t port; + uint32_t eax; /* valid for out */ +}; + +struct vm_inout_str { + struct vm_inout inout; /* must be the 
first element */ + struct vm_guest_paging paging; + uint64_t rflags; + uint64_t cr0; + uint64_t index; + uint64_t count; /* rep=1 (%rcx), rep=0 (1) */ + int addrsize; + enum vm_reg_name seg_name; + struct seg_desc seg_desc; +}; + +enum task_switch_reason { + TSR_CALL, + TSR_IRET, + TSR_JMP, + TSR_IDT_GATE, /* task gate in IDT */ +}; + +struct vm_task_switch { + uint16_t tsssel; /* new TSS selector */ + int ext; /* task switch due to external event */ + uint32_t errcode; + int errcode_valid; /* push 'errcode' on the new stack */ + enum task_switch_reason reason; + struct vm_guest_paging paging; +}; + +struct vm_exit { + enum vm_exitcode exitcode; + int inst_length; /* 0 means unknown */ + uint64_t rip; + union { + struct vm_inout inout; + struct vm_inout_str inout_str; + struct { + uint64_t gpa; + int fault_type; + } paging; + struct { + uint64_t gpa; + uint64_t gla; + uint64_t cs_base; + int cs_d; /* CS.D */ + struct vm_guest_paging paging; + struct vie vie; + } inst_emul; + /* + * VMX specific payload. Used when there is no "better" + * exitcode to represent the VM-exit. + */ + struct { + int status; /* vmx inst status */ + /* + * 'exit_reason' and 'exit_qualification' are valid + * only if 'status' is zero. + */ + uint32_t exit_reason; + uint64_t exit_qualification; + /* + * 'inst_error' and 'inst_type' are valid + * only if 'status' is non-zero. + */ + int inst_type; + int inst_error; + } vmx; + /* + * SVM specific payload. 
+ */ + struct { + uint64_t exitcode; + uint64_t exitinfo1; + uint64_t exitinfo2; + } svm; + struct { + uint32_t code; /* ecx value */ + uint64_t wval; + } msr; + struct { + int vcpu; + uint64_t rip; + } spinup_ap; + struct { + uint64_t rflags; + uint64_t intr_status; + } hlt; + struct { + int vector; + } ioapic_eoi; + struct { + enum vm_suspend_how how; + } suspended; + struct vm_task_switch task_switch; + } u; +}; + +/* APIs to inject faults into the guest */ +void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid, + int errcode); + +static __inline void +vm_inject_ud(void *vm, int vcpuid) +{ + vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0); +} + +static __inline void +vm_inject_gp(void *vm, int vcpuid) +{ + vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0); +} + +static __inline void +vm_inject_ac(void *vm, int vcpuid, int errcode) +{ + vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode); +} + +static __inline void +vm_inject_ss(void *vm, int vcpuid, int errcode) +{ + vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode); +} + +void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2); + +int vm_restart_instruction(void *vm, int vcpuid); + +#ifndef __FreeBSD__ +#ifdef _KERNEL + +void vmm_sol_glue_init(void); +void vmm_sol_glue_cleanup(void); + +int vmm_mod_load(void); +int vmm_mod_unload(void); + +void vmm_call_trap(uint64_t); + +/* + * Because of tangled headers, these are mirrored by vmm_drv.h to present the + * interface to driver consumers. 
+ */ +typedef int (*vmm_rmem_cb_t)(void *, uintptr_t, uint_t, uint64_t *); +typedef int (*vmm_wmem_cb_t)(void *, uintptr_t, uint_t, uint64_t); + +int vm_ioport_hook(struct vm *, uint_t, vmm_rmem_cb_t, vmm_wmem_cb_t, void *, + void **); +void vm_ioport_unhook(struct vm *, void **); +int vm_ioport_handle_hook(struct vm *, int, bool, int, int, uint32_t *); + +#endif /* _KERNEL */ +#endif /* __FreeBSD__ */ + +#endif /* _VMM_H_ */ diff --git a/usr/src/uts/i86pc/sys/vmm_dev.h b/usr/src/uts/i86pc/sys/vmm_dev.h new file mode 100644 index 0000000000..dd87dcb0a6 --- /dev/null +++ b/usr/src/uts/i86pc/sys/vmm_dev.h @@ -0,0 +1,520 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _VMM_DEV_H_ +#define _VMM_DEV_H_ + +#include <machine/vmm.h> + +struct vm_memmap { + vm_paddr_t gpa; + int segid; /* memory segment */ + vm_ooffset_t segoff; /* offset into memory segment */ + size_t len; /* mmap length */ + int prot; /* RWX */ + int flags; +}; +#define VM_MEMMAP_F_WIRED 0x01 +#define VM_MEMMAP_F_IOMMU 0x02 + +#define VM_MEMSEG_NAME(m) ((m)->name[0] != '\0' ? 
(m)->name : NULL) +struct vm_memseg { + int segid; + size_t len; + char name[SPECNAMELEN + 1]; +}; + +struct vm_register { + int cpuid; + int regnum; /* enum vm_reg_name */ + uint64_t regval; +}; + +struct vm_seg_desc { /* data or code segment */ + int cpuid; + int regnum; /* enum vm_reg_name */ + struct seg_desc desc; +}; + +struct vm_register_set { + int cpuid; + unsigned int count; + const int *regnums; /* enum vm_reg_name */ + uint64_t *regvals; +}; + +struct vm_run { + int cpuid; + struct vm_exit vm_exit; +}; + +struct vm_exception { + int cpuid; + int vector; + uint32_t error_code; + int error_code_valid; + int restart_instruction; +}; + +struct vm_lapic_msi { + uint64_t msg; + uint64_t addr; +}; + +struct vm_lapic_irq { + int cpuid; + int vector; +}; + +struct vm_ioapic_irq { + int irq; +}; + +struct vm_isa_irq { + int atpic_irq; + int ioapic_irq; +}; + +struct vm_isa_irq_trigger { + int atpic_irq; + enum vm_intr_trigger trigger; +}; + +struct vm_capability { + int cpuid; + enum vm_cap_type captype; + int capval; + int allcpus; +}; + +#ifdef __FreeBSD__ +struct vm_pptdev { + int bus; + int slot; + int func; +}; + +struct vm_pptdev_mmio { + int bus; + int slot; + int func; + vm_paddr_t gpa; + vm_paddr_t hpa; + size_t len; +}; + +struct vm_pptdev_msi { + int vcpu; + int bus; + int slot; + int func; + int numvec; /* 0 means disabled */ + uint64_t msg; + uint64_t addr; +}; + +struct vm_pptdev_msix { + int vcpu; + int bus; + int slot; + int func; + int idx; + uint64_t msg; + uint32_t vector_control; + uint64_t addr; +}; + +struct vm_pptdev_limits { + int bus; + int slot; + int func; + int msi_limit; + int msix_limit; +}; +#else /* __FreeBSD__ */ +struct vm_pptdev { + int pptfd; +}; + +struct vm_pptdev_mmio { + int pptfd; + vm_paddr_t gpa; + vm_paddr_t hpa; + size_t len; +}; + +struct vm_pptdev_msi { + int vcpu; + int pptfd; + int numvec; /* 0 means disabled */ + uint64_t msg; + uint64_t addr; +}; + +struct vm_pptdev_msix { + int vcpu; + int pptfd; + int idx; + 
uint64_t msg; + uint32_t vector_control; + uint64_t addr; +}; + +struct vm_pptdev_limits { + int pptfd; + int msi_limit; + int msix_limit; +}; +#endif /* __FreeBSD__ */ + +struct vm_nmi { + int cpuid; +}; + +#ifdef __FreeBSD__ +#define MAX_VM_STATS 64 +#else +#define MAX_VM_STATS (64 + VM_MAXCPU) +#endif + +struct vm_stats { + int cpuid; /* in */ + int num_entries; /* out */ + struct timeval tv; + uint64_t statbuf[MAX_VM_STATS]; +}; + +struct vm_stat_desc { + int index; /* in */ + char desc[128]; /* out */ +}; + +struct vm_x2apic { + int cpuid; + enum x2apic_state state; +}; + +struct vm_gpa_pte { + uint64_t gpa; /* in */ + uint64_t pte[4]; /* out */ + int ptenum; +}; + +struct vm_hpet_cap { + uint32_t capabilities; /* lower 32 bits of HPET capabilities */ +}; + +struct vm_suspend { + enum vm_suspend_how how; +}; + +struct vm_gla2gpa { + int vcpuid; /* inputs */ + int prot; /* PROT_READ or PROT_WRITE */ + uint64_t gla; + struct vm_guest_paging paging; + int fault; /* outputs */ + uint64_t gpa; +}; + +struct vm_activate_cpu { + int vcpuid; +}; + +struct vm_cpuset { + int which; + int cpusetsize; +#ifndef _KERNEL + cpuset_t *cpus; +#else + void *cpus; +#endif +}; +#define VM_ACTIVE_CPUS 0 +#define VM_SUSPENDED_CPUS 1 +#define VM_DEBUG_CPUS 2 + +struct vm_intinfo { + int vcpuid; + uint64_t info1; + uint64_t info2; +}; + +struct vm_rtc_time { + time_t secs; +}; + +struct vm_rtc_data { + int offset; + uint8_t value; +}; + +#ifndef __FreeBSD__ +struct vm_devmem_offset { + int segid; + off_t offset; +}; +#endif + +struct vm_cpu_topology { + uint16_t sockets; + uint16_t cores; + uint16_t threads; + uint16_t maxcpus; +}; + +enum { + /* general routines */ + IOCNUM_ABIVERS = 0, + IOCNUM_RUN = 1, + IOCNUM_SET_CAPABILITY = 2, + IOCNUM_GET_CAPABILITY = 3, + IOCNUM_SUSPEND = 4, + IOCNUM_REINIT = 5, + + /* memory apis */ + IOCNUM_MAP_MEMORY = 10, /* deprecated */ + IOCNUM_GET_MEMORY_SEG = 11, /* deprecated */ + IOCNUM_GET_GPA_PMAP = 12, + IOCNUM_GLA2GPA = 13, + 
IOCNUM_ALLOC_MEMSEG = 14, + IOCNUM_GET_MEMSEG = 15, + IOCNUM_MMAP_MEMSEG = 16, + IOCNUM_MMAP_GETNEXT = 17, + IOCNUM_GLA2GPA_NOFAULT = 18, + + /* register/state accessors */ + IOCNUM_SET_REGISTER = 20, + IOCNUM_GET_REGISTER = 21, + IOCNUM_SET_SEGMENT_DESCRIPTOR = 22, + IOCNUM_GET_SEGMENT_DESCRIPTOR = 23, + IOCNUM_SET_REGISTER_SET = 24, + IOCNUM_GET_REGISTER_SET = 25, + + /* interrupt injection */ + IOCNUM_GET_INTINFO = 28, + IOCNUM_SET_INTINFO = 29, + IOCNUM_INJECT_EXCEPTION = 30, + IOCNUM_LAPIC_IRQ = 31, + IOCNUM_INJECT_NMI = 32, + IOCNUM_IOAPIC_ASSERT_IRQ = 33, + IOCNUM_IOAPIC_DEASSERT_IRQ = 34, + IOCNUM_IOAPIC_PULSE_IRQ = 35, + IOCNUM_LAPIC_MSI = 36, + IOCNUM_LAPIC_LOCAL_IRQ = 37, + IOCNUM_IOAPIC_PINCOUNT = 38, + IOCNUM_RESTART_INSTRUCTION = 39, + + /* PCI pass-thru */ + IOCNUM_BIND_PPTDEV = 40, + IOCNUM_UNBIND_PPTDEV = 41, + IOCNUM_MAP_PPTDEV_MMIO = 42, + IOCNUM_PPTDEV_MSI = 43, + IOCNUM_PPTDEV_MSIX = 44, + IOCNUM_GET_PPTDEV_LIMITS = 45, + + /* statistics */ + IOCNUM_VM_STATS = 50, + IOCNUM_VM_STAT_DESC = 51, + + /* kernel device state */ + IOCNUM_SET_X2APIC_STATE = 60, + IOCNUM_GET_X2APIC_STATE = 61, + IOCNUM_GET_HPET_CAPABILITIES = 62, + + /* CPU Topology */ + IOCNUM_SET_TOPOLOGY = 63, + IOCNUM_GET_TOPOLOGY = 64, + + /* legacy interrupt injection */ + IOCNUM_ISA_ASSERT_IRQ = 80, + IOCNUM_ISA_DEASSERT_IRQ = 81, + IOCNUM_ISA_PULSE_IRQ = 82, + IOCNUM_ISA_SET_IRQ_TRIGGER = 83, + + /* vm_cpuset */ + IOCNUM_ACTIVATE_CPU = 90, + IOCNUM_GET_CPUSET = 91, + IOCNUM_SUSPEND_CPU = 92, + IOCNUM_RESUME_CPU = 93, + + /* RTC */ + IOCNUM_RTC_READ = 100, + IOCNUM_RTC_WRITE = 101, + IOCNUM_RTC_SETTIME = 102, + IOCNUM_RTC_GETTIME = 103, + +#ifndef __FreeBSD__ + /* illumos-custom ioctls */ + IOCNUM_DEVMEM_GETOFFSET = 256, + IOCNUM_WRLOCK_CYCLE = 257, +#endif +}; + +#define VM_RUN \ + _IOWR('v', IOCNUM_RUN, struct vm_run) +#define VM_SUSPEND \ + _IOW('v', IOCNUM_SUSPEND, struct vm_suspend) +#define VM_REINIT \ + _IO('v', IOCNUM_REINIT) +#define VM_ALLOC_MEMSEG \ + _IOW('v', 
IOCNUM_ALLOC_MEMSEG, struct vm_memseg) +#define VM_GET_MEMSEG \ + _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg) +#define VM_MMAP_MEMSEG \ + _IOW('v', IOCNUM_MMAP_MEMSEG, struct vm_memmap) +#define VM_MMAP_GETNEXT \ + _IOWR('v', IOCNUM_MMAP_GETNEXT, struct vm_memmap) +#define VM_SET_REGISTER \ + _IOW('v', IOCNUM_SET_REGISTER, struct vm_register) +#define VM_GET_REGISTER \ + _IOWR('v', IOCNUM_GET_REGISTER, struct vm_register) +#define VM_SET_SEGMENT_DESCRIPTOR \ + _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) +#define VM_GET_SEGMENT_DESCRIPTOR \ + _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) +#define VM_SET_REGISTER_SET \ + _IOW('v', IOCNUM_SET_REGISTER_SET, struct vm_register_set) +#define VM_GET_REGISTER_SET \ + _IOWR('v', IOCNUM_GET_REGISTER_SET, struct vm_register_set) +#define VM_INJECT_EXCEPTION \ + _IOW('v', IOCNUM_INJECT_EXCEPTION, struct vm_exception) +#define VM_LAPIC_IRQ \ + _IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq) +#define VM_LAPIC_LOCAL_IRQ \ + _IOW('v', IOCNUM_LAPIC_LOCAL_IRQ, struct vm_lapic_irq) +#define VM_LAPIC_MSI \ + _IOW('v', IOCNUM_LAPIC_MSI, struct vm_lapic_msi) +#define VM_IOAPIC_ASSERT_IRQ \ + _IOW('v', IOCNUM_IOAPIC_ASSERT_IRQ, struct vm_ioapic_irq) +#define VM_IOAPIC_DEASSERT_IRQ \ + _IOW('v', IOCNUM_IOAPIC_DEASSERT_IRQ, struct vm_ioapic_irq) +#define VM_IOAPIC_PULSE_IRQ \ + _IOW('v', IOCNUM_IOAPIC_PULSE_IRQ, struct vm_ioapic_irq) +#define VM_IOAPIC_PINCOUNT \ + _IOR('v', IOCNUM_IOAPIC_PINCOUNT, int) +#define VM_ISA_ASSERT_IRQ \ + _IOW('v', IOCNUM_ISA_ASSERT_IRQ, struct vm_isa_irq) +#define VM_ISA_DEASSERT_IRQ \ + _IOW('v', IOCNUM_ISA_DEASSERT_IRQ, struct vm_isa_irq) +#define VM_ISA_PULSE_IRQ \ + _IOW('v', IOCNUM_ISA_PULSE_IRQ, struct vm_isa_irq) +#define VM_ISA_SET_IRQ_TRIGGER \ + _IOW('v', IOCNUM_ISA_SET_IRQ_TRIGGER, struct vm_isa_irq_trigger) +#define VM_SET_CAPABILITY \ + _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability) +#define VM_GET_CAPABILITY \ + _IOWR('v', IOCNUM_GET_CAPABILITY, 
struct vm_capability) +#define VM_BIND_PPTDEV \ + _IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev) +#define VM_UNBIND_PPTDEV \ + _IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev) +#define VM_MAP_PPTDEV_MMIO \ + _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio) +#define VM_PPTDEV_MSI \ + _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi) +#define VM_PPTDEV_MSIX \ + _IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix) +#define VM_GET_PPTDEV_LIMITS \ + _IOR('v', IOCNUM_GET_PPTDEV_LIMITS, struct vm_pptdev_limits) +#define VM_INJECT_NMI \ + _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi) +#define VM_STATS_IOC \ + _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) +#define VM_STAT_DESC \ + _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc) +#define VM_SET_X2APIC_STATE \ + _IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic) +#define VM_GET_X2APIC_STATE \ + _IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic) +#define VM_GET_HPET_CAPABILITIES \ + _IOR('v', IOCNUM_GET_HPET_CAPABILITIES, struct vm_hpet_cap) +#define VM_SET_TOPOLOGY \ + _IOW('v', IOCNUM_SET_TOPOLOGY, struct vm_cpu_topology) +#define VM_GET_TOPOLOGY \ + _IOR('v', IOCNUM_GET_TOPOLOGY, struct vm_cpu_topology) +#define VM_GET_GPA_PMAP \ + _IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte) +#define VM_GLA2GPA \ + _IOWR('v', IOCNUM_GLA2GPA, struct vm_gla2gpa) +#define VM_GLA2GPA_NOFAULT \ + _IOWR('v', IOCNUM_GLA2GPA_NOFAULT, struct vm_gla2gpa) +#define VM_ACTIVATE_CPU \ + _IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu) +#define VM_GET_CPUS \ + _IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset) +#define VM_SUSPEND_CPU \ + _IOW('v', IOCNUM_SUSPEND_CPU, struct vm_activate_cpu) +#define VM_RESUME_CPU \ + _IOW('v', IOCNUM_RESUME_CPU, struct vm_activate_cpu) +#define VM_SET_INTINFO \ + _IOW('v', IOCNUM_SET_INTINFO, struct vm_intinfo) +#define VM_GET_INTINFO \ + _IOWR('v', IOCNUM_GET_INTINFO, struct vm_intinfo) +#define VM_RTC_WRITE \ + _IOW('v', IOCNUM_RTC_WRITE, struct vm_rtc_data) +#define VM_RTC_READ \ + _IOWR('v', 
IOCNUM_RTC_READ, struct vm_rtc_data) +#define VM_RTC_SETTIME \ + _IOW('v', IOCNUM_RTC_SETTIME, struct vm_rtc_time) +#define VM_RTC_GETTIME \ + _IOR('v', IOCNUM_RTC_GETTIME, struct vm_rtc_time) +#define VM_RESTART_INSTRUCTION \ + _IOW('v', IOCNUM_RESTART_INSTRUCTION, int) + +#ifndef __FreeBSD__ +#define VM_DEVMEM_GETOFFSET \ + _IOW('v', IOCNUM_DEVMEM_GETOFFSET, struct vm_devmem_offset) +#define VM_WRLOCK_CYCLE _IO('v', IOCNUM_WRLOCK_CYCLE) + +/* ioctls used against ctl device for vm create/destroy */ +#define VMM_IOC_BASE (('V' << 16) | ('M' << 8)) +#define VMM_CREATE_VM (VMM_IOC_BASE | 0x01) +#define VMM_DESTROY_VM (VMM_IOC_BASE | 0x02) +#define VMM_VM_SUPPORTED (VMM_IOC_BASE | 0x03) + +#define VMM_CTL_DEV "/dev/vmmctl" + +#endif + +#endif diff --git a/usr/src/uts/i86pc/sys/vmm_drv.h b/usr/src/uts/i86pc/sys/vmm_drv.h new file mode 100644 index 0000000000..856b75e5cc --- /dev/null +++ b/usr/src/uts/i86pc/sys/vmm_drv.h @@ -0,0 +1,53 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _VMM_DRV_H_ +#define _VMM_DRV_H_ + +#ifdef _KERNEL + +#include <sys/file.h> + +struct vmm_hold; +typedef struct vmm_hold vmm_hold_t; + +struct vmm_lease; +typedef struct vmm_lease vmm_lease_t; + +/* + * Because of tangled headers, these definitions mirror their vmm_[rw]mem_cb_t + * counterparts in vmm.h. 
+ */ +typedef int (*vmm_drv_rmem_cb_t)(void *, uintptr_t, uint_t, uint64_t *); +typedef int (*vmm_drv_wmem_cb_t)(void *, uintptr_t, uint_t, uint64_t); + +extern int vmm_drv_hold(file_t *, cred_t *, vmm_hold_t **); +extern void vmm_drv_rele(vmm_hold_t *); +extern boolean_t vmm_drv_release_reqd(vmm_hold_t *); + +extern vmm_lease_t *vmm_drv_lease_sign(vmm_hold_t *, boolean_t (*)(void *), + void *); +extern void vmm_drv_lease_break(vmm_hold_t *, vmm_lease_t *); +extern boolean_t vmm_drv_lease_expired(vmm_lease_t *); + +extern void *vmm_drv_gpa2kva(vmm_lease_t *, uintptr_t, size_t); +extern int vmm_drv_msi(vmm_lease_t *, uint64_t, uint64_t); + +extern int vmm_drv_ioport_hook(vmm_hold_t *, uint_t, vmm_drv_rmem_cb_t, + vmm_drv_wmem_cb_t, void *, void **); +extern void vmm_drv_ioport_unhook(vmm_hold_t *, void **); +#endif /* _KERNEL */ + +#endif /* _VMM_DRV_H_ */ diff --git a/usr/src/uts/i86pc/sys/vmm_impl.h b/usr/src/uts/i86pc/sys/vmm_impl.h new file mode 100644 index 0000000000..cdc56cc464 --- /dev/null +++ b/usr/src/uts/i86pc/sys/vmm_impl.h @@ -0,0 +1,89 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _VMM_IMPL_H_ +#define _VMM_IMPL_H_ + +#include <sys/mutex.h> +#include <sys/queue.h> +#include <sys/varargs.h> +#include <sys/zone.h> + +#ifdef _KERNEL + +#define VMM_CTL_MINOR 0 + +/* + * Rather than creating whole character devices for devmem mappings, they are + * available by mmap(2)ing the vmm handle at a specific offset. These offsets + * begin just above the maximum allowed guest physical address. 
+ */ +#include <vm/vm_param.h> +#define VM_DEVMEM_START (VM_MAXUSER_ADDRESS + 1) + +struct vmm_devmem_entry { + list_node_t vde_node; + int vde_segid; + char vde_name[SPECNAMELEN + 1]; + size_t vde_len; + off_t vde_off; +}; +typedef struct vmm_devmem_entry vmm_devmem_entry_t; + +typedef struct vmm_zsd vmm_zsd_t; + +enum vmm_softc_state { + VMM_HELD = 1, /* external driver(s) possess hold on the VM */ + VMM_CLEANUP = 2, /* request that holds are released */ + VMM_PURGED = 4, /* all holds have been released */ + VMM_BLOCK_HOOK = 8, /* mem hook install temporarily blocked */ + VMM_DESTROY = 16 /* VM is destroyed, softc still around */ +}; + +struct vmm_softc { + list_node_t vmm_node; + struct vm *vmm_vm; + minor_t vmm_minor; + char vmm_name[VM_MAX_NAMELEN]; + list_t vmm_devmem_list; + + kcondvar_t vmm_cv; + list_t vmm_holds; + uint_t vmm_flags; + boolean_t vmm_is_open; + + kmutex_t vmm_lease_lock; + list_t vmm_lease_list; + uint_t vmm_lease_blocker; + kcondvar_t vmm_lease_cv; + krwlock_t vmm_rwlock; + + /* For zone specific data */ + list_node_t vmm_zsd_linkage; + zone_t *vmm_zone; + vmm_zsd_t *vmm_zsd; +}; +typedef struct vmm_softc vmm_softc_t; + +void vmm_zsd_init(void); +void vmm_zsd_fini(void); +int vmm_zsd_add_vm(vmm_softc_t *sc); +void vmm_zsd_rem_vm(vmm_softc_t *sc); +int vmm_do_vm_destroy(vmm_softc_t *, boolean_t); + +#endif /* _KERNEL */ + +#endif /* _VMM_IMPL_H_ */ diff --git a/usr/src/uts/i86pc/sys/vmm_instruction_emul.h b/usr/src/uts/i86pc/sys/vmm_instruction_emul.h new file mode 100644 index 0000000000..f10f407164 --- /dev/null +++ b/usr/src/uts/i86pc/sys/vmm_instruction_emul.h @@ -0,0 +1,137 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + */ + +#ifndef _VMM_INSTRUCTION_EMUL_H_ +#define _VMM_INSTRUCTION_EMUL_H_ + +#include <sys/mman.h> + +/* + * Callback functions to read and write memory regions. 
+ */ +typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa, + uint64_t *rval, int rsize, void *arg); + +typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa, + uint64_t wval, int wsize, void *arg); + +/* + * Emulate the decoded 'vie' instruction. + * + * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region + * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the + * callback functions. + * + * 'void *vm' should be 'struct vm *' when called from kernel context and + * 'struct vmctx *' when called from user context. + * + */ +int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t mrr, + mem_region_write_t mrw, void *mrarg); + +int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, + uint64_t val, int size); + +/* + * Returns 1 if an alignment check exception should be injected and 0 otherwise. + */ +int vie_alignment_check(int cpl, int operand_size, uint64_t cr0, + uint64_t rflags, uint64_t gla); + +/* Returns 1 if the 'gla' is not canonical and 0 otherwise. */ +int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla); + +uint64_t vie_size2mask(int size); + +int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, + struct seg_desc *desc, uint64_t off, int length, int addrsize, int prot, + uint64_t *gla); + +#ifdef _KERNEL +/* + * APIs to fetch and decode the instruction from nested page fault handler. + * + * 'vie' must be initialized before calling 'vmm_fetch_instruction()' + */ +int vmm_fetch_instruction(struct vm *vm, int cpuid, + struct vm_guest_paging *guest_paging, + uint64_t rip, int inst_length, struct vie *vie, + int *is_fault); + +/* + * Translate the guest linear address 'gla' to a guest physical address. 
+ * + * retval is_fault Interpretation + * 0 0 'gpa' contains result of the translation + * 0 1 An exception was injected into the guest + * EFAULT N/A An unrecoverable hypervisor error occurred + */ +int vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *is_fault); + +/* + * Like vm_gla2gpa, but no exceptions are injected into the guest and + * PTEs are not changed. + */ +int vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *is_fault); + +void vie_init(struct vie *vie, const char *inst_bytes, int inst_length); + +/* + * Decode the instruction fetched into 'vie' so it can be emulated. + * + * 'gla' is the guest linear address provided by the hardware assist + * that caused the nested page table fault. It is used to verify that + * the software instruction decoding is in agreement with the hardware. + * + * Some hardware assists do not provide the 'gla' to the hypervisor. + * To skip the 'gla' verification for this or any other reason pass + * in VIE_INVALID_GLA instead. + */ +#define VIE_INVALID_GLA (1UL << 63) /* a non-canonical address */ +int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, + enum vm_cpu_mode cpu_mode, int csd, struct vie *vie); +#endif /* _KERNEL */ + +#endif /* _VMM_INSTRUCTION_EMUL_H_ */ |
