Diffstat (limited to 'usr/src/uts/i86pc')
151 files changed, 50015 insertions, 353 deletions
diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files index ad85b5ba25..a0509bf21d 100644 --- a/usr/src/uts/i86pc/Makefile.files +++ b/usr/src/uts/i86pc/Makefile.files @@ -23,7 +23,7 @@ # Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. # # Copyright (c) 2010, Intel Corporation. -# Copyright 2018 Joyent, Inc. +# Copyright 2019 Joyent, Inc. # Copyright 2019 OmniOS Community Edition (OmniOSce) Association. # # This Makefile defines file modules in the directory uts/i86pc @@ -65,6 +65,8 @@ CORE_OBJS += \ hardclk.o \ hat_i86.o \ hat_kdi.o \ + hma.o \ + hma_asm.o \ hma_fpu.o \ hment.o \ hold_page.o \ @@ -113,6 +115,7 @@ CORE_OBJS += \ pmem.o \ ppage.o \ pwrnow.o \ + seg_vmm.o \ smt.o \ speedstep.o \ ssp.o \ @@ -242,6 +245,53 @@ UPPC_OBJS += uppc.o psm_common.o XSVC_OBJS += xsvc.o AMD_IOMMU_OBJS += amd_iommu.o amd_iommu_impl.o amd_iommu_acpi.o \ amd_iommu_cmd.o amd_iommu_log.o amd_iommu_page_tables.o +VMM_OBJS += vmm.o \ + vmm_sol_dev.o \ + vmm_host.o \ + vmm_instruction_emul.o \ + vmm_ioport.o \ + vmm_lapic.o \ + vmm_mem.o \ + vmm_stat.o \ + vmm_util.o \ + x86.o \ + iommu.o \ + vdev.o \ + vatpic.o \ + vatpit.o \ + vhpet.o \ + vioapic.o \ + vlapic.o \ + vrtc.o \ + vpmtmr.o \ + ept.o \ + vmcs.o \ + vmx_msr.o \ + vmx.o \ + vmx_support.o \ + vtd.o \ + vtd_sol.o \ + svm.o \ + svm_msr.o \ + npt.o \ + vmcb.o \ + svm_support.o \ + amdv.o \ + gipt.o \ + vmm_sol_vm.o \ + vmm_sol_glue.o \ + vmm_sol_ept.o \ + vmm_sol_rvi.o \ + vmm_support.o \ + vmm_zsd.o + +VIONA_OBJS += viona_main.o \ + viona_ring.o \ + viona_rx.o \ + viona_tx.o \ + viona_hook.o \ + +PPT_OBJS += ppt.o # # Build up defines and paths. @@ -278,3 +328,10 @@ ASSYM_DEPS += \ CPR_IMPL_OBJS = cpr_impl.o cpr_wakecode.o $(KDI_ASSYM_DEPS:%=$(OBJS_DIR)/%): $(DSF_DIR)/$(OBJS_DIR)/kdi_assym.h + +# +# Intel Integrated Memory Controller +# (Sandy Bridge - Cascade Lake) +# +IMC_OBJS = imc.o imc_decode.o imc_dump.o +IMCSTUB_OBJS = imcstub.o diff --git a/usr/src/uts/i86pc/Makefile.i86pc b/usr/src/uts/i86pc/Makefile.i86pc index 2f18cb206d..47ca5bf8e9 100644 --- a/usr/src/uts/i86pc/Makefile.i86pc +++ b/usr/src/uts/i86pc/Makefile.i86pc @@ -24,6 +24,7 @@ # # Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. # Copyright (c) 2013 Andrew Stormont. All rights reserved. +# Copyright 2019 Joyent, Inc. # Copyright 2019 OmniOS Community Edition (OmniOSce) Association. # # @@ -138,11 +139,11 @@ AS_INC_PATH += -I$(DSF_DIR)/$(OBJS_DIR) # # The following must be defined for all implementations: # -# MAPFILE: ld mapfile for the build of kernel/unix. +# UNIX_MAPFILE: ld mapfile for the build of kernel/unix. # MODSTUBS: Module stubs source file. # GENASSYM_SRC: genassym.c # -MAPFILE = $(UTSBASE)/$(PLATFORM)/conf/Mapfile +UNIX_MAPFILE = $(UTSBASE)/$(PLATFORM)/conf/Mapfile MODSTUBS = $(UTSBASE)/intel/ia32/ml/modstubs.s GENASSYM_SRC = $(UTSBASE)/$(PLATFORM)/ml/genassym.c OFFSETS_SRC = $(UTSBASE)/$(PLATFORM)/ml/offsets.in @@ -256,6 +257,10 @@ DRV_KMODS += amd_iommu DRV_KMODS += dr DRV_KMODS += ioat DRV_KMODS += fipe +DRV_KMODS += vmm +DRV_KMODS += viona +DRV_KMODS += ppt +DRV_KMODS += imc imcstub DRV_KMODS += cpudrv diff --git a/usr/src/uts/i86pc/Makefile.rules b/usr/src/uts/i86pc/Makefile.rules index 8439b5c898..3d3c8131c1 100644 --- a/usr/src/uts/i86pc/Makefile.rules +++ b/usr/src/uts/i86pc/Makefile.rules @@ -22,6 +22,7 @@ # # Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. # Copyright 2015 Igor Kozhukhov <ikozhukhov@gmail.com> +# Copyright 2019 Joyent, Inc. 
# Copyright 2019 OmniOS Community Edition (OmniOSce) Association. # @@ -120,6 +121,14 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/ioat/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/imc/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(SRC)/common/mc/imc/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/pci/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -216,6 +225,35 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/dboot/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/amd/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/intel/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/io/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/%.s + $(COMPILE.s) -o $@ $< + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/intel/%.s + $(COMPILE.s) -o $@ $< + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/amd/%.s + $(COMPILE.s) -o $@ $< + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/viona/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + # # dboot stuff is always 32 bit, linked to run with phys_addr == virt_addr # @@ -316,7 +354,8 @@ $(OBJS_DIR)/%.o: $(SRC)/common/atomic/$(ATOMIC_SUBDIR)/%.s # $(OBJS_DIR)/dtracestubs.s: $(UNIX_O) $(LIBS) - $(NM) -u $(UNIX_O) $(LIBS) | $(GREP) __dtrace_probe_ | $(SORT) | \ + $(NM) -u $(UNIX_O) $(LIBS) | \ + $(EGREP) '(__dtrace_probe_|smap_(disable|enable))' | $(SORT) | \ $(UNIQ) | $(AWK) '{ \ printf("\t.globl %s\n\t.type %s,@function\n%s:\n", \ $$1, $$1, $$1); }' > $(OBJS_DIR)/dtracestubs.s diff --git a/usr/src/uts/i86pc/dboot/dboot_startkern.c b/usr/src/uts/i86pc/dboot/dboot_startkern.c index 87261b4df8..12b97be5e5 100644 --- a/usr/src/uts/i86pc/dboot/dboot_startkern.c +++ b/usr/src/uts/i86pc/dboot/dboot_startkern.c @@ -23,7 +23,7 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * - * Copyright 2013 Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. */ @@ -1463,6 +1463,80 @@ dboot_process_modules(void) check_images(); } +#define CORRUPT_REGION_START 0xc700000 +#define CORRUPT_REGION_SIZE 0x100000 +#define CORRUPT_REGION_END (CORRUPT_REGION_START + CORRUPT_REGION_SIZE) + +static void +dboot_add_memlist(uint64_t start, uint64_t end) +{ + if (end > max_mem) + max_mem = end; + + /* + * Well, this is sad. On some systems, there is a region of memory that + * can be corrupted until some number of seconds after we have booted. + * And the BIOS doesn't tell us that this memory is unsafe to use. And + * we don't know how long it's dangerous. So we'll chop out this range + * from any memory list that would otherwise be usable. Note that any + * system of this type will give us the new-style (0x40) memlist, so we + * need not fix up the other path below. + * + * However, if we're boot-loaded from something that doesn't have a + * RICHMOND-16 workaround (which on many systems is just fine), it could + * actually use this region for the boot modules; if we remove it from + * the memlist, we'll keel over when trying to access the region. + * + * So, if we see that a module intersects the region, we presume it's + * OK. 
+ */ + + if (find_boot_prop("disable-RICHMOND-16") != NULL) + goto out; + + for (uint32_t i = 0; i < bi->bi_module_cnt; i++) { + native_ptr_t mod_start = modules[i].bm_addr; + native_ptr_t mod_end = modules[i].bm_addr + modules[i].bm_size; + + if (mod_start < CORRUPT_REGION_END && + mod_end >= CORRUPT_REGION_START) { + if (prom_debug) { + dboot_printf("disabling RICHMOND-16 workaround " + "due to module #%d: " + "name %s addr %lx size %lx\n", + i, (char *)(uintptr_t)modules[i].bm_name, + (ulong_t)modules[i].bm_addr, + (ulong_t)modules[i].bm_size); + } + goto out; + } + } + + if (start < CORRUPT_REGION_START && end > CORRUPT_REGION_START) { + memlists[memlists_used].addr = start; + memlists[memlists_used].size = + CORRUPT_REGION_START - start; + ++memlists_used; + if (end > CORRUPT_REGION_END) + start = CORRUPT_REGION_END; + else + return; + } + + if (start >= CORRUPT_REGION_START && start < CORRUPT_REGION_END) { + if (end <= CORRUPT_REGION_END) + return; + start = CORRUPT_REGION_END; + } + +out: + memlists[memlists_used].addr = start; + memlists[memlists_used].size = end - start; + ++memlists_used; + if (memlists_used > MAX_MEMLIST) + dboot_panic("too many memlists"); +} + /* * We then build the phys_install memlist from the multiboot information. */ @@ -1506,13 +1580,7 @@ dboot_process_mmap(void) */ switch (type) { case 1: - if (end > max_mem) - max_mem = end; - memlists[memlists_used].addr = start; - memlists[memlists_used].size = end - start; - ++memlists_used; - if (memlists_used > MAX_MEMLIST) - dboot_panic("too many memlists"); + dboot_add_memlist(start, end); break; case 2: rsvdmemlists[rsvdmemlists_used].addr = start; @@ -2306,11 +2374,7 @@ startup_kernel(void) * Need correct target_kernel_text value */ #if defined(_BOOT_TARGET_amd64) - target_kernel_text = KERNEL_TEXT_amd64; -#elif defined(__xpv) - target_kernel_text = KERNEL_TEXT_i386_xpv; -#else - target_kernel_text = KERNEL_TEXT_i386; + target_kernel_text = KERNEL_TEXT; #endif DBG(target_kernel_text); diff --git a/usr/src/uts/i86pc/imc/Makefile b/usr/src/uts/i86pc/imc/Makefile new file mode 100644 index 0000000000..f649faa4d4 --- /dev/null +++ b/usr/src/uts/i86pc/imc/Makefile @@ -0,0 +1,51 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +UTSBASE = ../.. 
+ +MODULE = imc +OBJECTS = $(IMC_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(IMC_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_PSM_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/i86pc/io/imc + +include $(UTSBASE)/i86pc/Makefile.i86pc + +ALL_TARGET = $(BINARY) $(CONFMOD) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +CPPFLAGS += -I$(CONF_SRCDIR) +LDFLAGS += -dy + +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +include $(UTSBASE)/i86pc/Makefile.targ diff --git a/usr/src/uts/i86pc/imcstub/Makefile b/usr/src/uts/i86pc/imcstub/Makefile new file mode 100644 index 0000000000..afec0a8127 --- /dev/null +++ b/usr/src/uts/i86pc/imcstub/Makefile @@ -0,0 +1,49 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +UTSBASE = ../.. + +MODULE = imcstub +OBJECTS = $(IMCSTUB_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(IMCSTUB_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_PSM_DRV_DIR)/$(MODULE) + +include $(UTSBASE)/i86pc/Makefile.i86pc + +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +LDFLAGS += -dy -Ndrv/imc + +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +include $(UTSBASE)/i86pc/Makefile.targ diff --git a/usr/src/uts/i86pc/io/imc/imc.c b/usr/src/uts/i86pc/io/imc/imc.c new file mode 100644 index 0000000000..25ba86061b --- /dev/null +++ b/usr/src/uts/i86pc/io/imc/imc.c @@ -0,0 +1,2972 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * Generic Intel Integrated Memory Controller (IMC) Driver + * + * This driver talks to the CPU's IMC to understand the detailed topology of the + * processor and to determine how to map between physical addresses to the + * corresponding DIMM. This driver supports the following generations of Intel + * chips: + * + * - Sandy Bridge + * - Ivy Bridge + * - Haswell + * - Broadwell + * - Skylake / Cascade Lake + * + * Memory Decoding + * --------------- + * + * For more detailed summaries of the memory decoding process, please refer to + * the Intel External Design Specifications for the corresponding processor. + * What follows is a rough overview of how the memory decoding system works. + * + * First, we'd like to define the following concepts: + * + * SYSTEM ADDRESS + * + * This is a physical address that the operating system normally uses. 
This + * address may refer to DRAM, it may refer to memory mapped PCI + * configuration space or device registers, or it may refer to other parts + * of the system's memory map, such as the extended advanced programmable + * interrupt controller (xAPIC), etc. + * + * DIMM + * + * Dual-inline memory module. This refers to a physical stick of volatile + * memory that is inserted into a slot on the motherboard. + * + * RANK + * + * A potential sub-division of a DIMM. A DIMM's memory capacity is divided + * into a number of equal sized ranks. For example, an 8 GiB DIMM may have + * 1 8 GiB rank, 2 4 GiB ranks, or 4 2 GiB ranks. + * + * RANK ADDRESS + * + * An address that exists in the context of a given rank on a DIMM. All + * ranks have overlapping addresses, so the address 0x400 exists on all + * ranks on a given DIMM. + * + * CHANNEL + * + * Multiple DIMMs may be combined into a single channel. The channel + * represents the combined memory of all the DIMMs. A given channel only + * ever exists on a socket and is bound to a single memory controller. + * + * CHANNEL ADDRESS + * + * This is an address that exists logically on a channel. Each address on a + * channel maps to a corresponding DIMM that exists on that channel. The + * address space on one channel is independent from that on another. This + * means that address 0x1000 can exist on each memory channel in the + * system. + * + * INTERLEAVE + * + * There are several different cases where interleaving occurs on the + * system. For example, addresses may be interleaved across sockets, + * memory channels, or DIMM ranks. When addresses are interleaved, then + * some number of bits in an address are used to select which target to go + * to (usually through a look up table). The effect of interleaving is that + * addresses that are next to one another may not all go to the same + * device. The following image shows a non-interleaving case. + * + * 0x0fff +-----+ +-----+ 0x7ff + * | |\___________/| | + * | | __________ | (b) | + * | | / \| | + * 0x0800 |=====|= +-----+ 0x000 +-----+ 0x7ff + * | | \______________________________/| | + * | | _______________________________ | (a) | + * | |/ \| | + * 0x0000 +-----+ +-----+ 0x000 + * + * In this example of non-interleaving, addresses 0x0000 to 0x07ff go to + * device (a). Meanwhile, addresses 0x0800 to 0x0fff go to device (b). + * However, each range is divided into the same number of components. + * + * If instead we were to look at that with interleaving, then rather than + * splitting the range in half, we might say that if + * the address has bit 8 set (0x100), then it goes to (b); otherwise it + * goes to (a). This means that addresses 0x000 to 0x0ff would go to (a). + * 0x100 to 0x1ff would go to (b). 0x200 to 0x2ff would go back to (a) + * again, and then 0x300 to 0x3ff would go back to (b). This would continue + * for a while. 
This would instead look something more like: + * + * + * 0x0fff +-----+ A: 0x7ff +---------+ B: 0x7ff +---------+ + * | (b) | | e00-eff | | f00-fff | + * 0x0f00 |-----| 0x700 +---------+ 0x700 +---------+ + * | (a) | | c00-cff | | d00-dff | + * 0x0e00 ~~~~~~~ 0x600 +---------+ 0x600 +---------+ + * *** | a00-aff | | b00-bff | + * 0x0400 ~~~~~~~ 0x500 +---------+ 0x500 +---------+ + * | (b) | | 800-8ff | | 900-9ff | + * 0x0300 |-----| 0x400 +---------+ 0x400 +---------+ + * | (a) | | 600-6ff | | 700-7ff | + * 0x0200 |-----| 0x300 +---------+ 0x300 +---------+ + * | (b) | | 400-4ff | | 500-5ff | + * 0x0100 |-----| 0x200 +---------+ 0x200 +---------+ + * | (a) | | 200-2ff | | 300-3ff | + * 0x0000 +-----+ 0x100 +---------+ 0x100 +---------+ + * | 000-0ff | | 100-1ff | + * 0x000 +---------+ 0x000 +---------+ + * + * In this example we've performed two-way interleaving. The number of ways + * that something can interleave varies based on what we're interleaving + * between. + * + * MEMORY CONTROLLER + * + * A given processor die (see uts/i86pc/os/cpuid.c) contains a number of + * memory controllers. Usually one or two. Each memory controller supports a + * given number of DIMMs, which are divided across multiple channels. + * + * TARGET ADDRESS DECODER + * + * The target address decoder (TAD) is responsible for taking a system + * address and transforming it into a channel address based on the rules + * that are present. Each memory controller has a corresponding TAD. The + * TAD is often contained in a device called a 'Home Agent'. + * + * SYSTEM ADDRESS DECODER + * + * The system address decoder (SAD) is responsible for taking a system + * address and directing it to the right place, whether this be memory or + * otherwise. There is a single SAD per socket (see + * uts/i86pc/os/cpuid.c) that is shared between all the cores currently. + * + * NODE IDENTIFIER + * + * The node identifier is used to uniquely identify an element in the + * various routing topologies on the die (see uts/i86pc/os/cpuid.c for the + * definition of 'die'). One can roughly think about this as a unique + * identifier for the socket itself. In general, the primary node ID for a + * socket should map to the socket APIC ID. + * + * Finding Devices + * --------------- + * + * There is a bit of a chicken and egg problem on Intel systems and in the + * device driver interface. The information that we need in the system is spread + * out amongst a large number of different PCI devices that the processor + * exposes. The number of such devices can vary based on the processor + * generation and the specific SKU in the processor. To deal with this, we break + * the driver into two different components: a stub driver and the full driver. + * + * The stub driver has aliases for all known PCI devices that we might attach to + * in a given generation on the system. This driver is called 'imcstub'. When a + * stub attaches, it just registers itself with the main driver, upon which it + * has a module dependency. + * + * The main driver, 'imc', is a pseudo-device driver. When it first attaches, it + * kicks off a scan of the device tree which takes place in a task queue. Once + * there, it determines the number of devices that it expects to exist by + * walking the tree and comparing it against the generation-specific table.
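(Editorial aside: before continuing with device discovery, the two-way interleave pictured above can be captured in a few lines of C. This is an illustrative sketch only, not driver code, and every name in it is made up; it simply shows bit 8 selecting the target and the remaining bits forming the target-local offset.)

#include <stdint.h>

typedef enum { EX_TARGET_A, EX_TARGET_B } ex_target_t;

/* Bit 8 (0x100) picks the device, matching the example above. */
static ex_target_t
ex_ileave_target(uint64_t addr)
{
	return ((addr & 0x100) != 0 ? EX_TARGET_B : EX_TARGET_A);
}

/* Drop the interleave bit to form the device-local offset. */
static uint64_t
ex_ileave_offset(uint64_t addr)
{
	return (((addr >> 9) << 8) | (addr & 0xff));
}

/* e.g. ex_ileave_target(0x300) == EX_TARGET_B and ex_ileave_offset(0x300) == 0x100 */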
+ * + * If all devices are found, we'll go ahead and read through all the devices and + * build a map of all the information we need to understand the topology of the + * system and to be able to decode addresses. We do this here, because we can be + * asked to perform decoding in dangerous contexts (after taking an MCE, panic, + * etc) where we don't want to have to rely on the broader kernel functioning at + * this point in time. + * + * Once our topology is built, we'll create minor nodes which are used by the + * fault management architecture to query for information and register our + * decoding functionality with the kernel. + * + * PCI Numbering + * ------------- + * + * For each device that we care about, Intel defines the device and function + * that we can expect to find the information and PCI configuration space + * registers that we care about at. However, the PCI bus is not well defined. + * Devices that are on the same socket use the same set of bus numbers; however, + * some sockets have multiple device numbers that they'll use to represent + * different classes. These bus numbers are programmed by systems firmware as + * part of powering on the system. This means, that we need the ability to + * map together these disparate ranges ourselves. + * + * There is a device called a utility box (UBOX), which exists per-socket and + * maps the different sockets together. We use this to determine which devices + * correspond to which sockets. + * + * Mapping Sockets + * --------------- + * + * Another wrinkle is that the way that the OS sees the numbering of the CPUs is + * generally based on the APIC ID (see uts/i86pc/os/cpuid.c for more + * information). However, to map to the corresponding socket, we need to look at + * the socket's node ID. The order of PCI buses in the system is not required to + * have any relation to the socket ID. Therefore, we have to have yet another + * indirection table in the imc_t. + * + * Exposing Data + * ------------- + * + * We expose topology data to FMA using the OS-private memory controller + * interfaces. By creating minor nodes of the type, 'ddi_mem_ctrl', there are a + * number of specific interfaces that we can then implement. The ioctl API asks + * us for a snapshot of data, which basically has us go through and send an + * nvlist_t to userland. This nvlist_t is constructed as part of the scan + * process. This nvlist uses the version 1 format, which more explicitly encodes + * the topology in a series of nested nvlists. + * + * In addition, the tool /usr/lib/fm/fmd/mcdecode can be used to query the + * decoder and ask it to perform decoding. + * + * Decoding Addresses + * ------------------ + * + * The decoding logic can be found in common/imc/imc_decode.c. This file is + * shared between the kernel and userland to allow for easier testing and + * additional flexibility in operation. The decoding process happens in a few + * different phases. + * + * The first phase, is to determine which memory controller on which socket is + * responsible for this data. To determine this, we use the system address + * decoder and walk the rules, looking for the correct target. There are various + * manipulations to the address that exist which are used to determine which + * index we use. The way that we interpret the output of the rule varies + * somewhat based on the generation. Sandy Bridge just has a node ID which + * points us to the socket with its single IMC. 
On Ivy Bridge through Broadwell, + * the memory controller to use is also encoded in part of the node ID. Finally, + * on Skylake, the SAD tells us which socket to look at. The socket in question + * then has a routing table which tells us which channel on which memory + * controller that is local to that socket. + * + * Once we have the target memory controller, we walk the list of target address + * decoder rules. These rules can help tell us which channel we care about + * (which is required on Sandy Bridge through Broadwell) and then describe some + * amount of the interleaving rules which are used to turn the system address + * into a channel address. + * + * Once we know the channel and the channel address, we walk the rank interleave + * rules which help us determine which DIMM and the corresponding rank on it + * that the corresponding channel address is on. It also has logic that we need + * to use to determine how to transform a channel address into an address on + * that specific rank. Once we have that, then the initial decoding is done. + * + * The logic in imc_decode.c is abstracted away from the broader kernel CMI + * logic. This is on purpose and allows us not only an easier time unit testing + * the logic, but also allows us to express more high fidelity errors that are + * translated into a much smaller subset. This logic is exercised in the + * 'imc_test' program which is built in 'test/os-tests/tests/imc'. + * + * Limitations + * ----------- + * + * Currently, this driver has the following limitations: + * + * o It doesn't decode the row and column addresses. + * o It doesn't encode from a DIMM address to a system address. + * o It doesn't properly support lockstep and mirroring modes on Sandy Bridge - + * Broadwell platforms. + * o It doesn't support virtual lockstep and adaptive mirroring on Purley + * platforms. + * o It doesn't properly handle Intel Optane (3D-X Point) NVDIMMs. + * o It doesn't know how to decode three way channel interleaving. + * + * None of these are intrinsic problems to the driver, it's mostly a matter of + * having proper documentation and testing. + */ + +#include <sys/modctl.h> +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/types.h> +#include <sys/file.h> +#include <sys/errno.h> +#include <sys/open.h> +#include <sys/cred.h> +#include <sys/pci.h> +#include <sys/sysmacros.h> +#include <sys/avl.h> +#include <sys/stat.h> +#include <sys/policy.h> + +#include <sys/cpu_module.h> +#include <sys/mc.h> +#include <sys/mc_intel.h> + +#include "imc.h" + +/* + * These tables contain generational data that varies between processor + * generation such as the maximum number of sockets, memory controllers, and the + * offsets of the various registers. 
+ */ + +static const imc_gen_data_t imc_gen_data_snb = { + .igd_max_sockets = 4, + .igd_max_imcs = 2, + .igd_max_channels = 4, + .igd_max_dimms = 3, + .igd_max_ranks = IMC_MTR_DDR_RANKS_MAX, + .igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1, + IMC_REG_MC_MTR2 }, + .igd_mcmtr_offset = 0x7c, + .igd_tolm_offset = 0x80, + .igd_tohm_low_offset = 0x84, + .igd_sad_dram_offset = 0x80, + .igd_sad_ndram_rules = 10, + .igd_sad_nodeid_offset = 0x40, + .igd_tad_nrules = 12, + .igd_tad_rule_offset = 0x40, + .igd_tad_chan_offset = 0x90, + .igd_tad_sysdef = 0x80, + .igd_tad_sysdef2 = 0x84, + .igd_mc_mirror = 0xac, + .igd_rir_nways = 5, + .igd_rir_way_offset = 0x108, + .igd_rir_nileaves = 8, + .igd_rir_ileave_offset = 0x120, + .igd_ubox_cpubusno_offset = 0xd0, +}; + +static const imc_gen_data_t imc_gen_data_ivb = { + .igd_max_sockets = 4, + .igd_max_imcs = 2, + .igd_max_channels = 4, + .igd_max_dimms = 3, + .igd_max_ranks = IMC_MTR_DDR_RANKS_MAX, + .igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1, + IMC_REG_MC_MTR2 }, + .igd_mcmtr_offset = 0x7c, + .igd_tolm_offset = 0x80, + .igd_tohm_low_offset = 0x84, + .igd_sad_dram_offset = 0x60, + .igd_sad_ndram_rules = 20, + .igd_sad_nodeid_offset = 0x40, + .igd_tad_nrules = 12, + .igd_tad_rule_offset = 0x40, + .igd_tad_chan_offset = 0x90, + .igd_tad_sysdef = 0x80, + .igd_tad_sysdef2 = 0x84, + .igd_mc_mirror = 0xac, + .igd_rir_nways = 5, + .igd_rir_way_offset = 0x108, + .igd_rir_nileaves = 8, + .igd_rir_ileave_offset = 0x120, + .igd_ubox_cpubusno_offset = 0xd0, +}; + +static const imc_gen_data_t imc_gen_data_has_brd = { + .igd_max_sockets = 4, + .igd_max_imcs = 2, + .igd_max_channels = 4, + .igd_max_dimms = 3, + .igd_max_ranks = IMC_MTR_DDR_RANKS_MAX_HAS_SKX, + .igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1, + IMC_REG_MC_MTR2 }, + .igd_mcmtr_offset = 0x7c, + .igd_tolm_offset = 0xd0, + .igd_tohm_low_offset = 0xd4, + .igd_tohm_hi_offset = 0xd8, + .igd_sad_dram_offset = 0x60, + .igd_sad_ndram_rules = 20, + .igd_sad_nodeid_offset = 0x40, + .igd_tad_nrules = 12, + .igd_tad_rule_offset = 0x40, + .igd_tad_chan_offset = 0x90, + .igd_tad_sysdef = 0x80, + .igd_tad_sysdef2 = 0x84, + .igd_mc_mirror = 0xac, + .igd_rir_nways = 5, + .igd_rir_way_offset = 0x108, + .igd_rir_nileaves = 8, + .igd_rir_ileave_offset = 0x120, + .igd_ubox_cpubusno_offset = 0xd0, +}; + +static const imc_gen_data_t imc_gen_data_skx = { + .igd_max_sockets = 8, + .igd_max_imcs = 2, + .igd_max_channels = 3, + .igd_max_dimms = 2, + .igd_max_ranks = IMC_MTR_DDR_RANKS_MAX, + .igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1 }, + .igd_mcmtr_offset = 0x87c, + .igd_topo_offset = 0x88, + .igd_tolm_offset = 0xd0, + .igd_tohm_low_offset = 0xd4, + .igd_tohm_hi_offset = 0xd8, + .igd_sad_dram_offset = 0x60, + .igd_sad_ndram_rules = 24, + .igd_sad_nodeid_offset = 0xc0, + .igd_tad_nrules = 8, + .igd_tad_rule_offset = 0x850, + .igd_tad_chan_offset = 0x90, + .igd_rir_nways = 4, + .igd_rir_way_offset = 0x108, + .igd_rir_nileaves = 4, + .igd_rir_ileave_offset = 0x120, + .igd_ubox_cpubusno_offset = 0xcc, +}; + +/* + * This table contains all of the devices that we're looking for from a stub + * perspective. These are organized by generation. Different generations behave + * in slightly different ways. For example, Sandy Bridge through Broadwell use + * unique PCI IDs for each PCI device/function combination that appears. Whereas + * Skylake based systems use the same PCI ID; however, different device/function + * values indicate that the IDs are used for different purposes. 
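The stub table that follows encodes these (generation, type, PCI ID, device, function) tuples. As a rough editorial illustration of how such a table is consumed when a stub registers itself, here is a self-contained sketch; the struct and entries below are simplified stand-ins and are not the driver's imc_stub_table_t.

#include <stdint.h>
#include <stddef.h>

/* Simplified stand-in for a stub table entry (illustration only). */
typedef struct ex_stub_entry {
	uint16_t	ese_devid;	/* PCI device ID */
	uint8_t		ese_dev;	/* expected PCI device number */
	uint8_t		ese_func;	/* expected PCI function number */
	const char	*ese_desc;
} ex_stub_entry_t;

static const ex_stub_entry_t ex_stub_table[] = {
	{ 0x3ca8, 15, 0, "IMC 0 Main 0" },		/* Sandy Bridge example */
	{ 0x2040, 10, 0, "IMC 0 Main / Channel 0" },	/* Skylake example */
};

/*
 * Sandy Bridge through Broadwell could key off the device ID alone;
 * Skylake reuses IDs, so device/function must also participate.
 */
static const ex_stub_entry_t *
ex_stub_match(uint16_t devid, uint8_t dev, uint8_t func)
{
	size_t i, n = sizeof (ex_stub_table) / sizeof (ex_stub_table[0]);

	for (i = 0; i < n; i++) {
		if (ex_stub_table[i].ese_devid == devid &&
		    ex_stub_table[i].ese_dev == dev &&
		    ex_stub_table[i].ese_func == func)
			return (&ex_stub_table[i]);
	}
	return (NULL);
}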
+ */ +/* BEGIN CSTYLED */ +static const imc_stub_table_t imc_stub_table[] = { + /* Sandy Bridge */ + { IMC_GEN_SANDY, IMC_TYPE_MC0_MAIN0, 0x3ca8, 15, 0, "IMC 0 Main 0" }, + { IMC_GEN_SANDY, IMC_TYPE_MC0_MAIN1, 0x3c71, 15, 1, "IMC 0 Main 0" }, + { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL0, 0x3caa, 15, 2, "IMC 0 Channel 0 Info" }, + { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL1, 0x3cab, 15, 3, "IMC 0 Channel 1 Info" }, + { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL2, 0x3cac, 15, 4, "IMC 0 Channel 2 Info" }, + { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL3, 0x3cad, 15, 5, "IMC 0 Channel 3 Info" }, + { IMC_GEN_SANDY, IMC_TYPE_SAD_DRAM, 0x3cf4, 12, 6, "SAD DRAM Rules" }, + { IMC_GEN_SANDY, IMC_TYPE_SAD_MMIO, 0x3cf5, 13, 6, "SAD MMIO Rules" }, + { IMC_GEN_SANDY, IMC_TYPE_SAD_MISC, 0x3cf6, 12, 7, "SAD Memory Map" }, + { IMC_GEN_SANDY, IMC_TYPE_UBOX, 0x3ce0, 11, 0, "UBox" }, + { IMC_GEN_SANDY, IMC_TYPE_UBOX_CPUBUSNO, 0x3ce3, 11, 3, "UBox Scratch" }, + { IMC_GEN_SANDY, IMC_TYPE_HA0, 0x3ca0, 14, 0, "Home Agent" }, + /* Ivy Bridge */ + { IMC_GEN_IVY, IMC_TYPE_MC0_MAIN0, 0x0ea8, 15, 0, "IMC 0 Main 0" }, + { IMC_GEN_IVY, IMC_TYPE_MC0_MAIN1, 0x0e71, 15, 1, "IMC 0 Main 1" }, + { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL0, 0x0eaa, 15, 2, "IMC 0 Channel 0 Info" }, + { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL1, 0x0eab, 15, 3, "IMC 0 Channel 1 Info" }, + { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL2, 0x0eac, 15, 4, "IMC 0 Channel 2 Info" }, + { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL3, 0x0ead, 15, 5, "IMC 0 Channel 3 Info" }, + { IMC_GEN_IVY, IMC_TYPE_MC1_MAIN0, 0x0e68, 29, 0, "IMC 1 Main 0" }, + { IMC_GEN_IVY, IMC_TYPE_MC1_MAIN1, 0x0e79, 29, 1, "IMC 1 Main 1" }, + { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL0, 0x0e6a, 15, 2, "IMC 1 Channel 0 Info" }, + { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL1, 0x0e6b, 15, 3, "IMC 1 Channel 1 Info" }, + { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL2, 0x0e6c, 15, 4, "IMC 1 Channel 2 Info" }, + { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL3, 0x0e6d, 15, 5, "IMC 1 Channel 3 Info" }, + { IMC_GEN_IVY, IMC_TYPE_SAD_DRAM, 0x0ec8, 22, 0, "SAD DRAM Rules" }, + { IMC_GEN_IVY, IMC_TYPE_SAD_MMIO, 0x0ec9, 22, 1, "SAD MMIO Rules" }, + { IMC_GEN_IVY, IMC_TYPE_SAD_MISC, 0x0eca, 22, 2, "SAD Memory Map" }, + { IMC_GEN_IVY, IMC_TYPE_UBOX, 0x0e1e, 11, 0, "UBox" }, + { IMC_GEN_IVY, IMC_TYPE_UBOX_CPUBUSNO, 0x0e1f, 11, 3, "UBox Scratch" }, + { IMC_GEN_IVY, IMC_TYPE_HA0, 0x0ea0, 14, 0, "Home Agent 0" }, + { IMC_GEN_IVY, IMC_TYPE_HA1, 0x0e60, 28, 0, "Home Agent 1" }, + /* Haswell */ + { IMC_GEN_HASWELL, IMC_TYPE_MC0_MAIN0, 0x2fa8, 19, 0, "IMC 0 Main 0" }, + { IMC_GEN_HASWELL, IMC_TYPE_MC0_MAIN1, 0x2f71, 19, 1, "IMC 0 Main 1" }, + { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL0, 0x2faa, 19, 2, "IMC 0 Channel 0 Info" }, + { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL1, 0x2fab, 19, 3, "IMC 0 Channel 1 Info" }, + { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL2, 0x2fac, 19, 4, "IMC 0 Channel 2 Info" }, + { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL3, 0x2fad, 19, 5, "IMC 0 Channel 3 Info" }, + { IMC_GEN_HASWELL, IMC_TYPE_MC1_MAIN0, 0x2f68, 22, 0, "IMC 1 Main 0" }, + { IMC_GEN_HASWELL, IMC_TYPE_MC1_MAIN1, 0x2f79, 22, 1, "IMC 1 Main 1" }, + { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL0, 0x2f6a, 22, 2, "IMC 1 Channel 0 Info" }, + { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL1, 0x2f6b, 22, 3, "IMC 1 Channel 1 Info" }, + { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL2, 0x2f6c, 22, 4, "IMC 1 Channel 2 Info" }, + { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL3, 0x2f6d, 22, 5, "IMC 1 Channel 3 Info" }, + { IMC_GEN_HASWELL, IMC_TYPE_SAD_DRAM, 0x2ffc, 15, 4, "SAD DRAM Rules" }, + { IMC_GEN_HASWELL, IMC_TYPE_SAD_MMIO, 0x2ffd, 15, 5, "SAD MMIO Rules" }, 
+ { IMC_GEN_HASWELL, IMC_TYPE_VTD_MISC, 0x2f28, 5, 0, "Misc. Vritualization" }, + { IMC_GEN_HASWELL, IMC_TYPE_UBOX, 0x2f1e, 16, 5, "UBox" }, + { IMC_GEN_HASWELL, IMC_TYPE_UBOX_CPUBUSNO, 0x2f1f, 16, 7, "UBox Scratch" }, + { IMC_GEN_HASWELL, IMC_TYPE_HA0, 0x2fa0, 18, 0, "Home Agent 0" }, + { IMC_GEN_HASWELL, IMC_TYPE_HA1, 0x2f60, 18, 4, "Home Agent 1" }, + /* Broadwell Devices */ + { IMC_GEN_BROADWELL, IMC_TYPE_MC0_MAIN0, 0x6fa8, 19, 0, "IMC 0 Main 0" }, + { IMC_GEN_BROADWELL, IMC_TYPE_MC0_MAIN1, 0x6f71, 19, 1, "IMC 0 Main 1" }, + { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL0, 0x6faa, 19, 2, "IMC 0 Channel 0 Info" }, + { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL1, 0x6fab, 19, 3, "IMC 0 Channel 1 Info" }, + { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL2, 0x6fac, 19, 4, "IMC 0 Channel 2 Info" }, + { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL3, 0x6fad, 19, 5, "IMC 0 Channel 3 Info" }, + { IMC_GEN_BROADWELL, IMC_TYPE_MC1_MAIN0, 0x6f68, 22, 0, "IMC 1 Main 0" }, + { IMC_GEN_BROADWELL, IMC_TYPE_MC1_MAIN1, 0x6f79, 22, 1, "IMC 1 Main 1" }, + { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL0, 0x6f6a, 22, 2, "IMC 1 Channel 0 Info" }, + { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL1, 0x6f6b, 22, 3, "IMC 1 Channel 1 Info" }, + { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL2, 0x6f6c, 22, 4, "IMC 1 Channel 2 Info" }, + { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL3, 0x6f6d, 22, 5, "IMC 1 Channel 3 Info" }, + { IMC_GEN_BROADWELL, IMC_TYPE_SAD_DRAM, 0x6ffc, 15, 4, "SAD DRAM Rules" }, + { IMC_GEN_BROADWELL, IMC_TYPE_SAD_MMIO, 0x6ffd, 15, 5, "SAD MMIO Rules" }, + { IMC_GEN_BROADWELL, IMC_TYPE_VTD_MISC, 0x6f28, 5, 0, "Misc. Vritualization" }, + { IMC_GEN_BROADWELL, IMC_TYPE_UBOX, 0x6f1e, 16, 5, "UBox" }, + { IMC_GEN_BROADWELL, IMC_TYPE_UBOX_CPUBUSNO, 0x6f1f, 16, 7, "UBox Scratch" }, + { IMC_GEN_BROADWELL, IMC_TYPE_HA0, 0x6fa0, 18, 0, "Home Agent 0" }, + { IMC_GEN_BROADWELL, IMC_TYPE_HA1, 0x6f60, 18, 4, "Home Agent 1" }, + /* Skylake and Cascade Lake Devices */ + { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_M2M, 0x2066, 8, 0, "IMC 0 M2M" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_M2M, 0x2066, 9, 0, "IMC 0 M2M" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_MAIN0, 0x2040, 10, 0, "IMC 0 Main / Channel 0" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_MAIN0, 0x2040, 12, 0, "IMC 0 Main / Channel 0" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_CHANNEL1, 0x2044, 10, 4, "IMC 0 Channel 1" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_CHANNEL2, 0x2048, 11, 0, "IMC 0 Channel 2" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_CHANNEL1, 0x2044, 12, 4, "IMC 1 Channel 1" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_CHANNEL2, 0x2048, 13, 0, "IMC 1 Channel 2" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_DRAM, 0x2054, 29, 0, "SAD DRAM Rules" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MMIO, 0x2055, 29, 1, "SAD MMIO Rules" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_VTD_MISC, 0x2024, 5, 0, "Misc. Virtualization" }, + + /* + * There is one SAD MC Route type device per core! Because of this a + * wide array of device and functions are allocated. For now, we list + * all 28 of them out. 
+ */ + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 0, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 1, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 2, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 3, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 4, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 5, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 6, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 7, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 0, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 1, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 2, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 3, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 4, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 5, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 6, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 7, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 0, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 1, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 2, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 3, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 4, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 5, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 6, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 7, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 0, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 1, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 2, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 3, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 4, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 5, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 6, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 7, "Per-Core SAD" }, + + { IMC_GEN_SKYLAKE, IMC_TYPE_UBOX, 0x2014, 8, 0, "UBox" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_UBOX_CPUBUSNO, 0x2016, 8, 2, "DECS" }, +}; +/* END CSTYLED */ + +#define IMC_PCI_VENDOR_INTC 0x8086 + +/* + * Our IMC data is global and statically set up during a combination of + * _init(9E) and attach(9E). While we have a module dependency between the PCI + * stub driver, imcstub, and this pseudo-driver, imc, the dependencies don't + * guarantee that the imc driver has finished attaching. As such we make sure + * that it can operate without it being attached in any way. + */ +static imc_t *imc_data = NULL; + +/* + * By default we should not allow the stubs to detach as we don't have a good + * way of forcing them to attach again. This is provided in case someone does + * want to allow the driver to unload. 
+ */ +int imc_allow_detach = 0; + +static void +imc_set_gen_data(imc_t *imc) +{ + switch (imc->imc_gen) { + case IMC_GEN_SANDY: + imc->imc_gen_data = &imc_gen_data_snb; + break; + case IMC_GEN_IVY: + imc->imc_gen_data = &imc_gen_data_ivb; + break; + case IMC_GEN_HASWELL: + case IMC_GEN_BROADWELL: + imc->imc_gen_data = &imc_gen_data_has_brd; + break; + case IMC_GEN_SKYLAKE: + imc->imc_gen_data = &imc_gen_data_skx; + break; + default: + dev_err(imc->imc_dip, CE_PANIC, "imc driver programmer error: " + "set to unknown generation: %u", imc->imc_gen); + } +} + +/* + * If our device (dev_info_t) does not have a non-zero unit address, then + * devfsadmd will not pay attention to us at all. Therefore we need to set the + * unit address below, before we create minor nodes. + * + * The rest of the system expects us to have one minor node per socket. The + * minor node ID should be the ID of the socket. + */ +static boolean_t +imc_create_minors(imc_t *imc) +{ + uint_t i; + + ddi_set_name_addr(imc->imc_dip, "1"); + for (i = 0; i < imc->imc_nsockets; i++) { + char buf[MAXNAMELEN]; + + if (snprintf(buf, sizeof (buf), "mc-imc-%u", i) >= + sizeof (buf)) { + goto fail; + } + + if (ddi_create_minor_node(imc->imc_dip, buf, S_IFCHR, i, + "ddi_mem_ctrl", 0) != DDI_SUCCESS) { + dev_err(imc->imc_dip, CE_WARN, "failed to create " + "minor node %u: %s", i, buf); + goto fail; + } + } + return (B_TRUE); + +fail: + ddi_remove_minor_node(imc->imc_dip, NULL); + return (B_FALSE); +} + +/* + * Check the current MC route value for this SAD. On Skylake systems there is + * one per core. Every core should agree. If not, we will not trust the SAD + * MCROUTE values and this will cause system address decoding to fail on + * skylake. + */ +static void +imc_mcroute_check(imc_t *imc, imc_sad_t *sad, imc_stub_t *stub) +{ + uint32_t val; + + val = pci_config_get32(stub->istub_cfgspace, + IMC_REG_SKX_SAD_MC_ROUTE_TABLE); + if (val == PCI_EINVAL32) { + sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ; + return; + } + + if ((sad->isad_flags & IMC_SAD_MCROUTE_VALID) == 0 && val != 0) { + sad->isad_flags |= IMC_SAD_MCROUTE_VALID; + sad->isad_mcroute.ismc_raw_mcroute = val; + return; + } + + /* + * Occasionally we see MC ROUTE table entries with a value of zero. + * We should ignore those for now. + */ + if (val != sad->isad_mcroute.ismc_raw_mcroute && val != 0) { + dev_err(imc->imc_dip, CE_WARN, "SAD MC_ROUTE_TABLE mismatch " + "with socket. SAD has val 0x%x, system has %x\n", + val, sad->isad_mcroute.ismc_raw_mcroute); + sad->isad_valid |= IMC_SAD_V_BAD_MCROUTE; + } +} + +/* + * On Skylake, many of the devices that we care about are on separate PCI Buses. + * These can be mapped together by the DECS register. However, we need to know + * how to map different buses together so that we can more usefully associate + * information. The set of buses is all present in the DECS register. We'll + * effectively assign sockets to buses. This is also still something that comes + * up on pre-Skylake systems as well. + */ +static boolean_t +imc_map_buses(imc_t *imc) +{ + imc_stub_t *stub; + uint_t nsock; + + /* + * Find the UBOX_DECS registers so we can establish socket mappings. On + * Skylake, there are three different sets of buses that we need to + * cover all of our devices, while there are only two before that. 
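imc_map_buses() below reads the CPUBUSNO scratch register once per socket and splits it into individual bus numbers with the IMC_UBOX_CPUBUSNO_* macros, which are defined in imc.h and are not shown in this diff. Purely as an illustration of that kind of field extraction, and assuming for the sake of the example that each bus number occupies one byte of the 32-bit register (an assumption of this sketch, not a statement about the real register layout):

#include <stdint.h>

/*
 * Illustration only: assumes one bus number per byte of the packed
 * 32-bit register. The authoritative layout is IMC_UBOX_CPUBUSNO_*.
 */
static inline uint8_t
ex_cpubusno(uint32_t reg, unsigned int which)
{
	return ((uint8_t)((reg >> (which * 8)) & 0xff));
}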
+ */ + for (nsock = 0, stub = avl_first(&imc->imc_stubs); stub != NULL; + stub = AVL_NEXT(&imc->imc_stubs, stub)) { + uint32_t busno; + + if (stub->istub_table->imcs_type != IMC_TYPE_UBOX_CPUBUSNO) { + continue; + } + + busno = pci_config_get32(stub->istub_cfgspace, + imc->imc_gen_data->igd_ubox_cpubusno_offset); + if (busno == PCI_EINVAL32) { + dev_err(imc->imc_dip, CE_WARN, "failed to read " + "UBOX_DECS CPUBUSNO0: invalid PCI read"); + return (B_FALSE); + } + + if (imc->imc_gen >= IMC_GEN_SKYLAKE) { + imc->imc_sockets[nsock].isock_nbus = 3; + imc->imc_sockets[nsock].isock_bus[0] = + IMC_UBOX_CPUBUSNO_0(busno); + imc->imc_sockets[nsock].isock_bus[1] = + IMC_UBOX_CPUBUSNO_1(busno); + imc->imc_sockets[nsock].isock_bus[2] = + IMC_UBOX_CPUBUSNO_2(busno); + } else { + imc->imc_sockets[nsock].isock_bus[0] = + IMC_UBOX_CPUBUSNO_0(busno); + imc->imc_sockets[nsock].isock_bus[1] = + IMC_UBOX_CPUBUSNO_1(busno); + imc->imc_sockets[nsock].isock_nbus = 2; + } + nsock++; + } + imc->imc_nsockets = nsock; + + return (B_TRUE); +} + +/* + * For a given stub that we've found, map it to its corresponding socket based + * on the PCI bus that it has. + */ +static imc_socket_t * +imc_map_find_socket(imc_t *imc, imc_stub_t *stub) +{ + uint_t i; + + for (i = 0; i < imc->imc_nsockets; i++) { + uint_t bus; + + for (bus = 0; bus < imc->imc_sockets[i].isock_nbus; bus++) { + if (imc->imc_sockets[i].isock_bus[bus] == + stub->istub_bus) { + return (&imc->imc_sockets[i]); + } + } + } + + return (NULL); +} + +static boolean_t +imc_map_stubs(imc_t *imc) +{ + imc_stub_t *stub; + + if (!imc_map_buses(imc)) { + return (B_FALSE); + } + + stub = avl_first(&imc->imc_stubs); + for (stub = avl_first(&imc->imc_stubs); stub != NULL; + stub = AVL_NEXT(&imc->imc_stubs, stub)) { + imc_socket_t *sock = imc_map_find_socket(imc, stub); + + if (sock == NULL) { + dev_err(imc->imc_dip, CE_WARN, "found stub type %u " + "PCI%x,%x with bdf %u/%u/%u that does not match a " + "known PCI bus for any of %u sockets", + stub->istub_table->imcs_type, stub->istub_vid, + stub->istub_did, stub->istub_bus, stub->istub_dev, + stub->istub_func, imc->imc_nsockets); + continue; + } + + /* + * We don't have to worry about duplicates here. We check to + * make sure that we have unique bdfs here. + */ + switch (stub->istub_table->imcs_type) { + case IMC_TYPE_MC0_M2M: + sock->isock_imcs[0].icn_m2m = stub; + break; + case IMC_TYPE_MC1_M2M: + sock->isock_imcs[1].icn_m2m = stub; + break; + case IMC_TYPE_MC0_MAIN0: + sock->isock_nimc++; + sock->isock_imcs[0].icn_main0 = stub; + + /* + * On Skylake, the MAIN0 does double duty as channel + * zero and as the TAD. + */ + if (imc->imc_gen >= IMC_GEN_SKYLAKE) { + sock->isock_imcs[0].icn_nchannels++; + sock->isock_imcs[0].icn_channels[0].ich_desc = + stub; + sock->isock_tad[0].itad_stub = stub; + sock->isock_ntad++; + } + break; + case IMC_TYPE_MC0_MAIN1: + sock->isock_imcs[0].icn_main1 = stub; + break; + case IMC_TYPE_MC1_MAIN0: + sock->isock_nimc++; + sock->isock_imcs[1].icn_main0 = stub; + + /* + * On Skylake, the MAIN0 does double duty as channel + * zero and as the TAD. 
+ */ + if (imc->imc_gen >= IMC_GEN_SKYLAKE) { + sock->isock_imcs[1].icn_nchannels++; + sock->isock_imcs[1].icn_channels[0].ich_desc = + stub; + sock->isock_tad[1].itad_stub = stub; + sock->isock_ntad++; + } + break; + case IMC_TYPE_MC1_MAIN1: + sock->isock_imcs[1].icn_main1 = stub; + break; + case IMC_TYPE_MC0_CHANNEL0: + sock->isock_imcs[0].icn_nchannels++; + sock->isock_imcs[0].icn_channels[0].ich_desc = stub; + break; + case IMC_TYPE_MC0_CHANNEL1: + sock->isock_imcs[0].icn_nchannels++; + sock->isock_imcs[0].icn_channels[1].ich_desc = stub; + break; + case IMC_TYPE_MC0_CHANNEL2: + sock->isock_imcs[0].icn_nchannels++; + sock->isock_imcs[0].icn_channels[2].ich_desc = stub; + break; + case IMC_TYPE_MC0_CHANNEL3: + sock->isock_imcs[0].icn_nchannels++; + sock->isock_imcs[0].icn_channels[3].ich_desc = stub; + break; + case IMC_TYPE_MC1_CHANNEL0: + sock->isock_imcs[1].icn_nchannels++; + sock->isock_imcs[1].icn_channels[0].ich_desc = stub; + break; + case IMC_TYPE_MC1_CHANNEL1: + sock->isock_imcs[1].icn_nchannels++; + sock->isock_imcs[1].icn_channels[1].ich_desc = stub; + break; + case IMC_TYPE_MC1_CHANNEL2: + sock->isock_imcs[1].icn_nchannels++; + sock->isock_imcs[1].icn_channels[2].ich_desc = stub; + break; + case IMC_TYPE_MC1_CHANNEL3: + sock->isock_imcs[1].icn_nchannels++; + sock->isock_imcs[1].icn_channels[3].ich_desc = stub; + break; + case IMC_TYPE_SAD_DRAM: + sock->isock_sad.isad_dram = stub; + break; + case IMC_TYPE_SAD_MMIO: + sock->isock_sad.isad_mmio = stub; + break; + case IMC_TYPE_SAD_MISC: + sock->isock_sad.isad_tolh = stub; + break; + case IMC_TYPE_VTD_MISC: + /* + * Some systems have multiple VT-D Misc. entry points + * in the system. In this case, only use the first one + * we find. + */ + if (imc->imc_gvtd_misc == NULL) { + imc->imc_gvtd_misc = stub; + } + break; + case IMC_TYPE_SAD_MCROUTE: + ASSERT3U(imc->imc_gen, >=, IMC_GEN_SKYLAKE); + imc_mcroute_check(imc, &sock->isock_sad, stub); + break; + case IMC_TYPE_UBOX: + sock->isock_ubox = stub; + break; + case IMC_TYPE_HA0: + sock->isock_ntad++; + sock->isock_tad[0].itad_stub = stub; + break; + case IMC_TYPE_HA1: + sock->isock_ntad++; + sock->isock_tad[1].itad_stub = stub; + break; + case IMC_TYPE_UBOX_CPUBUSNO: + sock->isock_cpubusno = stub; + break; + default: + /* + * Attempt to still attach if we can. + */ + dev_err(imc->imc_dip, CE_WARN, "Encountered unknown " + "IMC type (%u) on PCI %x,%x", + stub->istub_table->imcs_type, + stub->istub_vid, stub->istub_did); + break; + } + } + + return (B_TRUE); +} + +/* + * Go through and fix up various aspects of the stubs mappings on systems. The + * following are a list of what we need to fix up: + * + * 1. On Haswell and newer systems, there is only one global VT-d device. We + * need to go back and map that to all of the per-socket imc_sad_t entries. + */ +static void +imc_fixup_stubs(imc_t *imc) +{ + if (imc->imc_gen >= IMC_GEN_HASWELL) { + uint_t i; + + for (i = 0; i < imc->imc_nsockets; i++) { + ASSERT3P(imc->imc_sockets[i].isock_sad.isad_tolh, + ==, NULL); + imc->imc_sockets[i].isock_sad.isad_tolh = + imc->imc_gvtd_misc; + } + } +} + +/* + * Attempt to map all of the discovered sockets to the corresponding APIC based + * socket. We do these mappings by getting the node id of the socket and + * adjusting it to make sure that no home agent is present in it. We use the + * UBOX to avoid any home agent related bits that are present in other + * registers. 
+ */ +static void +imc_map_sockets(imc_t *imc) +{ + uint_t i; + + for (i = 0; i < imc->imc_nsockets; i++) { + uint32_t nodeid; + ddi_acc_handle_t h; + + h = imc->imc_sockets[i].isock_ubox->istub_cfgspace; + nodeid = pci_config_get32(h, + imc->imc_gen_data->igd_sad_nodeid_offset); + if (nodeid == PCI_EINVAL32) { + imc->imc_sockets[i].isock_valid |= + IMC_SOCKET_V_BAD_NODEID; + continue; + } + + imc->imc_sockets[i].isock_nodeid = IMC_NODEID_UBOX_MASK(nodeid); + imc->imc_spointers[nodeid] = &imc->imc_sockets[i]; + } +} + +/* + * Decode the MTR, accounting for variances between processor generations. + */ +static void +imc_decode_mtr(imc_t *imc, imc_mc_t *icn, imc_dimm_t *dimm, uint32_t mtr) +{ + uint8_t disable; + + /* + * Check present first, before worrying about anything else. + */ + if (imc->imc_gen < IMC_GEN_SKYLAKE && + IMC_MTR_PRESENT_SNB_BRD(mtr) == 0) { + dimm->idimm_present = B_FALSE; + return; + } else if (imc->imc_gen >= IMC_GEN_SKYLAKE && + IMC_MTR_PRESENT_SKYLAKE(mtr) == 0) { + dimm->idimm_present = B_FALSE; + return; + } + + dimm->idimm_present = B_TRUE; + dimm->idimm_ncolumns = IMC_MTR_CA_WIDTH(mtr) + IMC_MTR_CA_BASE; + if (dimm->idimm_ncolumns < IMC_MTR_CA_MIN || + dimm->idimm_ncolumns > IMC_MTR_CA_MAX) { + dimm->idimm_valid |= IMC_DIMM_V_BAD_COLUMNS; + } + + dimm->idimm_nrows = IMC_MTR_RA_WIDTH(mtr) + IMC_MTR_RA_BASE; + if (dimm->idimm_nrows < IMC_MTR_RA_MIN || + dimm->idimm_nrows > IMC_MTR_RA_MAX) { + dimm->idimm_valid |= IMC_DIMM_V_BAD_ROWS; + } + + /* + * Determine Density, this information is not present on Sandy Bridge. + */ + switch (imc->imc_gen) { + case IMC_GEN_IVY: + dimm->idimm_density = 1U << IMC_MTR_DENSITY_IVY_BRD(mtr); + break; + case IMC_GEN_HASWELL: + case IMC_GEN_BROADWELL: + switch (IMC_MTR_DENSITY_IVY_BRD(mtr)) { + case 0: + default: + dimm->idimm_density = 0; + dimm->idimm_valid |= IMC_DIMM_V_BAD_DENSITY; + break; + case 1: + dimm->idimm_density = 2; + break; + case 2: + dimm->idimm_density = 4; + break; + case 3: + dimm->idimm_density = 8; + break; + } + break; + case IMC_GEN_SKYLAKE: + switch (IMC_MTR_DENSITY_SKX(mtr)) { + case 0: + default: + dimm->idimm_density = 0; + dimm->idimm_valid |= IMC_DIMM_V_BAD_DENSITY; + break; + case 1: + dimm->idimm_density = 2; + break; + case 2: + dimm->idimm_density = 4; + break; + case 3: + dimm->idimm_density = 8; + break; + case 4: + dimm->idimm_density = 16; + break; + case 5: + dimm->idimm_density = 12; + break; + } + break; + case IMC_GEN_UNKNOWN: + case IMC_GEN_SANDY: + dimm->idimm_density = 0; + break; + } + + /* + * The values of width are the same on IVY->SKX, but the bits are + * different. This doesn't exist on SNB. 
+ */ + if (imc->imc_gen > IMC_GEN_SANDY) { + uint8_t width; + + if (imc->imc_gen >= IMC_GEN_BROADWELL) { + width = IMC_MTR_WIDTH_BRD_SKX(mtr); + } else { + width = IMC_MTR_WIDTH_IVB_HAS(mtr); + } + switch (width) { + case 0: + dimm->idimm_width = 4; + break; + case 1: + dimm->idimm_width = 8; + break; + case 2: + dimm->idimm_width = 16; + break; + default: + dimm->idimm_width = 0; + dimm->idimm_valid |= IMC_DIMM_V_BAD_WIDTH; + break; + } + } else { + dimm->idimm_width = 0; + } + + dimm->idimm_nranks = 1 << IMC_MTR_DDR_RANKS(mtr); + switch (imc->imc_gen) { + case IMC_GEN_HASWELL: + case IMC_GEN_BROADWELL: + case IMC_GEN_SKYLAKE: + if (dimm->idimm_nranks > IMC_MTR_DDR_RANKS_MAX_HAS_SKX) { + dimm->idimm_nranks = 0; + dimm->idimm_valid |= IMC_DIMM_V_BAD_RANKS; + } + break; + default: + if (dimm->idimm_nranks > IMC_MTR_DDR_RANKS_MAX) { + dimm->idimm_nranks = 0; + dimm->idimm_valid |= IMC_DIMM_V_BAD_RANKS; + } + } + + disable = IMC_MTR_RANK_DISABLE(mtr); + dimm->idimm_ranks_disabled[0] = (disable & 0x1) != 0; + dimm->idimm_ranks_disabled[1] = (disable & 0x2) != 0; + dimm->idimm_ranks_disabled[2] = (disable & 0x4) != 0; + dimm->idimm_ranks_disabled[3] = (disable & 0x8) != 0; + + /* + * Only Haswell and later have this information. + */ + if (imc->imc_gen >= IMC_GEN_HASWELL) { + dimm->idimm_hdrl = IMC_MTR_HDRL_HAS_SKX(mtr) != 0; + dimm->idimm_hdrl_parity = IMC_MTR_HDRL_PARITY_HAS_SKX(mtr) != 0; + dimm->idimm_3dsranks = IMC_MTR_3DSRANKS_HAS_SKX(mtr); + if (dimm->idimm_3dsranks != 0) { + dimm->idimm_3dsranks = 1 << dimm->idimm_3dsranks; + } + } + + + if (icn->icn_dimm_type == IMC_DIMM_DDR4) { + dimm->idimm_nbanks = 16; + } else { + dimm->idimm_nbanks = 8; + } + + /* + * To calculate the DIMM size we need first take the number of rows and + * columns. This gives us the number of slots per chip. In a given rank + * there are nbanks of these. There are nrank entries of those. Each of + * these slots can fit a byte. + */ + dimm->idimm_size = dimm->idimm_nbanks * dimm->idimm_nranks * 8 * + (1ULL << (dimm->idimm_ncolumns + dimm->idimm_nrows)); +} + +static void +imc_fill_dimms(imc_t *imc, imc_mc_t *icn, imc_channel_t *chan) +{ + uint_t i; + + /* + * There's one register for each DIMM that might be present, we always + * read that information to determine information about the DIMMs. + */ + chan->ich_ndimms = imc->imc_gen_data->igd_max_dimms; + for (i = 0; i < imc->imc_gen_data->igd_max_dimms; i++) { + uint32_t mtr; + imc_dimm_t *dimm = &chan->ich_dimms[i]; + + bzero(dimm, sizeof (imc_dimm_t)); + mtr = pci_config_get32(chan->ich_desc->istub_cfgspace, + imc->imc_gen_data->igd_mtr_offsets[i]); + dimm->idimm_mtr = mtr; + /* + * We don't really expect to get a bad PCIe read. However, if we + * do, treat that for the moment as though the DIMM is bad. + */ + if (mtr == PCI_EINVAL32) { + dimm->idimm_valid |= IMC_DIMM_V_BAD_PCI_READ; + continue; + } + + imc_decode_mtr(imc, icn, dimm, mtr); + } +} + +static boolean_t +imc_fill_controller(imc_t *imc, imc_mc_t *icn) +{ + uint32_t mcmtr; + + mcmtr = pci_config_get32(icn->icn_main0->istub_cfgspace, + imc->imc_gen_data->igd_mcmtr_offset); + if (mcmtr == PCI_EINVAL32) { + icn->icn_invalid = B_TRUE; + return (B_FALSE); + } + + icn->icn_closed = IMC_MCMTR_CLOSED_PAGE(mcmtr) != 0; + if (imc->imc_gen < IMC_GEN_SKYLAKE) { + icn->icn_lockstep = IMC_MCMTR_LOCKSTEP(mcmtr) != 0; + } else { + icn->icn_lockstep = B_FALSE; + } + + icn->icn_ecc = IMC_MCMTR_ECC_ENABLED(mcmtr) != 0; + + /* + * SNB and IVB only support DDR3. 
Haswell and Broadwell may support + * DDR4, depends on the SKU. Skylake only supports DDR4. + */ + switch (imc->imc_gen) { + case IMC_GEN_SANDY: + case IMC_GEN_IVY: + icn->icn_dimm_type = IMC_DIMM_DDR3; + break; + case IMC_GEN_HASWELL: + case IMC_GEN_BROADWELL: + if (IMC_MCMTR_DDR4_HAS_BRD(mcmtr)) { + icn->icn_dimm_type = IMC_DIMM_DDR4; + } else { + icn->icn_dimm_type = IMC_DIMM_DDR3; + } + break; + default: + /* + * Skylake and on are all DDR4. + */ + icn->icn_dimm_type = IMC_DIMM_DDR4; + break; + } + + if (imc->imc_gen >= IMC_GEN_SKYLAKE && icn->icn_m2m != NULL) { + icn->icn_topo = pci_config_get32(icn->icn_m2m->istub_cfgspace, + imc->imc_gen_data->igd_topo_offset); + } + + return (B_TRUE); +} + +/* + * Walk the IMC data and fill in the information on DIMMs and the memory + * controller configurations. + */ +static void +imc_fill_data(imc_t *imc) +{ + uint_t csock, cmc, cchan; + + for (csock = 0; csock < imc->imc_nsockets; csock++) { + imc_socket_t *sock = &imc->imc_sockets[csock]; + + for (cmc = 0; cmc < sock->isock_nimc; cmc++) { + imc_mc_t *icn = &sock->isock_imcs[cmc]; + + if (!imc_fill_controller(imc, icn)) + continue; + + for (cchan = 0; cchan < icn->icn_nchannels; cchan++) { + imc_fill_dimms(imc, icn, + &icn->icn_channels[cchan]); + } + } + } +} + +static nvlist_t * +imc_nvl_create_dimm(imc_t *imc, imc_dimm_t *dimm) +{ + nvlist_t *nvl; + + nvl = fnvlist_alloc(); + fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_PRESENT, + dimm->idimm_present); + if (!dimm->idimm_present) { + return (nvl); + } + + fnvlist_add_uint64(nvl, MCINTEL_NVLIST_V1_DIMM_SIZE, dimm->idimm_size); + fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_NCOLS, + dimm->idimm_ncolumns); + fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_NROWS, + dimm->idimm_nrows); + + if (imc->imc_gen > IMC_GEN_SANDY) { + fnvlist_add_uint64(nvl, MCINTEL_NVLIST_V1_DIMM_DENSITY, + dimm->idimm_density * (1ULL << 30)); + fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_WIDTH, + dimm->idimm_width); + } + fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_RANKS, + dimm->idimm_nranks); + fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_BANKS, + dimm->idimm_nbanks); + fnvlist_add_boolean_array(nvl, MCINTEL_NVLIST_V1_DIMM_RDIS, + dimm->idimm_ranks_disabled, IMC_MAX_RANK_DISABLE); + + if (imc->imc_gen >= IMC_GEN_HASWELL) { + fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_HDRL, + dimm->idimm_hdrl); + fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_HDRLP, + dimm->idimm_hdrl_parity); + if (dimm->idimm_3dsranks > 0) { + fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_3DRANK, + dimm->idimm_3dsranks); + } + } + + return (nvl); +} + +static nvlist_t * +imc_nvl_create_channel(imc_t *imc, imc_channel_t *chan) +{ + nvlist_t *nvl; + nvlist_t *dimms[IMC_MAX_DIMMPERCHAN]; + uint_t i; + + nvl = fnvlist_alloc(); + fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_CHAN_NDPC, + imc->imc_gen_data->igd_max_dimms); + for (i = 0; i < imc->imc_gen_data->igd_max_dimms; i++) { + dimms[i] = imc_nvl_create_dimm(imc, &chan->ich_dimms[i]); + } + + fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_CHAN_DIMMS, + dimms, i); + + for (; i > 0; i--) { + nvlist_free(dimms[i-1]); + } + + return (nvl); +} + +static nvlist_t * +imc_nvl_create_mc(imc_t *imc, imc_mc_t *icn) +{ + nvlist_t *nvl; + nvlist_t *channels[IMC_MAX_CHANPERMC]; + uint_t i; + + nvl = fnvlist_alloc(); + fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_MC_NCHAN, icn->icn_nchannels); + fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_MC_ECC, + icn->icn_ecc); + if (icn->icn_lockstep) { + fnvlist_add_string(nvl, 
MCINTEL_NVLIST_V1_MC_CHAN_MODE, + MCINTEL_NVLIST_V1_MC_CHAN_MODE_LOCK); + } else { + fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_CHAN_MODE, + MCINTEL_NVLIST_V1_MC_CHAN_MODE_INDEP); + + } + + if (icn->icn_closed) { + fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_POLICY, + MCINTEL_NVLIST_V1_MC_POLICY_CLOSED); + } else { + fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_POLICY, + MCINTEL_NVLIST_V1_MC_POLICY_OPEN); + } + + for (i = 0; i < icn->icn_nchannels; i++) { + channels[i] = imc_nvl_create_channel(imc, + &icn->icn_channels[i]); + } + fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_MC_CHANNELS, + channels, icn->icn_nchannels); + for (i = 0; i < icn->icn_nchannels; i++) { + nvlist_free(channels[i]); + } + + return (nvl); +} + +static void +imc_nvl_pack(imc_socket_t *sock, boolean_t sleep) +{ + char *buf = NULL; + size_t len = 0; + int kmflag; + + if (sock->isock_nvl == NULL) + return; + + if (sock->isock_buf != NULL) + return; + + if (sleep) { + kmflag = KM_SLEEP; + } else { + kmflag = KM_NOSLEEP | KM_NORMALPRI; + } + + if (nvlist_pack(sock->isock_nvl, &buf, &len, NV_ENCODE_XDR, + kmflag) != 0) { + return; + } + + sock->isock_buf = buf; + sock->isock_buflen = len; + sock->isock_gen++; +} + +static void +imc_decoder_pack(imc_t *imc) +{ + char *buf = NULL; + size_t len = 0; + + if (imc->imc_decoder_buf != NULL) + return; + + if (imc->imc_decoder_dump == NULL) { + imc->imc_decoder_dump = imc_dump_decoder(imc); + } + + if (nvlist_pack(imc->imc_decoder_dump, &buf, &len, NV_ENCODE_XDR, + KM_NOSLEEP | KM_NORMALPRI) != 0) { + return; + } + + imc->imc_decoder_buf = buf; + imc->imc_decoder_len = len; +} + +static void +imc_nvl_create(imc_t *imc) +{ + uint_t csock; + for (csock = 0; csock < imc->imc_nsockets; csock++) { + uint_t i; + nvlist_t *nvl; + nvlist_t *mcs[IMC_MAX_IMCPERSOCK]; + imc_socket_t *sock = &imc->imc_sockets[csock]; + + nvl = fnvlist_alloc(); + fnvlist_add_uint8(nvl, MCINTEL_NVLIST_VERSTR, + MCINTEL_NVLIST_VERS1); + fnvlist_add_uint8(nvl, MCINTEL_NVLIST_V1_NMC, + sock->isock_nimc); + + for (i = 0; i < sock->isock_nimc; i++) { + mcs[i] = imc_nvl_create_mc(imc, &sock->isock_imcs[i]); + } + + fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_MCS, + mcs, sock->isock_nimc); + + for (i = 0; i < sock->isock_nimc; i++) { + nvlist_free(mcs[i]); + } + + sock->isock_nvl = nvl; + imc_nvl_pack(sock, B_TRUE); + } +} + +/* + * Determine the top of low and high memory. These determine whether transaction + * addresses target main memory or not. Unfortunately, the way that these are + * stored and fetched changes with different generations. 
+ */ +static void +imc_sad_read_tohm(imc_t *imc, imc_sad_t *sad) +{ + uint32_t tolm, tohm_low, tohm_hi; + + tolm = pci_config_get32(sad->isad_tolh->istub_cfgspace, + imc->imc_gen_data->igd_tolm_offset); + tohm_low = pci_config_get32(sad->isad_tolh->istub_cfgspace, + imc->imc_gen_data->igd_tohm_low_offset); + if (imc->imc_gen_data->igd_tohm_hi_offset != 0) { + tohm_hi = pci_config_get32(sad->isad_tolh->istub_cfgspace, + imc->imc_gen_data->igd_tohm_hi_offset); + } else { + tohm_hi = 0; + } + + if (tolm == PCI_EINVAL32 || tohm_low == PCI_EINVAL32 || + tohm_hi == PCI_EINVAL32) { + sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ; + return; + } + + switch (imc->imc_gen) { + case IMC_GEN_SANDY: + case IMC_GEN_IVY: + sad->isad_tolm = ((uint64_t)tolm & IMC_TOLM_SNB_IVY_MASK) << + IMC_TOLM_SNB_IVY_SHIFT; + sad->isad_tohm = ((uint64_t)tohm_low & IMC_TOHM_SNB_IVY_MASK) << + IMC_TOLM_SNB_IVY_SHIFT; + break; + case IMC_GEN_HASWELL: + case IMC_GEN_BROADWELL: + case IMC_GEN_SKYLAKE: + sad->isad_tolm = (uint64_t)tolm & IMC_TOLM_HAS_SKX_MASK; + sad->isad_tohm = ((uint64_t)tohm_low & + IMC_TOHM_LOW_HAS_SKX_MASK) | ((uint64_t)tohm_hi << 32); + + /* + * Adjust the values to turn them into an exclusive range. + */ + sad->isad_tolm += IMC_TOLM_HAS_SKY_EXCL; + sad->isad_tohm += IMC_TOHM_HAS_SKY_EXCL; + break; + default: + dev_err(imc->imc_dip, CE_PANIC, "imc driver programmer error: " + "set to unknown generation: %u", imc->imc_gen); + return; + } +} + +static void +imc_sad_fill_rule(imc_t *imc, imc_sad_t *sad, imc_sad_rule_t *rule, + uint32_t raw) +{ + uint_t attr; + uint64_t limit; + bzero(rule, sizeof (imc_sad_rule_t)); + + rule->isr_raw_dram = raw; + rule->isr_enable = IMC_SAD_DRAM_RULE_ENABLE(raw) != 0; + if (imc->imc_gen < IMC_GEN_SKYLAKE) { + switch (IMC_SAD_DRAM_INTERLEAVE_SNB_BRD(raw)) { + case IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6: + rule->isr_imode = IMC_SAD_IMODE_8t6; + break; + case IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6XOR: + rule->isr_imode = IMC_SAD_IMODE_8t6XOR; + break; + } + } else { + switch (IMC_SAD_DRAM_INTERLEAVE_SKX(raw)) { + case IMC_SAD_DRAM_INTERLEAVE_SKX_8t6: + rule->isr_imode = IMC_SAD_IMODE_8t6; + break; + case IMC_SAD_DRAM_INTERLEAVE_SKX_10t8: + rule->isr_imode = IMC_SAD_IMODE_10t8; + break; + case IMC_SAD_DRAM_INTERLEAVE_SKX_14t12: + rule->isr_imode = IMC_SAD_IMODE_14t12; + break; + case IMC_SAD_DRAM_INTERLEAVE_SKX_32t30: + rule->isr_imode = IMC_SAD_IMODE_32t30; + break; + } + } + + if (imc->imc_gen >= IMC_GEN_SKYLAKE) { + attr = IMC_SAD_DRAM_ATTR_SKX(raw); + } else { + attr = IMC_SAD_DRAM_ATTR_SNB_BRD(raw); + } + + switch (attr) { + case IMC_SAD_DRAM_ATTR_DRAM: + rule->isr_type = IMC_SAD_TYPE_DRAM; + break; + case IMC_SAD_DRAM_ATTR_MMCFG: + rule->isr_type = IMC_SAD_TYPE_MMCFG; + break; + case IMC_SAD_DRAM_ATTR_NXM: + if (imc->imc_gen < IMC_GEN_SKYLAKE) { + sad->isad_valid |= IMC_SAD_V_BAD_DRAM_ATTR; + } + rule->isr_type = IMC_SAD_TYPE_NXM; + break; + default: + sad->isad_valid |= IMC_SAD_V_BAD_DRAM_ATTR; + break; + } + + /* + * Fetch the limit which represents bits 45:26 and then adjust this so + * that it is exclusive. + */ + if (imc->imc_gen >= IMC_GEN_SKYLAKE) { + limit = IMC_SAD_DRAM_LIMIT_SKX(raw); + } else { + limit = IMC_SAD_DRAM_LIMIT_SNB_BRD(raw); + } + rule->isr_limit = (limit << IMC_SAD_DRAM_LIMIT_SHIFT) + + IMC_SAD_DRAM_LIMIT_EXCLUSIVE; + + /* + * The rest of this does not apply to Sandy Bridge. 
+ */ + if (imc->imc_gen == IMC_GEN_SANDY) + return; + + if (imc->imc_gen >= IMC_GEN_IVY && imc->imc_gen < IMC_GEN_SKYLAKE) { + rule->isr_a7mode = IMC_SAD_DRAM_A7_IVB_BRD(raw) != 0; + return; + } + + switch (IMC_SAD_DRAM_MOD23_SKX(raw)) { + case IMC_SAD_DRAM_MOD23_MOD3: + rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD3; + break; + case IMC_SAD_DRAM_MOD23_MOD2_C01: + rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_01; + break; + case IMC_SAD_DRAM_MOD23_MOD2_C12: + rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_12; + break; + case IMC_SAD_DRAM_MOD23_MOD2_C02: + rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_02; + break; + } + + rule->isr_need_mod3 = IMC_SAD_DRAM_MOD3_SKX(raw) != 0; + switch (IMC_SAD_DRAM_MOD3_SKX(raw)) { + case IMC_SAD_DRAM_MOD3_MODE_45t6: + rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t6; + break; + case IMC_SAD_DRAM_MOD3_MODE_45t8: + rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t8; + break; + case IMC_SAD_DRAM_MOD3_MODE_45t12: + rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t12; + break; + default: + sad->isad_valid |= IMC_SAD_V_BAD_MOD3; + break; + } +} + +static void +imc_sad_fill_rule_interleave(imc_t *imc, imc_sad_rule_t *rule, uint32_t raw) +{ + uint_t i; + uint32_t mlen, mbase, skipbits, skipafter; + + rule->isr_raw_interleave = raw; + + /* + * Right now all architectures always have the maximum number of SAD + * interleave targets. + */ + rule->isr_ntargets = IMC_MAX_SAD_INTERLEAVE; + + /* + * Sandy Bridge has a gap in the interleave list due to the fact that it + * uses a smaller length. + */ + if (imc->imc_gen > IMC_GEN_SANDY) { + mlen = IMC_SAD_ILEAVE_IVB_SKX_LEN; + mbase = IMC_SAD_ILEAVE_IVB_SKX_MASK; + skipbits = skipafter = 0; + } else { + mlen = IMC_SAD_ILEAVE_SNB_LEN; + mbase = IMC_SAD_ILEAVE_SNB_MASK; + skipbits = 2; + skipafter = 4; + } + + for (i = 0; i < rule->isr_ntargets; i++) { + uint32_t mask, shift; + + shift = i * mlen; + if (i >= skipafter) + shift += skipbits; + mask = mbase << shift; + rule->isr_targets[i] = (raw & mask) >> shift; + } +} + +static void +imc_sad_read_dram_rules(imc_t *imc, imc_sad_t *sad) +{ + uint_t i; + off_t off; + + sad->isad_nrules = imc->imc_gen_data->igd_sad_ndram_rules; + for (i = 0, off = imc->imc_gen_data->igd_sad_dram_offset; + i < sad->isad_nrules; i++, off += sizeof (uint64_t)) { + uint32_t dram, interleave; + imc_sad_rule_t *rule = &sad->isad_rules[i]; + + dram = pci_config_get32(sad->isad_dram->istub_cfgspace, off); + interleave = pci_config_get32(sad->isad_dram->istub_cfgspace, + off + 4); + + if (dram == PCI_EINVAL32 || interleave == PCI_EINVAL32) { + sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ; + return; + } + + imc_sad_fill_rule(imc, sad, rule, dram); + imc_sad_fill_rule_interleave(imc, rule, interleave); + } +} + +static void +imc_sad_decode_mcroute(imc_t *imc, imc_sad_t *sad) +{ + uint_t i; + imc_sad_mcroute_table_t *mc = &sad->isad_mcroute; + + if (imc->imc_gen < IMC_GEN_SKYLAKE) + return; + if (sad->isad_valid != 0) + return; + + mc->ismc_nroutes = IMC_MAX_SAD_MCROUTES; + for (i = 0; i < IMC_MAX_SAD_MCROUTES; i++) { + uint_t chanoff, ringoff; + + ringoff = i * IMC_MC_ROUTE_RING_BITS; + chanoff = i * IMC_MC_ROUTE_CHAN_BITS + IMC_MC_ROUTE_CHAN_OFFSET; + + mc->ismc_mcroutes[i].ismce_imc = (mc->ismc_raw_mcroute >> + ringoff) & IMC_MC_ROUTE_RING_MASK; + mc->ismc_mcroutes[i].ismce_pchannel = (mc->ismc_raw_mcroute >> + chanoff) & IMC_MC_ROUTE_CHAN_MASK; + } +} + +/* + * Initialize the SAD. To do this we have to do a few different things: + * + * 1. Determine where the top of low and high memory is. + * 2. 
Read and decode all of the rules for the SAD + * 3. On systems with a route table, decode the raw routes + * + * At this point in time, we treat TOLM and TOHM as a per-socket construct, even + * though it really should be global, this just makes life a bit simpler. + */ +static void +imc_decoder_init_sad(imc_t *imc) +{ + uint_t i; + + for (i = 0; i < imc->imc_nsockets; i++) { + imc_sad_read_tohm(imc, &imc->imc_sockets[i].isock_sad); + imc_sad_read_dram_rules(imc, &imc->imc_sockets[i].isock_sad); + imc_sad_decode_mcroute(imc, &imc->imc_sockets[i].isock_sad); + } +} + +static void +imc_tad_fill_rule(imc_t *imc, imc_tad_t *tad, imc_tad_rule_t *prev, + imc_tad_rule_t *rule, uint32_t val) +{ + uint64_t limit; + + limit = IMC_TAD_LIMIT(val); + rule->itr_limit = (limit << IMC_TAD_LIMIT_SHIFT) + + IMC_TAD_LIMIT_EXCLUSIVE; + rule->itr_raw = val; + + switch (IMC_TAD_SOCK_WAY(val)) { + case IMC_TAD_SOCK_WAY_1: + rule->itr_sock_way = 1; + break; + case IMC_TAD_SOCK_WAY_2: + rule->itr_sock_way = 2; + break; + case IMC_TAD_SOCK_WAY_4: + rule->itr_sock_way = 4; + break; + case IMC_TAD_SOCK_WAY_8: + rule->itr_sock_way = 8; + break; + } + + rule->itr_chan_way = IMC_TAD_CHAN_WAY(val) + 1; + rule->itr_sock_gran = IMC_TAD_GRAN_64B; + rule->itr_chan_gran = IMC_TAD_GRAN_64B; + + /* + * Starting with Skylake the targets that are used are no longer part of + * the TAD. Those come from the IMC route table. + */ + if (imc->imc_gen >= IMC_GEN_SKYLAKE) { + rule->itr_ntargets = 0; + return; + } + + rule->itr_ntargets = IMC_TAD_SNB_BRD_NTARGETS; + rule->itr_targets[0] = IMC_TAD_TARG0(val); + rule->itr_targets[1] = IMC_TAD_TARG1(val); + rule->itr_targets[2] = IMC_TAD_TARG2(val); + rule->itr_targets[3] = IMC_TAD_TARG3(val); + + if (prev == NULL) { + rule->itr_base = 0; + } else { + rule->itr_base = prev->itr_limit + 1; + } +} + +static void +imc_tad_fill_skx(imc_t *imc, imc_tad_t *tad, imc_tad_rule_t *rule, + uint32_t val) +{ + uint64_t base; + + rule->itr_raw_gran = val; + base = IMC_TAD_BASE_BASE(val); + rule->itr_base = base << IMC_TAD_BASE_SHIFT; + + switch (IMC_TAD_BASE_CHAN_GRAN(val)) { + case IMC_TAD_BASE_CHAN_GRAN_64B: + rule->itr_sock_gran = IMC_TAD_GRAN_64B; + break; + case IMC_TAD_BASE_CHAN_GRAN_256B: + rule->itr_sock_gran = IMC_TAD_GRAN_256B; + break; + case IMC_TAD_BASE_CHAN_GRAN_4KB: + rule->itr_sock_gran = IMC_TAD_GRAN_4KB; + break; + default: + tad->itad_valid |= IMC_TAD_V_BAD_CHAN_GRAN; + return; + } + + switch (IMC_TAD_BASE_SOCK_GRAN(val)) { + case IMC_TAD_BASE_SOCK_GRAN_64B: + rule->itr_sock_gran = IMC_TAD_GRAN_64B; + break; + case IMC_TAD_BASE_SOCK_GRAN_256B: + rule->itr_sock_gran = IMC_TAD_GRAN_256B; + break; + case IMC_TAD_BASE_SOCK_GRAN_4KB: + rule->itr_sock_gran = IMC_TAD_GRAN_4KB; + break; + case IMC_TAD_BASE_SOCK_GRAN_1GB: + rule->itr_sock_gran = IMC_TAD_GRAN_1GB; + break; + } +} + +/* + * When mirroring is enabled, at least in Sandy Bridge to Broadwell, it's + * suggested that the channel wayness will take this into account and therefore + * should be accurately reflected. + */ +static void +imc_tad_read_rules(imc_t *imc, imc_tad_t *tad) +{ + uint_t i; + off_t baseoff; + imc_tad_rule_t *prev; + + tad->itad_nrules = imc->imc_gen_data->igd_tad_nrules; + for (i = 0, baseoff = imc->imc_gen_data->igd_tad_rule_offset, + prev = NULL; i < tad->itad_nrules; + i++, baseoff += sizeof (uint32_t)) { + uint32_t val; + off_t off; + imc_tad_rule_t *rule = &tad->itad_rules[i]; + + /* + * On Skylake, the TAD rules are split among two registers. The + * latter set mimics what exists on pre-Skylake. 
+ */ + if (imc->imc_gen >= IMC_GEN_SKYLAKE) { + off = baseoff + IMC_SKX_WAYNESS_OFFSET; + } else { + off = baseoff; + } + + val = pci_config_get32(tad->itad_stub->istub_cfgspace, off); + if (val == PCI_EINVAL32) { + tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ; + return; + } + + imc_tad_fill_rule(imc, tad, prev, rule, val); + prev = rule; + if (imc->imc_gen < IMC_GEN_SKYLAKE) + continue; + + val = pci_config_get32(tad->itad_stub->istub_cfgspace, baseoff); + if (val == PCI_EINVAL32) { + tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ; + return; + } + + imc_tad_fill_skx(imc, tad, rule, val); + } +} + +/* + * Check for features which change how decoding works. + */ +static void +imc_tad_read_features(imc_t *imc, imc_tad_t *tad, imc_mc_t *mc) +{ + uint32_t val; + + /* + * Determine whether or not lockstep mode or mirroring are enabled. + * These change the behavior of how we're supposed to interpret channel + * wayness. Lockstep is available in the TAD's features. Mirroring is + * available on the IMC's features. This isn't present in Skylake+. On + * Skylake Mirorring is a property of the SAD rule and there is no + * lockstep. + */ + switch (imc->imc_gen) { + case IMC_GEN_SANDY: + case IMC_GEN_IVY: + case IMC_GEN_HASWELL: + case IMC_GEN_BROADWELL: + val = pci_config_get32(tad->itad_stub->istub_cfgspace, + imc->imc_gen_data->igd_tad_sysdef); + if (val == PCI_EINVAL32) { + tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ; + return; + } + if (IMC_TAD_SYSDEF_LOCKSTEP(val)) { + tad->itad_flags |= IMC_TAD_FLAG_LOCKSTEP; + } + + val = pci_config_get32(mc->icn_main1->istub_cfgspace, + imc->imc_gen_data->igd_mc_mirror); + if (val == PCI_EINVAL32) { + tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ; + return; + } + if (IMC_MC_MIRROR_SNB_BRD(val)) { + tad->itad_flags |= IMC_TAD_FLAG_MIRROR; + } + break; + default: + break; + } + + /* + * Now, go through and look at values that'll change how we do the + * channel index and adddress calculation. These are only present + * between Ivy Bridge and Broadwell. They don't exist on Sandy Bridge + * and they don't exist on Skylake+. 
+ */ + switch (imc->imc_gen) { + case IMC_GEN_IVY: + case IMC_GEN_HASWELL: + case IMC_GEN_BROADWELL: + val = pci_config_get32(tad->itad_stub->istub_cfgspace, + imc->imc_gen_data->igd_tad_sysdef2); + if (val == PCI_EINVAL32) { + tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ; + return; + } + if (IMC_TAD_SYSDEF2_SHIFTUP(val)) { + tad->itad_flags |= IMC_TAD_FLAG_CHANSHIFT; + } + if (IMC_TAD_SYSDEF2_SHIFTUP(val)) { + tad->itad_flags |= IMC_TAD_FLAG_CHANHASH; + } + break; + default: + break; + } +} + +/* + * Read the IMC channel interleave records + */ +static void +imc_tad_read_interleave(imc_t *imc, imc_channel_t *chan) +{ + uint_t i; + off_t off; + + chan->ich_ntad_offsets = imc->imc_gen_data->igd_tad_nrules; + for (i = 0, off = imc->imc_gen_data->igd_tad_chan_offset; + i < chan->ich_ntad_offsets; i++, off += sizeof (uint32_t)) { + uint32_t val; + uint64_t offset; + + val = pci_config_get32(chan->ich_desc->istub_cfgspace, + off); + if (val == PCI_EINVAL32) { + chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ; + return; + } + + if (imc->imc_gen >= IMC_GEN_SKYLAKE) { + offset = IMC_TADCHAN_OFFSET_SKX(val); + } else { + offset = IMC_TADCHAN_OFFSET_SNB_BRD(val); + } + + chan->ich_tad_offsets[i] = offset << IMC_TADCHAN_OFFSET_SHIFT; + chan->ich_tad_offsets_raw[i] = val; + } +} + +static void +imc_decoder_init_tad(imc_t *imc) +{ + uint_t i; + + for (i = 0; i < imc->imc_nsockets; i++) { + uint_t j; + + for (j = 0; j < imc->imc_sockets[i].isock_ntad; j++) { + imc_tad_read_features(imc, + &imc->imc_sockets[i].isock_tad[j], + &imc->imc_sockets[i].isock_imcs[j]); + imc_tad_read_rules(imc, + &imc->imc_sockets[i].isock_tad[j]); + } + } + + for (i = 0; i < imc->imc_nsockets; i++) { + uint_t j; + imc_socket_t *sock = &imc->imc_sockets[i]; + + for (j = 0; j < imc->imc_sockets[i].isock_nimc; j++) { + uint_t k; + imc_mc_t *mc = &sock->isock_imcs[j]; + + for (k = 0; k < mc->icn_nchannels; k++) { + imc_channel_t *chan = &mc->icn_channels[k]; + imc_tad_read_interleave(imc, chan); + } + } + } +} + +static void +imc_rir_read_ileave_offsets(imc_t *imc, imc_channel_t *chan, + imc_rank_ileave_t *rank, uint_t rirno, boolean_t contig) +{ + uint_t i; + off_t off, incr; + + /* + * Rank interleave offset registers come in two forms. Either they are + * contiguous for a given wayness, meaning that all of the entries for + * wayness zero are contiguous, or they are sparse, meaning that there + * is a bank for entry zero for all wayness, then entry one for all + * wayness, etc. 
+ */ + if (contig) { + off = imc->imc_gen_data->igd_rir_ileave_offset + + (rirno * imc->imc_gen_data->igd_rir_nileaves * + sizeof (uint32_t)); + incr = sizeof (uint32_t); + } else { + off = imc->imc_gen_data->igd_rir_ileave_offset + + (rirno * sizeof (uint32_t)); + incr = imc->imc_gen_data->igd_rir_nileaves * sizeof (uint32_t); + } + for (i = 0; i < rank->irle_nentries; i++, off += incr) { + uint32_t val; + uint64_t offset; + imc_rank_ileave_entry_t *ent = &rank->irle_entries[i]; + + val = pci_config_get32(chan->ich_desc->istub_cfgspace, off); + if (val == PCI_EINVAL32) { + chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ; + return; + } + + switch (imc->imc_gen) { + case IMC_GEN_BROADWELL: + ent->irle_target = IMC_RIR_OFFSET_TARGET_BRD(val); + break; + default: + ent->irle_target = IMC_RIR_OFFSET_TARGET(val); + break; + } + if (imc->imc_gen >= IMC_GEN_HASWELL) { + offset = IMC_RIR_OFFSET_OFFSET_HAS_SKX(val); + } else { + offset = IMC_RIR_OFFSET_OFFSET_SNB_IVB(val); + } + ent->irle_offset = offset << IMC_RIR_OFFSET_SHIFT; + } +} + +static void +imc_rir_read_wayness(imc_t *imc, imc_channel_t *chan) +{ + uint_t i; + off_t off; + + chan->ich_nrankileaves = imc->imc_gen_data->igd_rir_nways; + for (i = 0, off = imc->imc_gen_data->igd_rir_way_offset; + i < chan->ich_nrankileaves; i++, off += sizeof (uint32_t)) { + uint32_t val; + uint64_t lim; + imc_rank_ileave_t *ent = &chan->ich_rankileaves[i]; + + val = pci_config_get32(chan->ich_desc->istub_cfgspace, off); + if (val == PCI_EINVAL32) { + chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ; + return; + } + + ent->irle_raw = val; + ent->irle_enabled = IMC_RIR_WAYNESS_ENABLED(val) != 0; + ent->irle_nways = 1 << IMC_RIR_WAYNESS_WAY(val); + ent->irle_nwaysbits = IMC_RIR_WAYNESS_WAY(val); + if (imc->imc_gen >= IMC_GEN_HASWELL) { + lim = IMC_RIR_LIMIT_HAS_SKX(val); + } else { + lim = IMC_RIR_LIMIT_SNB_IVB(val); + } + + ent->irle_limit = (lim << IMC_RIR_LIMIT_SHIFT) + + IMC_RIR_LIMIT_EXCLUSIVE; + + ent->irle_nentries = imc->imc_gen_data->igd_rir_nileaves; + if (imc->imc_gen >= IMC_GEN_SKYLAKE) { + imc_rir_read_ileave_offsets(imc, chan, ent, i, B_FALSE); + } else { + imc_rir_read_ileave_offsets(imc, chan, ent, i, B_TRUE); + } + } +} + +static void +imc_decoder_init_rir(imc_t *imc) +{ + uint_t i; + + for (i = 0; i < imc->imc_nsockets; i++) { + uint_t j; + imc_socket_t *sock = &imc->imc_sockets[i]; + + for (j = 0; j < imc->imc_sockets[i].isock_nimc; j++) { + uint_t k; + imc_mc_t *mc = &sock->isock_imcs[j]; + + for (k = 0; k < mc->icn_nchannels; k++) { + imc_channel_t *chan = &mc->icn_channels[k]; + imc_rir_read_wayness(imc, chan); + } + } + } +} + +static cmi_errno_t +imc_mc_patounum(void *arg, uint64_t pa, uint8_t valid_hi, uint8_t valid_lo, + uint32_t synd, int syndtype, mc_unum_t *unump) +{ + imc_t *imc = arg; + uint_t i; + imc_decode_state_t dec; + + bzero(&dec, sizeof (dec)); + if (!imc_decode_pa(imc, pa, &dec)) { + switch (dec.ids_fail) { + case IMC_DECODE_F_LEGACY_RANGE: + case IMC_DECODE_F_OUTSIDE_DRAM: + return (CMIERR_MC_NOTDIMMADDR); + default: + return (CMIERR_MC_BADSTATE); + } + } + + unump->unum_board = 0; + /* + * The chip id needs to be in the order that the OS expects it, which + * may not be our order. 
+ */ + for (i = 0; i < imc->imc_nsockets; i++) { + if (imc->imc_spointers[i] == dec.ids_socket) + break; + } + if (i == imc->imc_nsockets) { + return (CMIERR_MC_BADSTATE); + } + unump->unum_chip = i; + unump->unum_mc = dec.ids_tadid; + unump->unum_chan = dec.ids_channelid; + unump->unum_cs = dec.ids_dimmid; + unump->unum_rank = dec.ids_rankid; + unump->unum_offset = dec.ids_rankaddr; + for (i = 0; i < MC_UNUM_NDIMM; i++) { + unump->unum_dimms[i] = MC_INVALNUM; + } + + return (CMI_SUCCESS); +} + +static cmi_errno_t +imc_mc_unumtopa(void *arg, mc_unum_t *unum, nvlist_t *nvl, uint64_t *pa) +{ + return (CMIERR_UNKNOWN); +} + +static const cmi_mc_ops_t imc_mc_ops = { + .cmi_mc_patounum = imc_mc_patounum, + .cmi_mc_unumtopa = imc_mc_unumtopa +}; + +/* + * This is where we really finish attaching and become open for business. This + * occurs once we have all of the expected stubs attached. Here's where all of + * the real fun begins. + */ +static void +imc_attach_complete(void *arg) +{ + imc_t *imc = arg; + cmi_errno_t err; + + imc_set_gen_data(imc); + + /* + * On SKX and newer, we can fail to map PCI buses at this point due to + * bad PCIe reads. + */ + if (!imc_map_stubs(imc)) { + goto done; + } + + imc_fixup_stubs(imc); + imc_map_sockets(imc); + + if (!imc_create_minors(imc)) { + goto done; + } + + imc_fill_data(imc); + imc_nvl_create(imc); + + /* + * Gather additional information that we need so that we can properly + * initialize the memory decoder and encoder. + */ + imc_decoder_init_sad(imc); + imc_decoder_init_tad(imc); + imc_decoder_init_rir(imc); + + /* + * Register decoder functions. This may fail. If so, try and complain + * loudly, but stay active to allow other data to be useful. Register a + * global handle. + */ + if ((err = cmi_mc_register_global(&imc_mc_ops, imc)) != CMI_SUCCESS) { + imc->imc_flags |= IMC_F_MCREG_FAILED; + dev_err(imc->imc_dip, CE_WARN, "failed to register memory " + "decoding operations: 0x%x", err); + } + +done: + mutex_enter(&imc->imc_lock); + imc->imc_flags &= IMC_F_ATTACH_DISPATCHED; + imc->imc_flags |= IMC_F_ATTACH_COMPLETE; + mutex_exit(&imc->imc_lock); +} + +static int +imc_stub_comparator(const void *l, const void *r) +{ + const imc_stub_t *sl = l, *sr = r; + if (sl->istub_bus > sr->istub_bus) + return (1); + if (sl->istub_bus < sr->istub_bus) + return (-1); + if (sl->istub_dev > sr->istub_dev) + return (1); + if (sl->istub_dev < sr->istub_dev) + return (-1); + if (sl->istub_func > sr->istub_func) + return (1); + if (sl->istub_func < sr->istub_func) + return (-1); + return (0); +} + +static int +imc_stub_scan_cb(dev_info_t *dip, void *arg) +{ + int vid, did; + const imc_stub_table_t *table; + imc_t *imc = arg; + int *regs; + uint_t i, nregs; + + if (dip == ddi_root_node()) { + return (DDI_WALK_CONTINUE); + } + + /* + * Get the dev info name. PCI devices will always be children of PCI + * devices today on x86. If we reach something that has a device name + * that's not PCI, then we can prune it's children. + */ + if (strncmp("pci", ddi_get_name(dip), 3) != 0) { + return (DDI_WALK_PRUNECHILD); + } + + /* + * Get the device and vendor ID and see if this is something the imc + * knows about or cares about. 
+ */ + vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "vendor-id", PCI_EINVAL16); + did = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "device-id", PCI_EINVAL16); + if (vid == PCI_EINVAL16 || did == PCI_EINVAL16) { + return (DDI_WALK_CONTINUE); + } + + if (vid != IMC_PCI_VENDOR_INTC) { + return (DDI_WALK_PRUNECHILD); + } + + if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "reg", ®s, &nregs) != DDI_PROP_SUCCESS) { + return (DDI_WALK_CONTINUE); + } + + if (nregs == 0) { + ddi_prop_free(regs); + return (DDI_WALK_CONTINUE); + } + + + table = NULL; + for (i = 0; i < ARRAY_SIZE(imc_stub_table); i++) { + if (imc_stub_table[i].imcs_devid == did && + imc_stub_table[i].imcs_pcidev == PCI_REG_DEV_G(regs[0]) && + imc_stub_table[i].imcs_pcifunc == PCI_REG_FUNC_G(regs[0])) { + table = &imc_stub_table[i]; + break; + } + } + ddi_prop_free(regs); + + /* + * Not a match, not interesting. + */ + if (table == NULL) { + return (DDI_WALK_CONTINUE); + } + + mutex_enter(&imc->imc_lock); + imc->imc_nscanned++; + mutex_exit(&imc->imc_lock); + + return (DDI_WALK_CONTINUE); +} + +/* + * From here, go through and see how many of the devices that we know about. + */ +static void +imc_stub_scan(void *arg) +{ + imc_t *imc = arg; + boolean_t dispatch = B_FALSE; + + /* + * Zero out the scan results in case we've been detached and reattached. + */ + mutex_enter(&imc->imc_lock); + imc->imc_nscanned = 0; + mutex_exit(&imc->imc_lock); + + ddi_walk_devs(ddi_root_node(), imc_stub_scan_cb, imc); + + mutex_enter(&imc->imc_lock); + imc->imc_flags |= IMC_F_SCAN_COMPLETE; + imc->imc_flags &= ~IMC_F_SCAN_DISPATCHED; + + /* + * If the scan found no nodes, then that means that we're on a hardware + * platform that we don't support. Therefore, there's no reason to do + * anything here. + */ + if (imc->imc_nscanned == 0) { + imc->imc_flags |= IMC_F_UNSUP_PLATFORM; + mutex_exit(&imc->imc_lock); + return; + } + + if (avl_numnodes(&imc->imc_stubs) == imc->imc_nscanned) { + imc->imc_flags |= IMC_F_ATTACH_DISPATCHED; + dispatch = B_TRUE; + } + + mutex_exit(&imc->imc_lock); + + if (dispatch) { + (void) ddi_taskq_dispatch(imc->imc_taskq, imc_attach_complete, + imc, DDI_SLEEP); + } +} + +/* + * By default, refuse to allow stubs to detach. + */ +int +imc_detach_stub(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + imc_stub_t *stub; + imc_t *imc = imc_data; + + mutex_enter(&imc->imc_lock); + + /* + * By default, we do not allow stubs to detach. However, if the driver + * has attached to devices on a platform it doesn't recognize or + * support or if the override flag has been set, then allow detach to + * proceed. + */ + if ((imc->imc_flags & IMC_F_UNSUP_PLATFORM) == 0 && + imc_allow_detach == 0) { + mutex_exit(&imc->imc_lock); + return (DDI_FAILURE); + } + + for (stub = avl_first(&imc->imc_stubs); stub != NULL; + stub = AVL_NEXT(&imc->imc_stubs, stub)) { + if (stub->istub_dip == dip) { + break; + } + } + + /* + * A device was attached to us that we somehow don't know about. Allow + * this to proceed. 
+ */ + if (stub == NULL) { + mutex_exit(&imc->imc_lock); + return (DDI_SUCCESS); + } + + pci_config_teardown(&stub->istub_cfgspace); + avl_remove(&imc->imc_stubs, stub); + kmem_free(stub, sizeof (imc_stub_t)); + mutex_exit(&imc->imc_lock); + + return (DDI_SUCCESS); +} + +int +imc_attach_stub(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + imc_stub_t *stub, *lookup; + int did, vid, *regs; + uint_t i, nregs; + const imc_stub_table_t *table; + avl_index_t idx; + boolean_t dispatch = B_FALSE; + imc_t *imc = imc_data; + + if (cmd != DDI_ATTACH) { + return (DDI_FAILURE); + } + + /* + * We've been asked to attach a stub. First, determine if this is even a + * PCI device that we should care about. Then, append it to our global + * list and kick off the configuration task. Note that we do this + * configuration task in a taskq so that we don't interfere with the + * normal attach / detach path processing. + */ + if (strncmp("pci", ddi_get_name(dip), 3) != 0) { + return (DDI_FAILURE); + } + + /* + * Get the device and vendor ID and see if this is something the imc + * knows about or cares about. + */ + vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "vendor-id", PCI_EINVAL16); + did = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "device-id", PCI_EINVAL16); + if (vid == PCI_EINVAL16 || did == PCI_EINVAL16) { + return (DDI_FAILURE); + } + + /* + * Only accept INTC parts on the imc driver. + */ + if (vid != IMC_PCI_VENDOR_INTC) { + return (DDI_FAILURE); + } + + if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "reg", ®s, &nregs) != DDI_PROP_SUCCESS) { + return (DDI_FAILURE); + } + + if (nregs == 0) { + ddi_prop_free(regs); + return (DDI_FAILURE); + } + + /* + * Determine if this matches a known device. + */ + table = NULL; + for (i = 0; i < ARRAY_SIZE(imc_stub_table); i++) { + if (imc_stub_table[i].imcs_devid == did && + imc_stub_table[i].imcs_pcidev == PCI_REG_DEV_G(regs[0]) && + imc_stub_table[i].imcs_pcifunc == PCI_REG_FUNC_G(regs[0])) { + table = &imc_stub_table[i]; + break; + } + } + + if (i == ARRAY_SIZE(imc_stub_table)) { + ddi_prop_free(regs); + return (DDI_FAILURE); + } + + /* + * We've found something. Make sure the generation matches our current + * one. If it does, construct the entry and append it to the list. 
+ */ + mutex_enter(&imc->imc_lock); + if (imc->imc_gen != IMC_GEN_UNKNOWN && imc->imc_gen != + table->imcs_gen) { + mutex_exit(&imc->imc_lock); + ddi_prop_free(regs); + dev_err(dip, CE_WARN, "Encountered IMC stub device (%u/%u) " + "that has different hardware generation (%u) from current " + "generation (%u)", vid, did, table->imcs_gen, imc->imc_gen); + return (DDI_FAILURE); + } else { + imc->imc_gen = table->imcs_gen; + } + mutex_exit(&imc->imc_lock); + + stub = kmem_zalloc(sizeof (imc_stub_t), KM_SLEEP); + stub->istub_dip = dip; + stub->istub_vid = vid; + stub->istub_did = did; + stub->istub_bus = PCI_REG_BUS_G(regs[0]); + stub->istub_dev = PCI_REG_DEV_G(regs[0]); + stub->istub_func = PCI_REG_FUNC_G(regs[0]); + ddi_prop_free(regs); + stub->istub_table = table; + + if (pci_config_setup(dip, &stub->istub_cfgspace) != DDI_SUCCESS) { + kmem_free(stub, sizeof (stub)); + dev_err(dip, CE_WARN, "Failed to set up PCI config space " + "for IMC stub device %s (%u/%u)", ddi_node_name(dip), + vid, did); + return (DDI_FAILURE); + } + + mutex_enter(&imc->imc_lock); + if ((lookup = avl_find(&imc->imc_stubs, stub, &idx)) != NULL) { + dev_err(dip, CE_WARN, "IMC stub %s (%u/%u) has duplicate " + "bdf %u/%u/%u with %s (%u/%u), not attaching", + ddi_node_name(imc->imc_dip), vid, did, + stub->istub_bus, stub->istub_dev, stub->istub_func, + ddi_node_name(lookup->istub_dip), lookup->istub_vid, + lookup->istub_did); + mutex_exit(&imc->imc_lock); + pci_config_teardown(&stub->istub_cfgspace); + kmem_free(stub, sizeof (stub)); + + return (DDI_FAILURE); + } + avl_insert(&imc->imc_stubs, stub, idx); + + if ((imc->imc_flags & IMC_F_ALL_FLAGS) == IMC_F_SCAN_COMPLETE && + avl_numnodes(&imc->imc_stubs) == imc->imc_nscanned) { + imc->imc_flags |= IMC_F_ATTACH_DISPATCHED; + dispatch = B_TRUE; + } + mutex_exit(&imc->imc_lock); + + if (dispatch) { + (void) ddi_taskq_dispatch(imc->imc_taskq, imc_attach_complete, + imc, DDI_SLEEP); + } + + return (DDI_SUCCESS); +} + +static int +imc_open(dev_t *devp, int flag, int otyp, cred_t *credp) +{ + imc_t *imc = imc_data; + + if ((flag & (FEXCL | FNDELAY)) != 0) + return (EINVAL); + + if (otyp != OTYP_CHR) + return (EINVAL); + + mutex_enter(&imc->imc_lock); + + if ((imc->imc_flags & IMC_F_UNSUP_PLATFORM) != 0) { + mutex_exit(&imc->imc_lock); + return (ENOTSUP); + } + + /* + * It's possible that someone has come in during the window between when + * we've created the minor node and when we've finished doing work. + */ + if ((imc->imc_flags & IMC_F_ATTACH_COMPLETE) == 0) { + mutex_exit(&imc->imc_lock); + return (EAGAIN); + } + + /* + * It's not clear how someone would get a minor that we didn't create. + * But be paranoid and make sure. + */ + if (getminor(*devp) >= imc->imc_nsockets) { + mutex_exit(&imc->imc_lock); + return (EINVAL); + } + + /* + * Make sure this socket entry has been filled in. 
+ */ + if (imc->imc_spointers[getminor(*devp)] == NULL) { + mutex_exit(&imc->imc_lock); + return (EINVAL); + } + + mutex_exit(&imc->imc_lock); + + return (0); +} + +static void +imc_ioctl_decode(imc_t *imc, mc_encode_ioc_t *encode) +{ + imc_decode_state_t dec; + uint_t i; + + bzero(&dec, sizeof (dec)); + if (!imc_decode_pa(imc, encode->mcei_pa, &dec)) { + encode->mcei_err = (uint32_t)dec.ids_fail; + encode->mcei_errdata = dec.ids_fail_data; + return; + } + + encode->mcei_errdata = 0; + encode->mcei_err = 0; + encode->mcei_board = 0; + for (i = 0; i < imc->imc_nsockets; i++) { + if (imc->imc_spointers[i] == dec.ids_socket) + break; + } + encode->mcei_chip = i; + encode->mcei_mc = dec.ids_tadid; + encode->mcei_chan = dec.ids_channelid; + encode->mcei_dimm = dec.ids_dimmid; + encode->mcei_rank_addr = dec.ids_rankaddr; + encode->mcei_rank = dec.ids_rankid; + encode->mcei_row = UINT32_MAX; + encode->mcei_column = UINT32_MAX; + encode->mcei_pad = 0; +} + +static int +imc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + int ret; + minor_t m; + mc_snapshot_info_t info; + mc_encode_ioc_t encode; + imc_t *imc = imc_data; + imc_socket_t *sock; + + mutex_enter(&imc->imc_lock); + m = getminor(dev); + if (m >= imc->imc_nsockets) { + ret = EINVAL; + goto done; + } + sock = imc->imc_spointers[m]; + if (sock == NULL) { + ret = EINVAL; + goto done; + } + + /* + * Note, other memory controller drivers don't check mode for reading + * data nor do they care who can read it from a credential perspective. + * As such we don't either at this time. + */ + switch (cmd) { + case MC_IOC_SNAPSHOT_INFO: + imc_nvl_pack(sock, B_FALSE); + if (sock->isock_buf == NULL) { + ret = EIO; + break; + } + + info.mcs_size = sock->isock_buflen; + info.mcs_gen = sock->isock_gen; + + if (ddi_copyout(&info, (void *)arg, sizeof (info), mode) != 0) { + ret = EFAULT; + break; + } + + ret = 0; + break; + case MC_IOC_SNAPSHOT: + imc_nvl_pack(sock, B_FALSE); + if (sock->isock_buf == NULL) { + ret = EIO; + break; + } + + if (ddi_copyout(sock->isock_buf, (void *)arg, + sock->isock_buflen, mode) != 0) { + ret = EFAULT; + break; + } + + ret = 0; + break; + case MC_IOC_DECODE_SNAPSHOT_INFO: + imc_decoder_pack(imc); + if (imc->imc_decoder_buf == NULL) { + ret = EIO; + break; + } + + info.mcs_size = imc->imc_decoder_len; + info.mcs_gen = imc->imc_spointers[0]->isock_gen; + + if (ddi_copyout(&info, (void *)arg, sizeof (info), mode) != 0) { + ret = EFAULT; + break; + } + + ret = 0; + break; + case MC_IOC_DECODE_SNAPSHOT: + imc_decoder_pack(imc); + if (imc->imc_decoder_buf == NULL) { + ret = EIO; + break; + } + + if (ddi_copyout(imc->imc_decoder_buf, (void *)arg, + imc->imc_decoder_len, mode) != 0) { + ret = EFAULT; + break; + } + + ret = 0; + break; + case MC_IOC_DECODE_PA: + if (crgetzoneid(credp) != GLOBAL_ZONEID || + drv_priv(credp) != 0) { + ret = EPERM; + break; + } + + if (ddi_copyin((void *)arg, &encode, sizeof (encode), + mode & FKIOCTL) != 0) { + ret = EPERM; + break; + } + + imc_ioctl_decode(imc, &encode); + ret = 0; + + if (ddi_copyout(&encode, (void *)arg, sizeof (encode), + mode & FKIOCTL) != 0) { + ret = EPERM; + break; + } + break; + default: + ret = EINVAL; + goto done; + } + +done: + mutex_exit(&imc->imc_lock); + return (ret); +} + +static int +imc_close(dev_t dev, int flag, int otyp, cred_t *credp) +{ + return (0); +} + +static int +imc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + if (cmd != DDI_ATTACH) { + return (DDI_FAILURE); + } + + if (imc_data == NULL || imc_data->imc_dip != NULL) { + 
return (DDI_FAILURE); + } + + mutex_enter(&imc_data->imc_lock); + if ((imc_data->imc_taskq = ddi_taskq_create(dip, "imc", 1, + TASKQ_DEFAULTPRI, 0)) == NULL) { + mutex_exit(&imc_data->imc_lock); + return (DDI_FAILURE); + } + + imc_data->imc_dip = dip; + imc_data->imc_flags |= IMC_F_SCAN_DISPATCHED; + mutex_exit(&imc_data->imc_lock); + + (void) ddi_taskq_dispatch(imc_data->imc_taskq, imc_stub_scan, imc_data, + DDI_SLEEP); + + return (DDI_SUCCESS); +} + +/* + * We only export a single instance. + */ +static int +imc_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp) +{ + /* + * getinfo(9E) shouldn't be called if we're not attached. But be + * paranoid. + */ + if (imc_data == NULL || imc_data->imc_dip == NULL) { + return (DDI_FAILURE); + } + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *resultp = imc_data->imc_dip; + break; + case DDI_INFO_DEVT2INSTANCE: + *resultp = (void *)0; + break; + default: + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +static int +imc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) { + return (DDI_FAILURE); + } + + if (imc_data == NULL || imc_data->imc_dip) { + return (DDI_FAILURE); + } + + mutex_enter(&imc_data->imc_lock); + + /* + * While a scan or attach is outstanding, don't allow us to detach. + */ + if ((imc_data->imc_flags & + (IMC_F_SCAN_DISPATCHED | IMC_F_ATTACH_DISPATCHED)) != 0) { + mutex_exit(&imc_data->imc_lock); + return (DDI_FAILURE); + } + + /* + * Because the stub driver depends on the imc driver, we shouldn't be + * able to have any entries in this list when we detach. However, we + * check just to make sure. + */ + if (!avl_is_empty(&imc_data->imc_stubs)) { + mutex_exit(&imc_data->imc_lock); + return (DDI_FAILURE); + } + + nvlist_free(imc_data->imc_decoder_dump); + imc_data->imc_decoder_dump = NULL; + if (imc_data->imc_decoder_buf != NULL) { + kmem_free(imc_data->imc_decoder_buf, imc_data->imc_decoder_len); + imc_data->imc_decoder_buf = NULL; + imc_data->imc_decoder_len = 0; + } + + ddi_remove_minor_node(imc_data->imc_dip, NULL); + imc_data->imc_dip = NULL; + mutex_exit(&imc_data->imc_lock); + + ddi_taskq_wait(imc_data->imc_taskq); + ddi_taskq_destroy(imc_data->imc_taskq); + imc_data->imc_taskq = NULL; + + return (DDI_SUCCESS); +} + +static void +imc_free(void) +{ + if (imc_data == NULL) { + return; + } + + VERIFY(avl_is_empty(&imc_data->imc_stubs)); + avl_destroy(&imc_data->imc_stubs); + mutex_destroy(&imc_data->imc_lock); + kmem_free(imc_data, sizeof (imc_t)); + imc_data = NULL; +} + +static void +imc_alloc(void) +{ + imc_data = kmem_zalloc(sizeof (imc_t), KM_SLEEP); + + mutex_init(&imc_data->imc_lock, NULL, MUTEX_DRIVER, NULL); + avl_create(&imc_data->imc_stubs, imc_stub_comparator, + sizeof (imc_stub_t), offsetof(imc_stub_t, istub_link)); +} + +static struct cb_ops imc_cb_ops = { + .cb_open = imc_open, + .cb_close = imc_close, + .cb_strategy = nodev, + .cb_print = nodev, + .cb_dump = nodev, + .cb_read = nodev, + .cb_write = nodev, + .cb_ioctl = imc_ioctl, + .cb_devmap = nodev, + .cb_mmap = nodev, + .cb_segmap = nodev, + .cb_chpoll = nochpoll, + .cb_prop_op = ddi_prop_op, + .cb_flag = D_MP, + .cb_rev = CB_REV, + .cb_aread = nodev, + .cb_awrite = nodev +}; + +static struct dev_ops imc_dev_ops = { + .devo_rev = DEVO_REV, + .devo_refcnt = 0, + .devo_getinfo = imc_getinfo, + .devo_identify = nulldev, + .devo_probe = nulldev, + .devo_attach = imc_attach, + .devo_detach = imc_detach, + .devo_reset = nodev, + .devo_cb_ops = &imc_cb_ops, + .devo_quiesce = ddi_quiesce_not_needed +}; + 
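Editorial aside: imc_ioctl() above implements a two-step snapshot protocol — MC_IOC_SNAPSHOT_INFO reports the size and generation of the packed per-socket nvlist, and MC_IOC_SNAPSHOT then copies that buffer out. The minimal user-space sketch below (not part of this patch) shows how a consumer might drive that pair. The /dev/mc/mc0 device path and the <sys/mc.h> header as the home of mc_snapshot_info_t and the MC_IOC_* commands are assumptions for illustration, not taken from this change.

/*
 * Hypothetical consumer of the imc snapshot ioctls handled by imc_ioctl().
 * Device path and header locations are assumptions.
 */
#include <sys/types.h>
#include <sys/mc.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stropts.h>
#include <unistd.h>

int
main(void)
{
	mc_snapshot_info_t info;
	void *buf;
	int fd;

	/* One minor node exists per socket; this path is assumed. */
	if ((fd = open("/dev/mc/mc0", O_RDONLY)) < 0) {
		perror("open");
		return (1);
	}

	/* Step 1: ask the driver how large the packed XDR nvlist is. */
	if (ioctl(fd, MC_IOC_SNAPSHOT_INFO, &info) != 0) {
		perror("MC_IOC_SNAPSHOT_INFO");
		return (1);
	}

	if ((buf = malloc(info.mcs_size)) == NULL) {
		perror("malloc");
		return (1);
	}

	/* Step 2: copy out the snapshot; nvlist_unpack(3NVPAIR) can decode it. */
	if (ioctl(fd, MC_IOC_SNAPSHOT, buf) != 0) {
		perror("MC_IOC_SNAPSHOT");
		return (1);
	}

	(void) printf("snapshot: %u bytes, generation %u\n",
	    (unsigned int)info.mcs_size, (unsigned int)info.mcs_gen);
	free(buf);
	(void) close(fd);
	return (0);
}

Because imc_nvl_pack() returns early once isock_buf is set, the size reported by the first ioctl remains valid for the subsequent copy-out.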
+static struct modldrv imc_modldrv = { + .drv_modops = &mod_driverops, + .drv_linkinfo = "Intel Integrated Memory Controller Driver", + .drv_dev_ops = &imc_dev_ops +}; + +static struct modlinkage imc_modlinkage = { + .ml_rev = MODREV_1, + .ml_linkage = { &imc_modldrv, NULL } +}; + +int +_init(void) +{ + int ret; + + if ((ret = mod_install(&imc_modlinkage)) == 0) { + imc_alloc(); + } + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&imc_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int ret; + + if ((ret = mod_remove(&imc_modlinkage)) == 0) { + imc_free(); + } + return (ret); +} diff --git a/usr/src/uts/i86pc/io/imc/imc.conf b/usr/src/uts/i86pc/io/imc/imc.conf new file mode 100644 index 0000000000..7f55dc2cae --- /dev/null +++ b/usr/src/uts/i86pc/io/imc/imc.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +name="imc" parent="pseudo" instance=0; diff --git a/usr/src/uts/i86pc/io/imc/imc.h b/usr/src/uts/i86pc/io/imc/imc.h new file mode 100644 index 0000000000..7d07be20af --- /dev/null +++ b/usr/src/uts/i86pc/io/imc/imc.h @@ -0,0 +1,940 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _INTEL_IMC_H +#define _INTEL_IMC_H + +#include <sys/types.h> +#include <sys/bitmap.h> +#include <sys/list.h> +#include <sys/sunddi.h> + +/* + * This header file contains the definitions used for the various generations of + * the Intel IMC driver. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The maximum number of sockets that the IMC driver supports. This is currently + * determined by the Purley platforms (Skylake) which support up to 8 CPUs. + */ +#define IMC_MAX_SOCKETS 8 + +/* + * The maximum number of memory controllers that exist per socket. Currently all + * supported platforms (Sandy Bridge -> Skylake) support at most two. + */ +#define IMC_MAX_IMCPERSOCK 2 + +/* + * The maximum number of channels that exist per IMC. Currently Skylake supports + * 3 per IMC. On certain configurations of Haswell/Broadwell, there is only a + * single IMC which supports all 4 channels. + */ +#define IMC_MAX_CHANPERMC 4 + +/* + * The maximum number of DIMMs that exist per channel. On Skylake this is two + * DIMMs. However, Sandy Bridge through Broadwell support three. + */ +#define IMC_MAX_DIMMPERCHAN 3 + +/* + * The maximum number of rank disable bits per DIMM. This is currently + * consistent across all generations that have these bits. + */ +#define IMC_MAX_RANK_DISABLE 4 + +/* + * The number of different PCI buses that we need to record for a given + * platform. Pre-Skylake there are only two that are required, one for the IIO + * and one for the non-IIO. On Skylake, more PCI buses are used. 
+ */ +#define IMC_MAX_PCIBUSES 3 + +/* + * Macros to take apart the node id for a given processor. These assume that + * we're reading the nodeid from the UBox and not from the SAD control. + */ +#define IMC_NODEID_UBOX_MASK(x) ((x) & 0x7) + +/* + * On Ivy Bridge through Broadwell, the node id that is found in the SAD targets + * has the HA indicator as NodeID[2]. This means that the actual target node of + * the socket is NodeID[3] | NodeID[1:0]. + */ +#define IMC_NODEID_IVY_BRD_UPPER(x) BITX(x, 3, 3) +#define IMC_NODEID_IVY_BRD_LOWER(x) BITX(x, 1, 0) +#define IMC_NODEID_IVY_BRD_HA(x) BITX(x, 2, 2) + +/* + * Macros to take apart the MCMTR register bits that we care about. + */ +#define IMC_MCMTR_CLOSED_PAGE(x) BITX(x, 0, 0) +#define IMC_MCMTR_LOCKSTEP(x) BITX(x, 1, 1) +#define IMC_MCMTR_ECC_ENABLED(x) BITX(x, 2, 2) + +#define IMC_MCMTR_DDR4_HAS_BRD(x) BITX(x, 14, 14) + +/* + * Macros to take apart the dimmmtr_* registers in different generations. While + * there are similarities, these often end up different between generations and + * chips. These macros use a range of CPUs that they're valid for in the name. + * Macros with no suffix are valid for all currently supported CPUs. + */ + +#define IMC_REG_MC_MTR0 0x80 +#define IMC_REG_MC_MTR1 0x84 +#define IMC_REG_MC_MTR2 0x88 + +#define IMC_MTR_CA_WIDTH(x) BITX(x, 1, 0) +#define IMC_MTR_CA_BASE 10 +#define IMC_MTR_CA_MIN 10 +#define IMC_MTR_CA_MAX 12 + +#define IMC_MTR_RA_WIDTH(x) BITX(x, 4, 2) +#define IMC_MTR_RA_BASE 12 +#define IMC_MTR_RA_MIN 13 +#define IMC_MTR_RA_MAX 18 + +#define IMC_MTR_DENSITY_IVY_BRD(x) BITX(x, 6, 5) +#define IMC_MTR_DENSITY_SKX(x) BITX(x, 7, 5) + +#define IMC_MTR_WIDTH_IVB_HAS(x) BITX(x, 8, 7) +#define IMC_MTR_WIDTH_BRD_SKX(x) BITX(x, 9, 8) + +#define IMC_MTR_DDR_RANKS(x) BITX(x, 13, 12) +#define IMC_MTR_DDR_RANKS_MAX 4 +#define IMC_MTR_DDR_RANKS_MAX_HAS_SKX 8 + +#define IMC_MTR_PRESENT_SNB_BRD(x) BITX(x, 14, 14) +#define IMC_MTR_PRESENT_SKYLAKE(x) BITX(x, 15, 15) + +#define IMC_MTR_RANK_DISABLE(x) BITX(x, 19, 16) + +#define IMC_MTR_DDR4_ENABLE_HAS_BRD(x) BITX(x, 20, 20) +#define IMC_MTR_HDRL_HAS_SKX(x) BITX(x, 21, 21) +#define IMC_MTR_HDRL_PARITY_HAS_SKX(x) BITX(x, 22, 22) +#define IMC_MTR_3DSRANKS_HAS_SKX(x) BITX(x, 24, 23) + +/* + * Data for the RASENABLES register. + */ +#define IMC_MC_MIRROR_SNB_BRD(x) BITX(x, 0, 0) + +/* + * The maximum number of SAD rules that exist on all supported platforms. + */ +#define IMC_MAX_SAD_RULES 24 + +/* + * The maximum number of targets that can be interleaved in a sad rule. + */ +#define IMC_MAX_SAD_INTERLEAVE 8 + +/* + * The maximum number of route entries that exist in SAD. This is only used on + * SKX. + */ +#define IMC_MAX_SAD_MCROUTES 6 + +/* + * Definitions used to decode the MC Route table. Note that at this time this is + * very Skylake specific (as it's the only platform it's supported on). + */ +#define IMC_REG_SKX_SAD_MC_ROUTE_TABLE 0xb4 +#define IMC_MC_ROUTE_RING_BITS 3 +#define IMC_MC_ROUTE_RING_MASK 0x7 +#define IMC_MC_ROUTE_CHAN_BITS 2 +#define IMC_MC_ROUTE_CHAN_MASK 0x3 +#define IMC_MC_ROUTE_CHAN_OFFSET 18 + +/* + * Definitions to help decode TOLM (top of low memory) and TOHM (top of high + * memory). The way this is done varies based on generation. These regions are + * currently always 64-MByte aligned + * + * On Sandy Bridge and Ivy Bridge the low four bits of TOLM are bits 31:28. TOHM + * is a single register. Bits 20:0 map to bits 45:25. Both registers represent + * the upper limit (as in one higher than the max DRAM value). 
+ * + * On Haswell through Skylake, TOLM is represented as a 32-bit quantity. No + * shifting is required. However, only bits 31:26 are present. TOHM is spread + * out among two registers. The lower 32-bits is masked in a similar fashion. In + * both cases, these registers represent an inclusive range where we don't care + * about other bits. To deal with this we'll increment the lowest bit we care + * about to make it an exclusive range. + * + * Based on the above, we have opted to make both ranges in the IMC driver + * normalized to an _exclusive_ value. + * + * Ivy Bridge has the values in both the CBo SAD and a VT-d section; however, we + * use the CBo SAD which is why it looks like Sandy Bridge and not Haswell. + */ + +#define IMC_TOLM_SNB_IVY_MASK 0xf +#define IMC_TOLM_SNB_IVY_SHIFT 28 +#define IMC_TOHM_SNB_IVY_MASK 0x1fffff +#define IMC_TOHM_SNB_IVY_SHIFT 25 + +#define IMC_TOLM_HAS_SKX_MASK 0xfc000000 +#define IMC_TOLM_HAS_SKY_EXCL (1 << 26) +#define IMC_TOHM_LOW_HAS_SKX_MASK 0xfc000000 +#define IMC_TOHM_HAS_SKY_EXCL (1 << 26) + +/* + * Definitions to decode SAD values. These are sometimes subtlety different + * across generations. + */ +#define IMC_SAD_DRAM_RULE_ENABLE(x) BITX(x, 0, 0) + +#define IMC_SAD_DRAM_INTERLEAVE_SNB_BRD(x) BITX(x, 1, 1) +#define IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6XOR 0 +#define IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6 1 + +#define IMC_SAD_DRAM_INTERLEAVE_SKX(x) BITX(x, 2, 1) +#define IMC_SAD_DRAM_INTERLEAVE_SKX_8t6 0 +#define IMC_SAD_DRAM_INTERLEAVE_SKX_10t8 1 +#define IMC_SAD_DRAM_INTERLEAVE_SKX_14t12 2 +#define IMC_SAD_DRAM_INTERLEAVE_SKX_32t30 3 + +#define IMC_SAD_DRAM_ATTR_SNB_BRD(x) BITX(x, 3, 2) +#define IMC_SAD_DRAM_ATTR_SKX(x) BITX(x, 4, 3) +#define IMC_SAD_DRAM_ATTR_DRAM 0 +#define IMC_SAD_DRAM_ATTR_MMCFG 1 +#define IMC_SAD_DRAM_ATTR_NXM 2 + +#define IMC_SAD_DRAM_MOD23_SKX(x) BITX(x, 6, 5) +#define IMC_SAD_DRAM_MOD23_MOD3 0 +#define IMC_SAD_DRAM_MOD23_MOD2_C01 1 +#define IMC_SAD_DRAM_MOD23_MOD2_C12 2 +#define IMC_SAD_DRAM_MOD23_MOD2_C02 3 + +#define IMC_SAD_DRAM_LIMIT_SNB_BRD(x) BITX(x, 25, 6) +#define IMC_SAD_DRAM_LIMIT_SKX(x) BITX(x, 26, 7) +#define IMC_SAD_DRAM_LIMIT_SHIFT 26 +#define IMC_SAD_DRAM_LIMIT_EXCLUSIVE (1 << IMC_SAD_DRAM_LIMIT_SHIFT) + +#define IMC_SAD_DRAM_A7_IVB_BRD(x) BITX(x, 26, 26) +#define IMC_SAD_DRAM_MOD3_SKX(x) BITX(x, 27, 27) +#define IMC_SAD_DRAM_MOD3_MODE_SKX(x) BITX(x, 31, 30) +#define IMC_SAD_DRAM_MOD3_MODE_45t6 0 +#define IMC_SAD_DRAM_MOD3_MODE_45t8 1 +#define IMC_SAD_DRAM_MOD3_MODE_45t12 2 + +#define IMC_SAD_ILEAVE_SNB_MASK 0x7 +#define IMC_SAD_ILEAVE_SNB_LEN 3 +#define IMC_SAD_ILEAVE_IVB_SKX_MASK 0xf +#define IMC_SAD_ILEAVE_IVB_SKX_LEN 4 + +/* + * The interleave targets on Skylake use the upper bit to indicate whether it is + * referring to a local memory controller or if it actually refers to another + * node that is far away. The maximum value includes the upper bit which is used + * to indicate whether it is remote or far. + */ +#define IMC_SAD_ILEAVE_SKX_LOCAL(x) BITX(x, 3, 3) +#define IMC_SAD_ILEAVE_SKX_TARGET(x) BITX(x, 2, 0) +#define IMC_SAD_ILEAVE_SKX_MAX 0xf + +/* + * Maximum number of TAD tables that we need to consider. On Sandy Bridge + * through Broadwell this is based on the number of home agents that are present + * in the system. On Sandy Bridge there is one, on others, there are up to two. + * On Skylake, there is one TAD per IMC. + */ +#define IMC_MAX_TAD 2 + +/* + * Maximum number of TAD rules on any of the supported processors. 
+ */ +#define IMC_MAX_TAD_RULES 12 + +/* + * Maximum number of interleave targets. Note, this only applies to Sandy Bridge + * through Broadwell. Skylake gets this information in another form. + */ +#define IMC_MAX_TAD_TARGETS 4 + +/* + * Offset between the base TAD rule and the corresponding wayness rule on + * Skylake. + */ +#define IMC_SKX_WAYNESS_OFFSET 0x30 + +/* + * Various macros to decode the TAD rules. + */ +#define IMC_TAD_LIMIT(x) BITX(x, 31, 12) +#define IMC_TAD_LIMIT_SHIFT 26 +#define IMC_TAD_LIMIT_EXCLUSIVE (1 << IMC_TAD_LIMIT_SHIFT) + +#define IMC_TAD_SOCK_WAY(x) BITX(x, 11, 10) +#define IMC_TAD_SOCK_WAY_1 0 +#define IMC_TAD_SOCK_WAY_2 1 +#define IMC_TAD_SOCK_WAY_4 2 +#define IMC_TAD_SOCK_WAY_8 3 +#define IMC_TAD_CHAN_WAY(x) BITX(x, 9, 8) +#define IMC_TAD_TARG3(x) BITX(x, 7, 6) +#define IMC_TAD_TARG2(x) BITX(x, 5, 4) +#define IMC_TAD_TARG1(x) BITX(x, 3, 2) +#define IMC_TAD_TARG0(x) BITX(x, 1, 0) + +#define IMC_TAD_SNB_BRD_NTARGETS 4 + +/* + * These are registers specific to the Skylake and newer TAD BASE registers. + */ +#define IMC_TAD_BASE_BASE(x) BITX(x, 31, 12) +#define IMC_TAD_BASE_SHIFT 26 + +#define IMC_TAD_BASE_CHAN_GRAN(x) BITX(x, 7, 6) +#define IMC_TAD_BASE_CHAN_GRAN_64B 0 +#define IMC_TAD_BASE_CHAN_GRAN_256B 1 +#define IMC_TAD_BASE_CHAN_GRAN_4KB 2 + +#define IMC_TAD_BASE_SOCK_GRAN(x) BITX(x, 5, 4) +#define IMC_TAD_BASE_SOCK_GRAN_64B 0 +#define IMC_TAD_BASE_SOCK_GRAN_256B 1 +#define IMC_TAD_BASE_SOCK_GRAN_4KB 2 +#define IMC_TAD_BASE_SOCK_GRAN_1GB 3 + +#define IMC_TADCHAN_OFFSET_SNB_BRD(x) BITX(x, 25, 6) +#define IMC_TADCHAN_OFFSET_SKX(x) BITX(x, 23, 4) +#define IMC_TADCHAN_OFFSET_SHIFT 26 + +/* + * Macros to get at various TAD features. + */ +#define IMC_TAD_SYSDEF_LOCKSTEP(x) BITX(x, 7, 7) +#define IMC_TAD_SYSDEF2_SHIFTUP(x) BITX(x, 22, 22) +#define IMC_TAD_SYSDEF2_CHANHASH(x) BITX(x, 21, 21) + +/* + * Maximum number of different wayness entries that exist across the various IMC + * generations. Each wayness then has a maximum number of target entries. + */ +#define IMC_MAX_RANK_WAYS 5 +#define IMC_MAX_RANK_INTERLEAVES 8 + +/* + * Macros to take apart the rank interleave wayness and offset registers. + */ +#define IMC_RIR_WAYNESS_ENABLED(x) BITX(x, 31, 31) +#define IMC_RIR_WAYNESS_WAY(x) BITX(x, 29, 28) +#define IMC_RIR_LIMIT_HAS_SKX(x) BITX(x, 11, 1) +#define IMC_RIR_LIMIT_SNB_IVB(x) BITX(x, 10, 1) +#define IMC_RIR_LIMIT_SHIFT 29 +#define IMC_RIR_LIMIT_EXCLUSIVE (1 << IMC_RIR_LIMIT_SHIFT) + +/* + * Currently, everything other than Broadwell has the same value for the target + * offset. + */ +#define IMC_RIR_OFFSET_TARGET_BRD(x) BITX(x, 23, 20) +#define IMC_RIR_OFFSET_TARGET(x) BITX(x, 19, 16) +#define IMC_RIR_OFFSET_OFFSET_HAS_SKX(x) BITX(x, 15, 2) +#define IMC_RIR_OFFSET_OFFSET_SNB_IVB(x) BITX(x, 14, 2) +#define IMC_RIR_OFFSET_SHIFT 29 + +/* + * Definitions to cover manipulations of open and closed pages. + */ +#define IMC_PAGE_BITS_CLOSED 6 +#define IMC_PAGE_BITS_OPEN 13 + +/* + * Macros to decode and understand the CPUBUSNO registers in the UBOX_DECS. + */ +#define IMC_UBOX_CPUBUSNO_0(x) BITX(x, 7, 0) +#define IMC_UBOX_CPUBUSNO_1(x) BITX(x, 15, 8) +#define IMC_UBOX_CPUBUSNO_2(x) BITX(x, 23, 16) + +/* + * Hardware generations supported by the IMC driver. + */ +typedef enum { + IMC_GEN_UNKNOWN = 0, + IMC_GEN_SANDY, + IMC_GEN_IVY, + IMC_GEN_HASWELL, + IMC_GEN_BROADWELL, + /* + * IMC_GEN_SKYLAKE also covers Cascade Lake. The two are similar to the + * point of even having the same PCI IDs for all of the devices. 
The + * only difference in the cpuid signature between them is the stepping, + * hence we do not have a separate Cascade Lake target here, as it's + * really the same as Skylake. + */ + IMC_GEN_SKYLAKE +} imc_gen_t; + +/* + * Generation specific limits. + */ +typedef struct imc_gen_data { + uint_t igd_max_sockets; + uint_t igd_max_imcs; + uint_t igd_max_channels; + uint_t igd_max_dimms; + uint_t igd_max_ranks; + uint_t igd_mtr_offsets[IMC_MAX_DIMMPERCHAN]; + uint_t igd_mcmtr_offset; + uint_t igd_topo_offset; + uint_t igd_num_mcroutes; + uint_t igd_tolm_offset; + uint_t igd_tohm_low_offset; + uint_t igd_tohm_hi_offset; + uint_t igd_sad_dram_offset; + uint_t igd_sad_ndram_rules; + uint_t igd_sad_nodeid_offset; + uint_t igd_tad_nrules; + uint_t igd_tad_rule_offset; + uint_t igd_tad_chan_offset; + uint_t igd_tad_sysdef; + uint_t igd_tad_sysdef2; + uint_t igd_mc_mirror; + uint_t igd_rir_nways; + uint_t igd_rir_way_offset; + uint_t igd_rir_nileaves; + uint_t igd_rir_ileave_offset; + uint_t igd_ubox_cpubusno_offset; +} imc_gen_data_t; + +/* + * Different types of PCI devices that show up on the core that we may need to + * attach to. + */ +typedef enum { + IMC_TYPE_UNKNOWN = 0, + IMC_TYPE_MC0_M2M, /* SKX Only */ + IMC_TYPE_MC1_M2M, /* SKX Only */ + IMC_TYPE_MC0_MAIN0, + IMC_TYPE_MC0_MAIN1, + IMC_TYPE_MC1_MAIN0, + IMC_TYPE_MC1_MAIN1, + IMC_TYPE_MC0_CHANNEL0, + IMC_TYPE_MC0_CHANNEL1, + IMC_TYPE_MC0_CHANNEL2, + IMC_TYPE_MC0_CHANNEL3, + IMC_TYPE_MC1_CHANNEL0, + IMC_TYPE_MC1_CHANNEL1, + IMC_TYPE_MC1_CHANNEL2, + IMC_TYPE_MC1_CHANNEL3, + IMC_TYPE_SAD_DRAM, + IMC_TYPE_SAD_MMIO, + /* + * We want to note which device has the TOLM and TOHM registers. + * Unfortunately this is a rather complicated affair. On Sandy Bridge + * they are a part of the IMC_TYPE_SAD_MMIO. On Ivy Bridge, it's on its + * own dedicated device on the CBo. + * + * On Haswell onward, these move to the VT-D misc. registers. On Haswell + * and Broadwell, only one of these exist in the system. However, on + * Skylake these exist per socket. + */ + IMC_TYPE_SAD_MISC, + IMC_TYPE_VTD_MISC, + /* + * On SKX this exists on a per-core basis. It contains the memory + * controller routing table. + */ + IMC_TYPE_SAD_MCROUTE, + IMC_TYPE_UBOX, + IMC_TYPE_UBOX_CPUBUSNO, + IMC_TYPE_HA0, + IMC_TYPE_HA1, +} imc_type_t; + +/* + * Each entry in the stub table represents a device that we might attach to in a + * given generation. This is only defined in the kernel to make it easier to + * build the imc decoder in userland for testing. 
+ */ +#ifdef _KERNEL +typedef struct imc_stub_table { + imc_gen_t imcs_gen; + imc_type_t imcs_type; + uint16_t imcs_devid; + uint16_t imcs_pcidev; + uint16_t imcs_pcifunc; + const char *imcs_desc; +} imc_stub_table_t; + +typedef struct imc_stub { + avl_node_t istub_link; + dev_info_t *istub_dip; + uint16_t istub_vid; + uint16_t istub_did; + uint16_t istub_bus; + uint16_t istub_dev; + uint16_t istub_func; + ddi_acc_handle_t istub_cfgspace; + const imc_stub_table_t *istub_table; +} imc_stub_t; +#else +typedef struct imc_stub { + void *istub_unused; +} imc_stub_t; +#endif /* _KERNEL */ + +typedef enum { + IMC_F_UNSUP_PLATFORM = (1 << 0), + IMC_F_SCAN_DISPATCHED = (1 << 1), + IMC_F_SCAN_COMPLETE = (1 << 2), + IMC_F_ATTACH_DISPATCHED = (1 << 3), + IMC_F_ATTACH_COMPLETE = (1 << 4), + IMC_F_MCREG_FAILED = (1 << 5) +} imc_flags_t; + +#define IMC_F_ALL_FLAGS (IMC_F_UNSUP_PLATFORM | IMC_F_SCAN_DISPATCHED | \ + IMC_F_SCAN_COMPLETE | IMC_F_ATTACH_DISPATCHED | IMC_F_ATTACH_COMPLETE | \ + IMC_F_MCREG_FAILED) + +typedef enum imc_dimm_type { + IMC_DIMM_UNKNOWN, + IMC_DIMM_DDR3, + IMC_DIMM_DDR4, + IMC_DIMM_NVDIMM +} imc_dimm_type_t; + +typedef enum imc_dimm_valid { + IMC_DIMM_V_VALID = 0, + IMC_DIMM_V_BAD_PCI_READ = (1 << 0), + IMC_DIMM_V_BAD_ROWS = (1 << 1), + IMC_DIMM_V_BAD_COLUMNS = (1 << 2), + IMC_DIMM_V_BAD_DENSITY = (1 << 3), + IMC_DIMM_V_BAD_WIDTH = (1 << 4), + IMC_DIMM_V_BAD_RANKS = (1 << 5) +} imc_dimm_valid_t; + +typedef struct imc_dimm { + imc_dimm_valid_t idimm_valid; + boolean_t idimm_present; + uint8_t idimm_3dsranks; + boolean_t idimm_hdrl_parity; + boolean_t idimm_hdrl; + boolean_t idimm_ranks_disabled[IMC_MAX_RANK_DISABLE]; + uint8_t idimm_nbanks; + uint8_t idimm_nranks; + uint8_t idimm_width; + uint8_t idimm_density; /* In GiB */ + uint8_t idimm_nrows; + uint8_t idimm_ncolumns; + /* Synthesized */ + uint64_t idimm_size; + /* Raw data */ + uint32_t idimm_mtr; +} imc_dimm_t; + +typedef struct imc_rank_ileave_entry { + uint8_t irle_target; + uint64_t irle_offset; +} imc_rank_ileave_entry_t; + +typedef struct imc_rank_ileave { + boolean_t irle_enabled; + uint32_t irle_raw; + uint8_t irle_nways; + uint8_t irle_nwaysbits; + uint64_t irle_limit; + uint_t irle_nentries; + imc_rank_ileave_entry_t irle_entries[IMC_MAX_RANK_INTERLEAVES]; +} imc_rank_ileave_t; + +typedef enum imc_channel_valid { + IMC_CHANNEL_V_VALID = 0, + IMC_CHANNEL_V_BAD_PCI_READ = 1 << 0, +} imc_channel_valid_t; + +typedef struct imc_channel { + imc_channel_valid_t ich_valid; + imc_stub_t *ich_desc; + uint_t ich_ndimms; + imc_dimm_t ich_dimms[IMC_MAX_DIMMPERCHAN]; + uint_t ich_ntad_offsets; + uint32_t ich_tad_offsets_raw[IMC_MAX_TAD_RULES]; + uint64_t ich_tad_offsets[IMC_MAX_TAD_RULES]; + uint_t ich_nrankileaves; + imc_rank_ileave_t ich_rankileaves[IMC_MAX_RANK_WAYS]; +} imc_channel_t; + +typedef struct imc_controller { + imc_stub_t *icn_main0; + imc_stub_t *icn_main1; + imc_stub_t *icn_m2m; + boolean_t icn_invalid; + imc_dimm_type_t icn_dimm_type; + boolean_t icn_ecc; + boolean_t icn_lockstep; + boolean_t icn_closed; + uint32_t icn_topo; + uint_t icn_nchannels; + imc_channel_t icn_channels[IMC_MAX_CHANPERMC]; +} imc_mc_t; + +typedef enum imc_sad_rule_type { + IMC_SAD_TYPE_DRAM, + IMC_SAD_TYPE_MMCFG, + IMC_SAD_TYPE_NXM +} imc_sad_rule_type_t; + +typedef enum imc_sad_rule_imode { + IMC_SAD_IMODE_8t6, + IMC_SAD_IMODE_8t6XOR, + IMC_SAD_IMODE_10t8, + IMC_SAD_IMODE_14t12, + IMC_SAD_IMODE_32t30 +} imc_sad_rule_imode_t; + +typedef enum imc_sad_rule_mod_mode { + IMC_SAD_MOD_MODE_NONE, + IMC_SAD_MOD_MODE_45t6, + IMC_SAD_MOD_MODE_45t8, + 
IMC_SAD_MOD_MODE_45t12 +} imc_sad_rule_mod_mode_t; + +typedef enum imc_sad_rule_mod_type { + IMC_SAD_MOD_TYPE_NONE, + IMC_SAD_MOD_TYPE_MOD3, + IMC_SAD_MOD_TYPE_MOD2_01, + IMC_SAD_MOD_TYPE_MOD2_12, + IMC_SAD_MOD_TYPE_MOD2_02 +} imc_sad_rule_mod_type_t; + +typedef struct imc_sad_mcroute_entry { + uint8_t ismce_imc; /* ID of the target IMC */ + uint8_t ismce_pchannel; /* ID of the target physical channel */ +} imc_sad_mcroute_entry_t; + +typedef struct imc_sad_mcroute_table { + uint32_t ismc_raw_mcroute; + uint_t ismc_nroutes; + imc_sad_mcroute_entry_t ismc_mcroutes[IMC_MAX_SAD_MCROUTES]; +} imc_sad_mcroute_table_t; + +/* + * This rule represents a single SAD entry. + */ +typedef struct imc_sad_rule { + uint32_t isr_raw_dram; + uint32_t isr_raw_interleave; + boolean_t isr_enable; + boolean_t isr_a7mode; + boolean_t isr_need_mod3; + uint64_t isr_limit; + imc_sad_rule_type_t isr_type; + imc_sad_rule_imode_t isr_imode; + imc_sad_rule_mod_mode_t isr_mod_mode; + imc_sad_rule_mod_type_t isr_mod_type; + uint_t isr_ntargets; + uint8_t isr_targets[IMC_MAX_SAD_INTERLEAVE]; +} imc_sad_rule_t; + +typedef enum imc_sad_flags { + IMC_SAD_MCROUTE_VALID = 1 << 0, +} imc_sad_flags_t; + +typedef enum imc_sad_valid { + IMC_SAD_V_VALID = 0, + IMC_SAD_V_BAD_PCI_READ = 1 << 0, + IMC_SAD_V_BAD_MCROUTE = 1 << 1, + IMC_SAD_V_BAD_DRAM_ATTR = 1 << 2, + IMC_SAD_V_BAD_MOD3 = 1 << 3, +} imc_sad_valid_t; + +typedef struct imc_sad { + imc_sad_flags_t isad_flags; + imc_sad_valid_t isad_valid; + imc_stub_t *isad_dram; + imc_stub_t *isad_mmio; + imc_stub_t *isad_tolh; + uint64_t isad_tolm; + uint64_t isad_tohm; + uint_t isad_nrules; + imc_sad_rule_t isad_rules[IMC_MAX_SAD_RULES]; + imc_sad_mcroute_table_t isad_mcroute; +} imc_sad_t; + +typedef enum imc_tad_gran { + IMC_TAD_GRAN_64B = 0, + IMC_TAD_GRAN_256B, + IMC_TAD_GRAN_4KB, + IMC_TAD_GRAN_1GB +} imc_tad_gran_t; + +typedef struct imc_tad_rule { + uint64_t itr_base; + uint64_t itr_limit; + uint32_t itr_raw; + uint32_t itr_raw_gran; + uint8_t itr_sock_way; + uint8_t itr_chan_way; + imc_tad_gran_t itr_sock_gran; + imc_tad_gran_t itr_chan_gran; + uint_t itr_ntargets; + uint8_t itr_targets[IMC_MAX_TAD_TARGETS]; +} imc_tad_rule_t; + +typedef enum imc_tad_valid { + IMC_TAD_V_VALID = 1 << 0, + IMC_TAD_V_BAD_PCI_READ = 1 << 1, + IMC_TAD_V_BAD_CHAN_GRAN = 1 << 2 +} imc_tad_valid_t; + +typedef enum imc_tad_flags { + IMC_TAD_FLAG_CHANSHIFT = 1 << 0, + IMC_TAD_FLAG_CHANHASH = 1 << 1, + IMC_TAD_FLAG_MIRROR = 1 << 2, + IMC_TAD_FLAG_LOCKSTEP = 1 << 3 +} imc_tad_flags_t; + +typedef struct imc_tad { + imc_tad_valid_t itad_valid; + imc_stub_t *itad_stub; + imc_tad_flags_t itad_flags; + uint_t itad_nrules; + imc_tad_rule_t itad_rules[IMC_MAX_TAD_RULES]; +} imc_tad_t; + +typedef enum imc_socket_valid { + IMC_SOCKET_V_VALID = 0, + IMC_SOCKET_V_BAD_NODEID = 1 << 0 +} imc_socket_valid_t; + +typedef struct imc_socket { + imc_socket_valid_t isock_valid; + uint_t isock_bus[IMC_MAX_PCIBUSES]; + uint_t isock_nbus; + uint_t isock_gen; + nvlist_t *isock_nvl; + char *isock_buf; + size_t isock_buflen; + imc_sad_t isock_sad; + uint_t isock_ntad; + imc_tad_t isock_tad[IMC_MAX_TAD]; + imc_stub_t *isock_ubox; + imc_stub_t *isock_cpubusno; + uint32_t isock_nodeid; + uint_t isock_nimc; + imc_mc_t isock_imcs[IMC_MAX_IMCPERSOCK]; +} imc_socket_t; + +typedef struct imc { + /* + * The initial members here are only used in the kernel. This is done to + * make it easier for us to be able to define a version of this to use + * in testing. 
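+ *
+ * As a rough illustration only (a hypothetical non-kernel/test consumer,
+ * not part of this change), that testing path might decode an address
+ * along these lines, using the decoder interfaces declared later in this
+ * header:
+ *
+ *    imc_t imc;
+ *    imc_decode_state_t dec;
+ *
+ *    bzero(&imc, sizeof (imc));
+ *    if (!imc_restore_decoder(nvl, &imc) ||
+ *        !imc_decode_pa(&imc, pa, &dec))
+ *            return (B_FALSE);
+ *
+ * where 'nvl' (a decoder dump nvlist) and 'pa' (the physical address of
+ * interest) are placeholders for this sketch, and dec.ids_fail would note
+ * the reason for a failed decode.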
+ */
+#ifdef _KERNEL
+ dev_info_t *imc_dip;
+ kmutex_t imc_lock;
+ imc_flags_t imc_flags;
+ const imc_gen_data_t *imc_gen_data;
+ ddi_taskq_t *imc_taskq;
+ uint_t imc_nscanned;
+ avl_tree_t imc_stubs;
+ nvlist_t *imc_decoder_dump;
+ char *imc_decoder_buf;
+ size_t imc_decoder_len;
+#endif /* _KERNEL */
+ imc_gen_t imc_gen;
+
+ /*
+ * Data about the memory in the system
+ */
+ uint_t imc_nsockets;
+ imc_socket_t imc_sockets[IMC_MAX_SOCKETS];
+
+#ifdef _KERNEL
+ /*
+ * The imc_sockets[] array is organized based on increasing PCI Bus ID.
+ * This array maps the socket id that user land thinks of back to the
+ * actual underlying socket in case hardware does not put them in order.
+ */
+ imc_socket_t *imc_spointers[IMC_MAX_SOCKETS];
+
+ /*
+ * Store the IIO global VT-D misc. device. While there are sometimes
+ * multiple on the system, we only keep a single one around.
+ */
+ imc_stub_t *imc_gvtd_misc;
+#endif
+} imc_t;
+
+
+/*
+ * Decoder failure reasons
+ */
+typedef enum imc_decode_failure {
+ IMC_DECODE_F_NONE = 0,
+ /*
+ * Indicates that the memory address fell into a reserved legacy range.
+ * The legacy range index is stored in the failure data.
+ */
+ IMC_DECODE_F_LEGACY_RANGE,
+ /*
+ * Indicates that we had bad socket data. The socket in question is
+ * noted in the failure data.
+ */
+ IMC_DECODE_F_BAD_SOCKET,
+ /*
+ * Indicates that we had bad SAD data. The socket the SAD is associated
+ * with is noted in the failure data.
+ */
+ IMC_DECODE_F_BAD_SAD,
+ /*
+ * Indicates that the address was not contained in conventional, low,
+ * or high memory.
+ */
+ IMC_DECODE_F_OUTSIDE_DRAM,
+ /*
+ * Indicates that no valid SAD rule was found for the address.
+ */
+ IMC_DECODE_F_NO_SAD_RULE,
+ /*
+ * Indicates that the SAD interleave target was beyond the valid index.
+ */
+ IMC_DECODE_F_BAD_SAD_INTERLEAVE,
+ /*
+ * Indicates that the route suggested a remote processor we can't find.
+ */
+ IMC_DECODE_F_BAD_REMOTE_MC_ROUTE,
+ /*
+ * Indicates that we ended up in a loop trying to find the right socket
+ * to use.
+ */
+ IMC_DECODE_F_SAD_SEARCH_LOOP,
+ /*
+ * Indicates that we encountered a SAD rule that asked for inconsistent
+ * mod rules.
+ */
+ IMC_DECODE_F_SAD_BAD_MOD,
+ /*
+ * Indicates that the socket or tad rule we found doesn't actually point
+ * to something that we know about.
+ */
+ IMC_DECODE_F_SAD_BAD_SOCKET,
+ IMC_DECODE_F_SAD_BAD_TAD,
+ /*
+ * Indicates that we could not find a matching tad rule.
+ */
+ IMC_DECODE_F_NO_TAD_RULE,
+ /*
+ * Indicates that we encountered the TAD channel 3-way interleave that
+ * we don't support.
+ */
+ IMC_DECODE_F_TAD_3_ILEAVE,
+ /*
+ * Indicates that we had a bad target index.
+ */
+ IMC_DECODE_F_TAD_BAD_TARGET_INDEX,
+ /*
+ * Indicates that we have a bad channel ID.
+ */
+ IMC_DECODE_F_BAD_CHANNEL_ID,
+ /*
+ * Indicates that the TAD rule offset in the channel interleave was
+ * incorrect.
+ */
+ IMC_DECODE_F_BAD_CHANNEL_TAD_OFFSET,
+ /*
+ * We couldn't find a valid rank interleave rule.
+ */
+ IMC_DECODE_F_NO_RIR_RULE,
+ /*
+ * Indicates that the index of the rank interleaving target was bad.
+ */
+ IMC_DECODE_F_BAD_RIR_ILEAVE_TARGET,
+ /*
+ * Indicates that the calculated DIMM represents an invalid DIMM that is
+ * beyond the number of supported DIMMS per channel on the platform.
+ */
+ IMC_DECODE_F_BAD_DIMM_INDEX,
+ /*
+ * Indicates that the specified DIMM is not present; however, it is a
+ * valid DIMM number.
+ */
+ IMC_DECODE_F_DIMM_NOT_PRESENT,
+ /*
+ * Indicates that the specified rank on the DIMM is more than the number
+ * of ranks that the DIMM has.
+ */
+ IMC_DECODE_F_BAD_DIMM_RANK,
+ /*
+ * Indicates that the channel offset is larger than the system address,
+ * meaning that we would end up with an underflow if we continued. The
+ * equivalent is true for the rank address.
+ */
+ IMC_DECODE_F_CHANOFF_UNDERFLOW,
+ IMC_DECODE_F_RANKOFF_UNDERFLOW,
+} imc_decode_failure_t;
+
+/*
+ * Decoder state tracking
+ */
+typedef struct imc_decode_state {
+ imc_decode_failure_t ids_fail;
+ uint64_t ids_fail_data;
+ uint64_t ids_pa;
+ uint64_t ids_chanaddr;
+ uint64_t ids_rankaddr;
+ uint32_t ids_nodeid;
+ uint32_t ids_tadid;
+ uint32_t ids_channelid;
+ uint32_t ids_physrankid;
+ uint32_t ids_dimmid;
+ uint32_t ids_rankid;
+ const imc_socket_t *ids_socket;
+ const imc_sad_t *ids_sad;
+ const imc_sad_rule_t *ids_sad_rule;
+ const imc_tad_t *ids_tad;
+ const imc_tad_rule_t *ids_tad_rule;
+ const imc_mc_t *ids_mc;
+ const imc_channel_t *ids_chan;
+ const imc_rank_ileave_t *ids_rir;
+ const imc_dimm_t *ids_dimm;
+} imc_decode_state_t;
+
+#ifdef _KERNEL
+
+/*
+ * Functions needed for the stub drivers.
+ */
+extern int imc_attach_stub(dev_info_t *, ddi_attach_cmd_t);
+extern int imc_detach_stub(dev_info_t *, ddi_detach_cmd_t);
+
+/*
+ * Decoder related functions
+ */
+extern void imc_decoder_init(imc_t *);
+
+extern nvlist_t *imc_dump_decoder(imc_t *);
+#else /* !_KERNEL */
+extern boolean_t imc_restore_decoder(nvlist_t *, imc_t *);
+#endif /* _KERNEL */
+
+extern boolean_t imc_decode_pa(const imc_t *, uint64_t, imc_decode_state_t *);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _INTEL_IMC_H */
diff --git a/usr/src/uts/i86pc/io/imc/imcstub.c b/usr/src/uts/i86pc/io/imc/imcstub.c
new file mode 100644
index 0000000000..ee020dd5c4
--- /dev/null
+++ b/usr/src/uts/i86pc/io/imc/imcstub.c
@@ -0,0 +1,81 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+/*
+ * This is a stub driver that is used by the main imc driver to attach
+ * component PCI devices so that it can access their dev_info_t.
+ */ + +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/modctl.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> + +#include "imc.h" + + +static int +imcstub_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + return (imc_attach_stub(dip, cmd)); +} + +static int +imcstub_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + return (imc_detach_stub(dip, cmd)); +} + +static struct dev_ops imcstub_dev_ops = { + .devo_rev = DEVO_REV, + .devo_refcnt = 0, + .devo_getinfo = nodev, + .devo_identify = nodev, + .devo_probe = nulldev, + .devo_attach = imcstub_attach, + .devo_detach = imcstub_detach, + .devo_reset = nodev, + .devo_quiesce = ddi_quiesce_not_needed +}; + +static struct modldrv imcstub_modldrv = { + .drv_modops = &mod_driverops, + .drv_linkinfo = "IMC Stub driver", + .drv_dev_ops = &imcstub_dev_ops +}; + +static struct modlinkage imcstub_modlinkage = { + .ml_rev = MODREV_1, + .ml_linkage = { &imcstub_modldrv, NULL } +}; + +int +_init(void) +{ + return (mod_install(&imcstub_modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&imcstub_modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&imcstub_modlinkage)); +} diff --git a/usr/src/uts/i86pc/io/mp_platform_common.c b/usr/src/uts/i86pc/io/mp_platform_common.c index 813bd1c42a..efb4c81092 100644 --- a/usr/src/uts/i86pc/io/mp_platform_common.c +++ b/usr/src/uts/i86pc/io/mp_platform_common.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. + * Copyright 2017 Joyent, Inc. * Copyright (c) 2017 by Delphix. All rights reserved. * Copyright (c) 2019, Joyent, Inc. */ diff --git a/usr/src/uts/i86pc/io/pci/pci_common.h b/usr/src/uts/i86pc/io/pci/pci_common.h index 63fe4bb165..d5fa3bfd55 100644 --- a/usr/src/uts/i86pc/io/pci/pci_common.h +++ b/usr/src/uts/i86pc/io/pci/pci_common.h @@ -22,6 +22,8 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2019 Joyent, Inc. */ #ifndef _PCI_PCI_COMMON_H @@ -33,7 +35,7 @@ extern "C" { /* * Common header file with definitions shared between - * pci(7d) and npe(7d) + * pci(7D) and npe(7D) */ /* State structure. */ @@ -45,12 +47,18 @@ typedef struct pci_state { kmutex_t pci_mutex; kmutex_t pci_peek_poke_mutex; kmutex_t pci_err_mutex; + + /* + * The following members are only used by npe(7D). + * See uts/i86pc/io/pciex/npe.c for more information. + */ + ndi_event_hdl_t pci_ndi_event_hdl; } pci_state_t; /* * These are the access routines. - * The pci_bus_map sets the handle to point to these in pci(7d). - * The npe_bus_map sets the handle to point to these in npe(7d). + * The pci_bus_map sets the handle to point to these in pci(7D). + * The npe_bus_map sets the handle to point to these in npe(7D). */ uint8_t pci_config_rd8(ddi_acc_impl_t *hdlp, uint8_t *addr); uint16_t pci_config_rd16(ddi_acc_impl_t *hdlp, uint16_t *addr); diff --git a/usr/src/uts/i86pc/io/pciex/npe.c b/usr/src/uts/i86pc/io/pciex/npe.c index 4ef393ddb0..fcb68164ee 100644 --- a/usr/src/uts/i86pc/io/pciex/npe.c +++ b/usr/src/uts/i86pc/io/pciex/npe.c @@ -26,11 +26,35 @@ /* * Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ /* - * Host to PCI-Express local bus driver + * npe (Nexus PCIe driver): Host to PCI-Express local bus driver + * + * npe serves as the driver for PCIe Root Complexes and as the nexus driver + * for PCIe devices. See also: npe(7D). 
For more information about hotplug,
+ * see the big theory statement at uts/common/os/ddi_hp_impl.c.
+ *
+ *
+ * NDI EVENT HANDLING SUPPORT
+ *
+ * npe supports NDI event handling. The only available event is surprise
+ * removal of a device. Child drivers can register surprise removal event
+ * callbacks by requesting an event cookie using ddi_get_eventcookie for
+ * the DDI_DEVI_REMOVE_EVENT and adding their callback using
+ * ddi_add_event_handler. For an example, see the nvme driver in
+ * uts/common/io/nvme/nvme.c.
+ *
+ * The NDI events in npe are retrieved using NDI_EVENT_NOPASS, which
+ * prevents them from being propagated up the tree once they reach the npe's
+ * bus_get_eventcookie operations. This is important because npe maintains
+ * the state of PCIe devices and their receptacles, via the PCIe hotplug
+ * controller driver (pciehpc).
+ *
+ * Hot removal events are ultimately posted by the PCIe hotplug controller
+ * interrupt handler for hotplug events. Events are posted using the
+ * ndi_post_event interface.
*/
#include <sys/conf.h>
@@ -72,6 +96,15 @@ static int npe_intr_ops(dev_info_t *, dev_info_t *, ddi_intr_op_t,
ddi_intr_handle_impl_t *, void *);
static int npe_fm_init(dev_info_t *, dev_info_t *, int,
ddi_iblock_cookie_t *);
+static int npe_bus_get_eventcookie(dev_info_t *, dev_info_t *, char *,
+ ddi_eventcookie_t *);
+static int npe_bus_add_eventcall(dev_info_t *, dev_info_t *,
+ ddi_eventcookie_t, void (*)(dev_info_t *,
+ ddi_eventcookie_t, void *, void *),
+ void *, ddi_callback_id_t *);
+static int npe_bus_remove_eventcall(dev_info_t *, ddi_callback_id_t);
+static int npe_bus_post_event(dev_info_t *, dev_info_t *,
+ ddi_eventcookie_t, void *);
static int npe_fm_callback(dev_info_t *, ddi_fm_error_t *, const void *);
@@ -102,10 +135,10 @@ struct bus_ops npe_bus_ops = {
ddi_dma_mctl,
npe_ctlops,
ddi_bus_prop_op,
- 0, /* (*bus_get_eventcookie)(); */
- 0, /* (*bus_add_eventcall)(); */
- 0, /* (*bus_remove_eventcall)(); */
- 0, /* (*bus_post_event)(); */
+ npe_bus_get_eventcookie,
+ npe_bus_add_eventcall,
+ npe_bus_remove_eventcall,
+ npe_bus_post_event,
0, /* (*bus_intr_ctl)(); */
0, /* (*bus_config)(); */
0, /* (*bus_unconfig)(); */
@@ -271,12 +304,27 @@ npe_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
return (ret);
}
+/*
+ * See big theory statement at the top of this file for more information about
+ * surprise removal events.
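+ *
+ * As an illustration (hypothetical child-driver code, not part of this
+ * change), a child of npe could arrange to receive the removal event
+ * roughly as follows:
+ *
+ *    ddi_eventcookie_t cookie;
+ *    ddi_callback_id_t cb_id;
+ *
+ *    if (ddi_get_eventcookie(dip, DDI_DEVI_REMOVE_EVENT, &cookie) ==
+ *        DDI_SUCCESS) {
+ *            (void) ddi_add_event_handler(dip, cookie, my_remove_cb,
+ *                my_arg, &cb_id);
+ *    }
+ *
+ * where my_remove_cb and my_arg are the child's own callback routine and
+ * argument, named here only for the sketch.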
+ */
+#define NPE_EVENT_TAG_HOT_REMOVAL 0
+static ndi_event_definition_t npe_ndi_event_defs[1] = {
+ {NPE_EVENT_TAG_HOT_REMOVAL, DDI_DEVI_REMOVE_EVENT, EPL_KERNEL,
+ NDI_EVENT_POST_TO_ALL}
+};
+
+static ndi_event_set_t npe_ndi_events = {
+ NDI_EVENTS_REV1, ARRAY_SIZE(npe_ndi_event_defs), npe_ndi_event_defs
+};
+
/*ARGSUSED*/
static int
npe_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
int instance = ddi_get_instance(devi);
pci_state_t *pcip = NULL;
+ int ret;
if (cmd == DDI_RESUME) {
/*
@@ -316,6 +364,22 @@ npe_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
if (pcie_init(devi, NULL) != DDI_SUCCESS)
goto fail1;
+ ret = ndi_event_alloc_hdl(pcip->pci_dip, NULL, &pcip->pci_ndi_event_hdl,
+ NDI_SLEEP);
+ if (ret == NDI_SUCCESS) {
+ ret = ndi_event_bind_set(pcip->pci_ndi_event_hdl,
+ &npe_ndi_events, NDI_SLEEP);
+ if (ret != NDI_SUCCESS) {
+ dev_err(pcip->pci_dip, CE_WARN, "npe: failed to bind "
+ "NDI event set (error=%d)", ret);
+ goto fail1;
+ }
+ } else {
+ dev_err(pcip->pci_dip, CE_WARN, "npe: failed to allocate "
+ "event handle (error=%d)", ret);
+ goto fail1;
+ }
+
/* Second arg: initialize for pci_express root nexus */
if (pcitool_init(devi, B_TRUE) != DDI_SUCCESS)
goto fail2;
@@ -352,11 +416,36 @@ npe_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
{
int instance = ddi_get_instance(devi);
pci_state_t *pcip;
+ int ret;
pcip = ddi_get_soft_state(npe_statep, ddi_get_instance(devi));
switch (cmd) {
case DDI_DETACH:
+
+ /*
+ * Clean up event handling first, to ensure there are no
+ * outstanding callbacks registered.
+ */
+ ret = ndi_event_unbind_set(pcip->pci_ndi_event_hdl,
+ &npe_ndi_events, NDI_SLEEP);
+ if (ret == NDI_SUCCESS) {
+ /* ndi_event_free_hdl always succeeds. */
+ (void) ndi_event_free_hdl(pcip->pci_ndi_event_hdl);
+ } else {
+ /*
+ * The event set will only fail to unbind if there are
+ * outstanding callbacks registered for it, which
+ * probably means a child driver still has one
+ * registered and thus was not cleaned up properly
+ * before npe's detach routine was called. Consequently,
+ * we should fail the detach here.
+ */
+ dev_err(pcip->pci_dip, CE_WARN, "npe: failed to "
+ "unbind NDI event set (error=%d)", ret);
+ return (DDI_FAILURE);
+ }
+
pcie_fab_fini_bus(devi, PCIE_BUS_INITIAL);
/* Uninitialize pcitool support.
*/ @@ -373,6 +462,7 @@ npe_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) ddi_fm_fini(devi); ddi_soft_state_free(npe_statep, instance); + return (DDI_SUCCESS); case DDI_SUSPEND: @@ -414,7 +504,7 @@ static int npe_bus_map(dev_info_t *dip, dev_info_t *rdip, ddi_map_req_t *mp, off_t offset, off_t len, caddr_t *vaddrp) { - int rnumber; + int rnumber; int space; ddi_acc_impl_t *ap; ddi_acc_hdl_t *hp; @@ -1111,6 +1201,49 @@ npe_fm_init(dev_info_t *dip, dev_info_t *tdip, int cap, return (pcip->pci_fmcap); } +static int +npe_bus_get_eventcookie(dev_info_t *dip, dev_info_t *rdip, char *eventname, + ddi_eventcookie_t *cookiep) +{ + pci_state_t *pcip = ddi_get_soft_state(npe_statep, + ddi_get_instance(dip)); + + return (ndi_event_retrieve_cookie(pcip->pci_ndi_event_hdl, rdip, + eventname, cookiep, NDI_EVENT_NOPASS)); +} + +static int +npe_bus_add_eventcall(dev_info_t *dip, dev_info_t *rdip, + ddi_eventcookie_t cookie, void (*callback)(dev_info_t *dip, + ddi_eventcookie_t cookie, void *arg, void *bus_impldata), + void *arg, ddi_callback_id_t *cb_id) +{ + pci_state_t *pcip = ddi_get_soft_state(npe_statep, + ddi_get_instance(dip)); + + return (ndi_event_add_callback(pcip->pci_ndi_event_hdl, rdip, cookie, + callback, arg, NDI_SLEEP, cb_id)); +} + +static int +npe_bus_remove_eventcall(dev_info_t *dip, ddi_callback_id_t cb_id) +{ + pci_state_t *pcip = ddi_get_soft_state(npe_statep, + ddi_get_instance(dip)); + return (ndi_event_remove_callback(pcip->pci_ndi_event_hdl, cb_id)); +} + +static int +npe_bus_post_event(dev_info_t *dip, dev_info_t *rdip, + ddi_eventcookie_t cookie, void *impl_data) +{ + pci_state_t *pcip = ddi_get_soft_state(npe_statep, + ddi_get_instance(dip)); + return (ndi_event_do_callback(pcip->pci_ndi_event_hdl, rdip, cookie, + impl_data)); + +} + /*ARGSUSED*/ static int npe_fm_callback(dev_info_t *dip, ddi_fm_error_t *derr, const void *no_used) diff --git a/usr/src/uts/i86pc/io/psm/psm_common.c b/usr/src/uts/i86pc/io/psm/psm_common.c index b59d87bdcc..623c6e5617 100644 --- a/usr/src/uts/i86pc/io/psm/psm_common.c +++ b/usr/src/uts/i86pc/io/psm/psm_common.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016, Joyent, Inc. */ #include <sys/types.h> diff --git a/usr/src/uts/i86pc/io/viona/viona.conf b/usr/src/uts/i86pc/io/viona/viona.conf new file mode 100644 index 0000000000..e66488531a --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona.conf @@ -0,0 +1,14 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# Copyright 2013 Pluribus Networks Inc. +# + +name="viona" parent="pseudo"; diff --git a/usr/src/uts/i86pc/io/viona/viona.mapfile b/usr/src/uts/i86pc/io/viona/viona.mapfile new file mode 100644 index 0000000000..cece86348c --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona.mapfile @@ -0,0 +1,41 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. 
A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION ILLUMOSprivate { + global: + # DDI Interfaces + _fini; + _init; + _info; + + local: + *; +}; diff --git a/usr/src/uts/i86pc/io/viona/viona_hook.c b/usr/src/uts/i86pc/io/viona/viona_hook.c new file mode 100644 index 0000000000..4520be04b0 --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_hook.c @@ -0,0 +1,438 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/hook.h> +#include <sys/hook_event.h> + +#include "viona_impl.h" + + +/* + * Global linked list of viona_neti_ts. Access is protected by viona_neti_lock + */ +static list_t viona_neti_list; +static kmutex_t viona_neti_lock; + +/* + * viona_neti is allocated and initialized during attach, and read-only + * until detach (where it's also freed) + */ +static net_instance_t *viona_neti; + + +/* + * Generate a hook event for the packet in *mpp headed in the direction + * indicated by 'out'. If the packet is accepted, 0 is returned. If the + * packet is rejected, an error is returned. The hook function may or may not + * alter or even free *mpp. The caller is expected to deal with either + * situation. + */ +int +viona_hook(viona_link_t *link, viona_vring_t *ring, mblk_t **mpp, boolean_t out) +{ + viona_neti_t *nip = link->l_neti; + viona_nethook_t *vnh = &nip->vni_nethook; + hook_pkt_event_t info; + hook_event_t he; + hook_event_token_t het; + int ret; + + he = out ? vnh->vnh_event_out : vnh->vnh_event_in; + het = out ? vnh->vnh_token_out : vnh->vnh_token_in; + + if (!he.he_interested) + return (0); + + info.hpe_protocol = vnh->vnh_neti; + info.hpe_ifp = (phy_if_t)link; + info.hpe_ofp = (phy_if_t)link; + info.hpe_mp = mpp; + info.hpe_flags = 0; + + ret = hook_run(vnh->vnh_neti->netd_hooks, het, (hook_data_t)&info); + if (ret == 0) + return (0); + + if (out) { + VIONA_PROBE3(tx_hook_drop, viona_vring_t *, ring, + mblk_t *, *mpp, int, ret); + VIONA_RING_STAT_INCR(ring, tx_hookdrop); + } else { + VIONA_PROBE3(rx_hook_drop, viona_vring_t *, ring, + mblk_t *, *mpp, int, ret); + VIONA_RING_STAT_INCR(ring, rx_hookdrop); + } + return (ret); +} + +/* + * netinfo stubs - required by the nethook framework, but otherwise unused + * + * Currently, all ipf rules are applied against all interfaces in a given + * netstack (e.g. all interfaces in a zone). In the future if we want to + * support being able to apply different rules to different interfaces, I + * believe we would need to implement some of these stubs to map an interface + * name in a rule (e.g. 
'net0', back to an index or viona_link_t); + */ +static int +viona_neti_getifname(net_handle_t neti __unused, phy_if_t phy __unused, + char *buf __unused, const size_t len __unused) +{ + return (-1); +} + +static int +viona_neti_getmtu(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused) +{ + return (-1); +} + +static int +viona_neti_getptmue(net_handle_t neti __unused) +{ + return (-1); +} + +static int +viona_neti_getlifaddr(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused, size_t nelem __unused, + net_ifaddr_t type[] __unused, void *storage __unused) +{ + return (-1); +} + +static int +viona_neti_getlifzone(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused, zoneid_t *zid __unused) +{ + return (-1); +} + +static int +viona_neti_getlifflags(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused, uint64_t *flags __unused) +{ + return (-1); +} + +static phy_if_t +viona_neti_phygetnext(net_handle_t neti __unused, phy_if_t phy __unused) +{ + return ((phy_if_t)-1); +} + +static phy_if_t +viona_neti_phylookup(net_handle_t neti __unused, const char *name __unused) +{ + return ((phy_if_t)-1); +} + +static lif_if_t +viona_neti_lifgetnext(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused) +{ + return (-1); +} + +static int +viona_neti_inject(net_handle_t neti __unused, inject_t style __unused, + net_inject_t *packet __unused) +{ + return (-1); +} + +static phy_if_t +viona_neti_route(net_handle_t neti __unused, struct sockaddr *address __unused, + struct sockaddr *next __unused) +{ + return ((phy_if_t)-1); +} + +static int +viona_neti_ispchksum(net_handle_t neti __unused, mblk_t *mp __unused) +{ + return (-1); +} + +static int +viona_neti_isvchksum(net_handle_t neti __unused, mblk_t *mp __unused) +{ + return (-1); +} + +static net_protocol_t viona_netinfo = { + NETINFO_VERSION, + NHF_VIONA, + viona_neti_getifname, + viona_neti_getmtu, + viona_neti_getptmue, + viona_neti_getlifaddr, + viona_neti_getlifzone, + viona_neti_getlifflags, + viona_neti_phygetnext, + viona_neti_phylookup, + viona_neti_lifgetnext, + viona_neti_inject, + viona_neti_route, + viona_neti_ispchksum, + viona_neti_isvchksum +}; + +/* + * Create/register our nethooks + */ +static int +viona_nethook_init(netid_t nid, viona_nethook_t *vnh, char *nh_name, + net_protocol_t *netip) +{ + int ret; + + if ((vnh->vnh_neti = net_protocol_register(nid, netip)) == NULL) { + cmn_err(CE_NOTE, "%s: net_protocol_register failed " + "(netid=%d name=%s)", __func__, nid, nh_name); + goto fail_init_proto; + } + + HOOK_FAMILY_INIT(&vnh->vnh_family, nh_name); + if ((ret = net_family_register(vnh->vnh_neti, &vnh->vnh_family)) != 0) { + cmn_err(CE_NOTE, "%s: net_family_register failed " + "(netid=%d name=%s err=%d)", __func__, + nid, nh_name, ret); + goto fail_init_family; + } + + HOOK_EVENT_INIT(&vnh->vnh_event_in, NH_PHYSICAL_IN); + if ((vnh->vnh_token_in = net_event_register(vnh->vnh_neti, + &vnh->vnh_event_in)) == NULL) { + cmn_err(CE_NOTE, "%s: net_event_register %s failed " + "(netid=%d name=%s)", __func__, NH_PHYSICAL_IN, nid, + nh_name); + goto fail_init_event_in; + } + + HOOK_EVENT_INIT(&vnh->vnh_event_out, NH_PHYSICAL_OUT); + if ((vnh->vnh_token_out = net_event_register(vnh->vnh_neti, + &vnh->vnh_event_out)) == NULL) { + cmn_err(CE_NOTE, "%s: net_event_register %s failed " + "(netid=%d name=%s)", __func__, NH_PHYSICAL_OUT, nid, + nh_name); + goto fail_init_event_out; + } + return (0); + + /* + * On failure, we undo 
all the steps that succeeded in the + * reverse order of initialization, starting at the last + * successful step (the labels denoting the failing step). + */ +fail_init_event_out: + VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_in)); + VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_in)); + vnh->vnh_token_in = NULL; + +fail_init_event_in: + VERIFY0(net_family_shutdown(vnh->vnh_neti, &vnh->vnh_family)); + VERIFY0(net_family_unregister(vnh->vnh_neti, &vnh->vnh_family)); + +fail_init_family: + VERIFY0(net_protocol_unregister(vnh->vnh_neti)); + vnh->vnh_neti = NULL; + +fail_init_proto: + return (1); +} + +/* + * Shutdown the nethooks for a protocol family. This triggers notification + * callbacks to anything that has registered interest to allow hook consumers + * to unhook prior to the removal of the hooks as well as makes them unavailable + * to any future consumers as the first step of removal. + */ +static void +viona_nethook_shutdown(viona_nethook_t *vnh) +{ + VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_out)); + VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_in)); + VERIFY0(net_family_shutdown(vnh->vnh_neti, &vnh->vnh_family)); +} + +/* + * Remove the nethooks for a protocol family. + */ +static void +viona_nethook_fini(viona_nethook_t *vnh) +{ + VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_out)); + VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_in)); + VERIFY0(net_family_unregister(vnh->vnh_neti, &vnh->vnh_family)); + VERIFY0(net_protocol_unregister(vnh->vnh_neti)); + vnh->vnh_neti = NULL; +} + +/* + * Callback invoked by the neti module. This creates/registers our hooks + * {IPv4,IPv6}{in,out} with the nethook framework so they are available to + * interested consumers (e.g. ipf). + * + * During attach, viona_neti_create is called once for every netstack + * present on the system at the time of attach. Thereafter, it is called + * during the creation of additional netstack instances (i.e. zone boot). As a + * result, the viona_neti_t that is created during this call always occurs + * prior to any viona instances that will use it to send hook events. + * + * It should never return NULL. If we cannot register our hooks, we do not + * set vnh_hooked of the respective protocol family, which will prevent the + * creation of any viona instances on this netstack (see viona_ioc_create). + * This can only occur if after a shutdown event (which means destruction is + * imminent) we are trying to create a new instance. + */ +static void * +viona_neti_create(const netid_t netid) +{ + viona_neti_t *nip; + + VERIFY(netid != -1); + + nip = kmem_zalloc(sizeof (*nip), KM_SLEEP); + nip->vni_netid = netid; + nip->vni_zid = net_getzoneidbynetid(netid); + mutex_init(&nip->vni_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&nip->vni_dev_list, sizeof (viona_soft_state_t), + offsetof(viona_soft_state_t, ss_node)); + + if (viona_nethook_init(netid, &nip->vni_nethook, Hn_VIONA, + &viona_netinfo) == 0) + nip->vni_nethook.vnh_hooked = B_TRUE; + + mutex_enter(&viona_neti_lock); + list_insert_tail(&viona_neti_list, nip); + mutex_exit(&viona_neti_lock); + + return (nip); +} + +/* + * Called during netstack teardown by the neti module. During teardown, all + * the shutdown callbacks are invoked, allowing consumers to release any holds + * and otherwise quiesce themselves prior to destruction, followed by the + * actual destruction callbacks. 
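+ *
+ * For reference, the per-netstack callback ordering, as registered in
+ * viona_neti_attach() below, is (a sketch of the existing lifecycle, not
+ * new behavior):
+ *
+ *    nin_create   -> viona_neti_create()    (netstack created)
+ *    nin_shutdown -> viona_neti_shutdown()  (teardown begins)
+ *    nin_destroy  -> viona_neti_destroy()   (teardown completes)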
+ */ +static void +viona_neti_shutdown(netid_t nid, void *arg) +{ + viona_neti_t *nip = arg; + + ASSERT(nip != NULL); + VERIFY(nid == nip->vni_netid); + + mutex_enter(&viona_neti_lock); + list_remove(&viona_neti_list, nip); + mutex_exit(&viona_neti_lock); + + if (nip->vni_nethook.vnh_hooked) + viona_nethook_shutdown(&nip->vni_nethook); +} + +/* + * Called during netstack teardown by the neti module. Destroys the viona + * netinst data. This is invoked after all the netstack and neti shutdown + * callbacks have been invoked. + */ +static void +viona_neti_destroy(netid_t nid, void *arg) +{ + viona_neti_t *nip = arg; + + ASSERT(nip != NULL); + VERIFY(nid == nip->vni_netid); + + mutex_enter(&nip->vni_lock); + while (nip->vni_ref != 0) + cv_wait(&nip->vni_ref_change, &nip->vni_lock); + mutex_exit(&nip->vni_lock); + + VERIFY(!list_link_active(&nip->vni_node)); + + if (nip->vni_nethook.vnh_hooked) + viona_nethook_fini(&nip->vni_nethook); + + mutex_destroy(&nip->vni_lock); + list_destroy(&nip->vni_dev_list); + kmem_free(nip, sizeof (*nip)); +} + +/* + * Find the viona netinst data by zone id. This is only used during + * viona instance creation (and thus is only called by a zone that is running). + */ +viona_neti_t * +viona_neti_lookup_by_zid(zoneid_t zid) +{ + viona_neti_t *nip; + + mutex_enter(&viona_neti_lock); + for (nip = list_head(&viona_neti_list); nip != NULL; + nip = list_next(&viona_neti_list, nip)) { + if (nip->vni_zid == zid) { + mutex_enter(&nip->vni_lock); + nip->vni_ref++; + mutex_exit(&nip->vni_lock); + mutex_exit(&viona_neti_lock); + return (nip); + } + } + mutex_exit(&viona_neti_lock); + return (NULL); +} + +void +viona_neti_rele(viona_neti_t *nip) +{ + mutex_enter(&nip->vni_lock); + VERIFY3S(nip->vni_ref, >, 0); + nip->vni_ref--; + mutex_exit(&nip->vni_lock); + cv_broadcast(&nip->vni_ref_change); +} + +void +viona_neti_attach(void) +{ + mutex_init(&viona_neti_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&viona_neti_list, sizeof (viona_neti_t), + offsetof(viona_neti_t, vni_node)); + + /* This can only fail if NETINFO_VERSION is wrong */ + viona_neti = net_instance_alloc(NETINFO_VERSION); + VERIFY(viona_neti != NULL); + + viona_neti->nin_name = "viona"; + viona_neti->nin_create = viona_neti_create; + viona_neti->nin_shutdown = viona_neti_shutdown; + viona_neti->nin_destroy = viona_neti_destroy; + /* This can only fail if we've registered ourselves multiple times */ + VERIFY3S(net_instance_register(viona_neti), ==, DDI_SUCCESS); +} + +void +viona_neti_detach(void) +{ + /* This can only fail if we've not registered previously */ + VERIFY3S(net_instance_unregister(viona_neti), ==, DDI_SUCCESS); + net_instance_free(viona_neti); + viona_neti = NULL; + + list_destroy(&viona_neti_list); + mutex_destroy(&viona_neti_lock); +} diff --git a/usr/src/uts/i86pc/io/viona/viona_impl.h b/usr/src/uts/i86pc/io/viona/viona_impl.h new file mode 100644 index 0000000000..5471b611a4 --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_impl.h @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _VIONA_IMPL_H +#define _VIONA_IMPL_H + +#include <sys/ddi.h> +#include <sys/list.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> +#include <sys/strsun.h> +#include <sys/sysmacros.h> +#include <sys/uio.h> + +#include <sys/mac_client.h> +#include <sys/mac_provider.h> +#include <sys/mac_client_priv.h> +#include <sys/neti.h> +#include <inet/ip.h> +#include <inet/tcp.h> + +#include <sys/vmm_drv.h> +#include <sys/viona_io.h> + +struct viona_link; +typedef struct viona_link viona_link_t; +struct viona_desb; +typedef struct viona_desb viona_desb_t; +struct viona_net; +typedef struct viona_neti viona_neti_t; + +enum viona_ring_state { + VRS_RESET = 0x0, /* just allocated or reset */ + VRS_SETUP = 0x1, /* addrs setup and starting worker thread */ + VRS_INIT = 0x2, /* worker thread started & waiting to run */ + VRS_RUN = 0x3, /* running work routine */ + VRS_STOP = 0x4, /* worker is exiting */ +}; +enum viona_ring_state_flags { + VRSF_REQ_START = 0x1, /* start running from INIT state */ + VRSF_REQ_STOP = 0x2, /* stop running, clean up, goto RESET state */ + VRSF_RENEW = 0x4, /* ring renewing lease */ +}; + +typedef struct viona_vring { + viona_link_t *vr_link; + + kmutex_t vr_lock; + kcondvar_t vr_cv; + uint16_t vr_state; + uint16_t vr_state_flags; + uint_t vr_xfer_outstanding; + kthread_t *vr_worker_thread; + vmm_lease_t *vr_lease; + + /* ring-sized resources for TX activity */ + viona_desb_t *vr_txdesb; + struct iovec *vr_txiov; + + uint_t vr_intr_enabled; + uint64_t vr_msi_addr; + uint64_t vr_msi_msg; + + /* Internal ring-related state */ + kmutex_t vr_a_mutex; /* sync consumers of 'avail' */ + kmutex_t vr_u_mutex; /* sync consumers of 'used' */ + uint64_t vr_pa; + uint16_t vr_size; + uint16_t vr_mask; /* cached from vr_size */ + uint16_t vr_cur_aidx; /* trails behind 'avail_idx' */ + + /* Host-context pointers to the queue */ + volatile struct virtio_desc *vr_descr; + + volatile uint16_t *vr_avail_flags; + volatile uint16_t *vr_avail_idx; + volatile uint16_t *vr_avail_ring; + volatile uint16_t 
*vr_avail_used_event; + + volatile uint16_t *vr_used_flags; + volatile uint16_t *vr_used_idx; + volatile struct virtio_used *vr_used_ring; + volatile uint16_t *vr_used_avail_event; + + /* Per-ring error condition statistics */ + struct viona_ring_stats { + uint64_t rs_ndesc_too_high; + uint64_t rs_bad_idx; + uint64_t rs_indir_bad_len; + uint64_t rs_indir_bad_nest; + uint64_t rs_indir_bad_next; + uint64_t rs_no_space; + uint64_t rs_too_many_desc; + uint64_t rs_desc_bad_len; + + uint64_t rs_bad_ring_addr; + + uint64_t rs_fail_hcksum; + uint64_t rs_fail_hcksum6; + uint64_t rs_fail_hcksum_proto; + + uint64_t rs_bad_rx_frame; + uint64_t rs_rx_merge_overrun; + uint64_t rs_rx_merge_underrun; + uint64_t rs_rx_pad_short; + uint64_t rs_rx_mcast_check; + uint64_t rs_too_short; + uint64_t rs_tx_absent; + + uint64_t rs_rx_hookdrop; + uint64_t rs_tx_hookdrop; + } vr_stats; +} viona_vring_t; + +struct viona_link { + vmm_hold_t *l_vm_hold; + boolean_t l_destroyed; + + viona_vring_t l_vrings[VIONA_VQ_MAX]; + + uint32_t l_features; + uint32_t l_features_hw; + uint32_t l_cap_csum; + + uintptr_t l_notify_ioport; + void *l_notify_cookie; + + datalink_id_t l_linkid; + mac_handle_t l_mh; + mac_client_handle_t l_mch; + mac_promisc_handle_t l_mph; + + pollhead_t l_pollhead; + + viona_neti_t *l_neti; +}; + +typedef struct viona_nethook { + net_handle_t vnh_neti; + hook_family_t vnh_family; + hook_event_t vnh_event_in; + hook_event_t vnh_event_out; + hook_event_token_t vnh_token_in; + hook_event_token_t vnh_token_out; + boolean_t vnh_hooked; +} viona_nethook_t; + +struct viona_neti { + list_node_t vni_node; + + netid_t vni_netid; + zoneid_t vni_zid; + + viona_nethook_t vni_nethook; + + kmutex_t vni_lock; /* Protects remaining members */ + kcondvar_t vni_ref_change; /* Protected by vni_lock */ + uint_t vni_ref; /* Protected by vni_lock */ + list_t vni_dev_list; /* Protected by vni_lock */ +}; + +typedef struct used_elem { + uint16_t id; + uint32_t len; +} used_elem_t; + +typedef struct viona_soft_state { + kmutex_t ss_lock; + viona_link_t *ss_link; + list_node_t ss_node; +} viona_soft_state_t; + +#pragma pack(1) +struct virtio_desc { + uint64_t vd_addr; + uint32_t vd_len; + uint16_t vd_flags; + uint16_t vd_next; +}; + +struct virtio_used { + uint32_t vu_idx; + uint32_t vu_tlen; +}; + +struct virtio_net_mrgrxhdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; + uint16_t vrh_bufs; +}; + +struct virtio_net_hdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; +}; +#pragma pack() + +#define VRING_NEED_BAIL(ring, proc) \ + (((ring)->vr_state_flags & VRSF_REQ_STOP) != 0 || \ + ((proc)->p_flag & SEXITING) != 0) + + +#define VNETHOOK_INTERESTED_IN(neti) \ + (neti)->vni_nethook.vnh_event_in.he_interested +#define VNETHOOK_INTERESTED_OUT(neti) \ + (neti)->vni_nethook.vnh_event_out.he_interested + + +#define VIONA_PROBE(name) DTRACE_PROBE(viona__##name) +#define VIONA_PROBE1(name, arg1, arg2) \ + DTRACE_PROBE1(viona__##name, arg1, arg2) +#define VIONA_PROBE2(name, arg1, arg2, arg3, arg4) \ + DTRACE_PROBE2(viona__##name, arg1, arg2, arg3, arg4) +#define VIONA_PROBE3(name, arg1, arg2, arg3, arg4, arg5, arg6) \ + DTRACE_PROBE3(viona__##name, arg1, arg2, arg3, arg4, arg5, arg6) +#define VIONA_PROBE5(name, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, \ + arg9, arg10) \ + DTRACE_PROBE5(viona__##name, arg1, arg2, arg3, arg4, arg5, 
arg6, arg7, \ + arg8, arg9, arg10) +#define VIONA_PROBE_BAD_RING_ADDR(r, a) \ + VIONA_PROBE2(bad_ring_addr, viona_vring_t *, r, void *, (void *)(a)) + +#define VIONA_RING_STAT_INCR(r, name) \ + (((r)->vr_stats.rs_ ## name)++) + + +#define VIONA_MAX_HDRS_LEN (sizeof (struct ether_vlan_header) + \ + IP_MAX_HDR_LENGTH + TCP_MAX_HDR_LENGTH) + +#define VRING_AVAIL_F_NO_INTERRUPT 1 +#define VRING_USED_F_NO_NOTIFY 1 + +#define VRING_DESC_F_NEXT (1 << 0) +#define VRING_DESC_F_WRITE (1 << 1) +#define VRING_DESC_F_INDIRECT (1 << 2) + +#define VIRTIO_NET_HDR_F_NEEDS_CSUM (1 << 0) +#define VIRTIO_NET_HDR_F_DATA_VALID (1 << 1) + +#define VIRTIO_NET_HDR_GSO_NONE 0 +#define VIRTIO_NET_HDR_GSO_TCPV4 1 + +#define VIRTIO_NET_F_CSUM (1 << 0) +#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) +#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ +#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can accept TSO */ +#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can accept TSO */ +#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX bufs */ +#define VIRTIO_NET_F_STATUS (1 << 16) /* cfg status field present */ +#define VIRTIO_F_RING_NOTIFY_ON_EMPTY (1 << 24) +#define VIRTIO_F_RING_INDIRECT_DESC (1 << 28) +#define VIRTIO_F_RING_EVENT_IDX (1 << 29) + + +void viona_ring_alloc(viona_link_t *, viona_vring_t *); +void viona_ring_free(viona_vring_t *); +int viona_ring_reset(viona_vring_t *, boolean_t); +int viona_ring_init(viona_link_t *, uint16_t, uint16_t, uint64_t); +boolean_t viona_ring_lease_renew(viona_vring_t *); +int vq_popchain(viona_vring_t *, struct iovec *, uint_t, uint16_t *); +void vq_pushchain(viona_vring_t *, uint32_t, uint16_t); +void vq_pushchain_many(viona_vring_t *, uint_t, used_elem_t *); +void viona_intr_ring(viona_vring_t *ring); + +void viona_rx_init(void); +void viona_rx_fini(void); +int viona_rx_set(viona_link_t *); +void viona_rx_clear(viona_link_t *); +void viona_worker_rx(viona_vring_t *, viona_link_t *); + +extern kmutex_t viona_force_copy_lock; +void viona_worker_tx(viona_vring_t *, viona_link_t *); +void viona_tx_ring_alloc(viona_vring_t *, const uint16_t); +void viona_tx_ring_free(viona_vring_t *, const uint16_t); + +void viona_neti_attach(void); +void viona_neti_detach(void); +viona_neti_t *viona_neti_lookup_by_zid(zoneid_t); +void viona_neti_rele(viona_neti_t *); +int viona_hook(viona_link_t *, viona_vring_t *, mblk_t **, boolean_t); + +#endif /* _VIONA_IMPL_H */ diff --git a/usr/src/uts/i86pc/io/viona/viona_main.c b/usr/src/uts/i86pc/io/viona/viona_main.c new file mode 100644 index 0000000000..f51a1f9b12 --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_main.c @@ -0,0 +1,991 @@ +/* + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+/*
+ * viona - VirtIO-Net, Accelerated
+ *
+ * The purpose of viona is to provide high performance virtio-net devices to
+ * bhyve guests. It does so by sitting directly atop MAC, skipping all of the
+ * DLS/DLD stack.
+ *
+ * --------------------
+ * General Architecture
+ * --------------------
+ *
+ * A single viona instance is comprised of a "link" handle and two "rings".
+ * After opening the viona device, it must be associated with a MAC network
+ * interface and a bhyve (vmm) instance to form its link resource. This is
+ * done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd are
+ * passed in to perform the initialization. With the MAC client opened, and a
+ * driver handle to the vmm instance established, the device is ready to be
+ * configured by the guest.
+ *
+ * The userspace portion of bhyve, which interfaces with the PCI device
+ * emulation framework, is meant to stay out of the datapath if at all
+ * possible. Configuration changes made via PCI are mapped to actions which
+ * will steer the operation of the in-kernel logic.
+ *
+ *
+ * -----------
+ * Ring Basics
+ * -----------
+ *
+ * Each viona link has two viona_vring_t entities, RX and TX, for handling data
+ * transfers to and from the guest. They represent an interface to the
+ * standard virtio ring structures. When initialized and active, each ring is
+ * backed by a kernel worker thread (parented to the bhyve process for the
+ * instance) which handles ring events. The RX worker has the simple task of
+ * watching for ring shutdown conditions. The TX worker does that in addition
+ * to processing all requests to transmit data. Data destined for the guest is
+ * delivered directly by MAC to viona_rx() when the ring is active.
+ *
+ *
+ * -----------
+ * Ring States
+ * -----------
+ *
+ * The viona_vring_t instances follow a simple path through the possible state
+ * values represented in virtio_vring_t`vr_state:
+ *
+ * +<--------------------------------------------+
+ * | |
+ * V ^
+ * +-----------+ This is the initial state when a link is created or
+ * | VRS_RESET | when the ring has been explicitly reset.
+ * +-----------+
+ * | ^
+ * |---* ioctl(VNA_IOC_RING_INIT) issued |
+ * | |
+ * | ^
+ * V
+ * +-----------+ The ring parameters (size, guest physical addresses)
+ * | VRS_SETUP | have been set and start-up of the ring worker thread
+ * +-----------+ has begun.
+ * | ^
+ * | |
+ * |---* ring worker thread begins execution |
+ * | |
+ * +-------------------------------------------->+
+ * | | ^
+ * | |
+ * | * If ring shutdown is requested (by ioctl or impending
+ * | bhyve process death) while the worker thread is
+ * | starting, the worker will transition the ring to
+ * | VRS_RESET and exit.
+ * | ^
+ * | |
+ * | ^
+ * V
+ * +-----------+ The worker thread associated with the ring has started
+ * | VRS_INIT | executing. It has allocated any extra resources needed
+ * +-----------+ for the ring to operate.
+ * | ^
+ * | |
+ * +-------------------------------------------->+
+ * | | ^
+ * | |
+ * | * If ring shutdown is requested while the worker is
+ * | waiting in VRS_INIT, it will free any extra resources
+ * | and transition to VRS_RESET.
+ * | ^
+ * | |
+ * |--* ioctl(VNA_IOC_RING_KICK) issued |
+ * | ^
+ * V
+ * +-----------+ The worker thread associated with the ring is executing
+ * | VRS_RUN | workload specific to that ring.
+ * +-----------+
+ * | ^
+ * |---* ioctl(VNA_IOC_RING_RESET) issued |
+ * | (or bhyve process begins exit) ^
+ * |
+ * +-----------+ The worker thread associated with the ring is in the
+ * | VRS_STOP | process of exiting. All outstanding TX and RX
+ * +-----------+ requests are allowed to complete, but new requests
+ * | must be ignored.
+ * | ^
+ * | |
+ * +-------------------------------------------->+
+ *
+ *
+ * While the worker thread is not running, changes to vr_state are only made by
+ * viona_ioc_ring_init() under vr_lock. There, it initializes the ring, starts
+ * the worker, and sets the ring state to VRS_SETUP. Once the worker thread
+ * has been started, only it may perform ring state transitions (still under
+ * the protection of vr_lock), when requested by outside consumers via
+ * vr_state_flags or when the containing bhyve process initiates an exit.
+ *
+ *
+ * ----------------------------
+ * Transmission mblk_t Handling
+ * ----------------------------
+ *
+ * For incoming frames destined for a bhyve guest, the data must first land in
+ * a host OS buffer from the physical NIC before it is copied into the awaiting
+ * guest buffer(s). Outbound frames transmitted by the guest are not bound by
+ * this limitation and can avoid extra copying before the buffers are accessed
+ * directly by the NIC. When a guest designates buffers to be transmitted,
+ * viona translates the guest-physical addresses contained in the ring
+ * descriptors to host-virtual addresses via vmm_drv_gpa2kva(). That pointer is
+ * wrapped in an mblk_t using a preallocated viona_desb_t for the desballoc().
+ * Doing so increments vr_xfer_outstanding, preventing the ring from being
+ * reset (allowing the link to drop its vmm handle to the guest) until all
+ * transmit mblks referencing guest memory have been processed. Allocation of
+ * the viona_desb_t entries is done during the VRS_INIT stage of the ring
+ * worker thread. The ring size informs that allocation as the number of
+ * concurrent transmissions is limited by the number of descriptors in the
+ * ring. This minimizes allocation in the transmit hot-path by acquiring those
+ * fixed-size resources during initialization.
+ *
+ * This optimization depends on the underlying NIC driver freeing the mblks in
+ * a timely manner after they have been transmitted by the hardware. Some
+ * drivers have been found to flush TX descriptors only when new transmissions
+ * are initiated.
This means that there is no upper bound to the time needed + * for an mblk to be flushed and can stall bhyve guests from shutting down + * since their memory must be free of viona TX references prior to clean-up. + * + * This expectation of deterministic mblk_t processing is likely the reason + * behind the notable exception to the zero-copy TX path: systems with 'bnxe' + * loaded will copy transmit data into fresh buffers rather than passing up + * zero-copy mblks. It is a hold-over from the original viona sources provided + * by Pluribus and its continued necessity has not been confirmed. + * + * + * ---------------------------- + * Ring Notification Fast-paths + * ---------------------------- + * + * Device operation for viona requires that notifications flow to and from the + * guest to indicate certain ring conditions. In order to minimize latency and + * processing overhead, the notification procedures are kept in-kernel whenever + * possible. + * + * Guest-to-host notifications, when new available descriptors have been placed + * in the ring, are posted via the 'queue notify' address in the virtio BAR. + * The vmm_drv_ioport_hook() interface was added to bhyve which allows viona to + * install a callback hook on an ioport address. Guest exits for accesses to + * viona-hooked ioport addresses will result in direct calls to notify the + * appropriate ring worker without a trip to userland. + * + * Host-to-guest notifications in the form of interrupts enjoy similar + * acceleration. Each viona ring can be configured to send MSI notifications + * to the guest as virtio conditions dictate. This in-kernel interrupt + * configuration is kept synchronized through viona ioctls which are utilized + * during writes to the associated PCI config registers or MSI-X BAR. + * + * Guests which do not utilize MSI-X will result in viona falling back to the + * slow path for interrupts. It will poll(2) the viona handle, receiving + * notification when ring events necessitate the assertion of an interrupt. + * + * + * --------------- + * Nethook Support + * --------------- + * + * Viona provides four nethook events that consumers (e.g. ipf) can hook into + * to intercept packets as they go up or down the stack. Unfortunately, + * the nethook framework does not understand raw packets, so we can only + * generate events (in, out) for IPv4 and IPv6 packets. At driver attach, + * we register callbacks with the neti (netinfo) module that will be invoked + * for each netstack already present, as well as for any additional netstack + * instances created as the system operates. These callbacks will + * register/unregister the hooks with the nethook framework for each + * netstack instance. This registration occurs prior to creating any + * viona instances for a given netstack, and the unregistration for a netstack + * instance occurs after all viona instances of the netstack instance have + * been deleted. + */ + +#include <sys/conf.h> +#include <sys/file.h> +#include <sys/stat.h> + +#include <sys/dlpi.h> + +#include "viona_impl.h" + + +#define VIONA_NAME "Virtio Network Accelerator" +#define VIONA_CTL_MINOR 0 +#define VIONA_CLI_NAME "viona" /* MAC client name */ + + +/* + * Host capabilities. 
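+ *
+ * These are the virtio-net feature bits that viona offers to the guest. As
+ * an illustration of how they are consumed (mirroring the ioctl logic
+ * further below, not new behavior): VNA_IOC_GET_FEATURES reports
+ * VIONA_S_HOSTCAPS together with any hardware-derived bits, and a
+ * guest-written feature word from VNA_IOC_SET_FEATURES is masked against
+ * that same set, with dependent bits cleared, e.g.:
+ *
+ *    val &= (VIONA_S_HOSTCAPS | link->l_features_hw);
+ *    if ((val & VIRTIO_NET_F_CSUM) == 0)
+ *            val &= ~VIRTIO_NET_F_HOST_TSO4;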
+ */ +#define VIONA_S_HOSTCAPS ( \ + VIRTIO_NET_F_GUEST_CSUM | \ + VIRTIO_NET_F_MAC | \ + VIRTIO_NET_F_GUEST_TSO4 | \ + VIRTIO_NET_F_MRG_RXBUF | \ + VIRTIO_NET_F_STATUS | \ + VIRTIO_F_RING_NOTIFY_ON_EMPTY | \ + VIRTIO_F_RING_INDIRECT_DESC) + +/* MAC_CAPAB_HCKSUM specifics of interest */ +#define VIONA_CAP_HCKSUM_INTEREST \ + (HCKSUM_INET_PARTIAL | \ + HCKSUM_INET_FULL_V4 | \ + HCKSUM_INET_FULL_V6) + +static void *viona_state; +static dev_info_t *viona_dip; +static id_space_t *viona_minors; + + +static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, + void **result); +static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); +static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); +static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp); +static int viona_close(dev_t dev, int flag, int otype, cred_t *credp); +static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode, + cred_t *credp, int *rval); +static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp); + +static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *); +static int viona_ioc_delete(viona_soft_state_t *, boolean_t); + +static int viona_ioc_set_notify_ioport(viona_link_t *, uint_t); +static int viona_ioc_ring_init(viona_link_t *, void *, int); +static int viona_ioc_ring_reset(viona_link_t *, uint_t); +static int viona_ioc_ring_kick(viona_link_t *, uint_t); +static int viona_ioc_ring_set_msi(viona_link_t *, void *, int); +static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t); +static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *); + +static struct cb_ops viona_cb_ops = { + viona_open, + viona_close, + nodev, + nodev, + nodev, + nodev, + nodev, + viona_ioctl, + nodev, + nodev, + nodev, + viona_chpoll, + ddi_prop_op, + 0, + D_MP | D_NEW | D_HOTPLUG, + CB_REV, + nodev, + nodev +}; + +static struct dev_ops viona_ops = { + DEVO_REV, + 0, + viona_info, + nulldev, + nulldev, + viona_attach, + viona_detach, + nodev, + &viona_cb_ops, + NULL, + ddi_power, + ddi_quiesce_not_needed +}; + +static struct modldrv modldrv = { + &mod_driverops, + VIONA_NAME, + &viona_ops, +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modldrv, NULL +}; + +int +_init(void) +{ + int ret; + + ret = ddi_soft_state_init(&viona_state, sizeof (viona_soft_state_t), 0); + if (ret != 0) { + return (ret); + } + + viona_minors = id_space_create("viona_minors", + VIONA_CTL_MINOR + 1, UINT16_MAX); + viona_rx_init(); + mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL); + + ret = mod_install(&modlinkage); + if (ret != 0) { + ddi_soft_state_fini(&viona_state); + id_space_destroy(viona_minors); + viona_rx_fini(); + mutex_destroy(&viona_force_copy_lock); + } + + return (ret); +} + +int +_fini(void) +{ + int ret; + + ret = mod_remove(&modlinkage); + if (ret != 0) { + return (ret); + } + + ddi_soft_state_fini(&viona_state); + id_space_destroy(viona_minors); + viona_rx_fini(); + mutex_destroy(&viona_force_copy_lock); + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* ARGSUSED */ +static int +viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) +{ + int error; + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)viona_dip; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + break; + } + return (error); +} + +static int 
+viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + if (cmd != DDI_ATTACH) { + return (DDI_FAILURE); + } + + if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR, + DDI_PSEUDO, 0) != DDI_SUCCESS) { + return (DDI_FAILURE); + } + + viona_neti_attach(); + + viona_dip = dip; + ddi_report_dev(viona_dip); + + return (DDI_SUCCESS); +} + +static int +viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + dev_info_t *old_dip = viona_dip; + + if (cmd != DDI_DETACH) { + return (DDI_FAILURE); + } + + VERIFY(old_dip != NULL); + + viona_neti_detach(); + viona_dip = NULL; + ddi_remove_minor_node(old_dip, NULL); + + return (DDI_SUCCESS); +} + +static int +viona_open(dev_t *devp, int flag, int otype, cred_t *credp) +{ + int minor; + viona_soft_state_t *ss; + + if (otype != OTYP_CHR) { + return (EINVAL); + } +#if 0 + /* + * XXX-mg: drv_priv() is wrong, but I'm not sure what is right. + * Should the check be at open() or ioctl()? + */ + if (drv_priv(credp) != 0) { + return (EPERM); + } +#endif + if (getminor(*devp) != VIONA_CTL_MINOR) { + return (ENXIO); + } + + minor = id_alloc_nosleep(viona_minors); + if (minor == -1) { + /* All minors are busy */ + return (EBUSY); + } + if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) { + id_free(viona_minors, minor); + return (ENOMEM); + } + + ss = ddi_get_soft_state(viona_state, minor); + mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL); + *devp = makedevice(getmajor(*devp), minor); + + return (0); +} + +static int +viona_close(dev_t dev, int flag, int otype, cred_t *credp) +{ + int minor; + viona_soft_state_t *ss; + + if (otype != OTYP_CHR) { + return (EINVAL); + } + + minor = getminor(dev); + + ss = ddi_get_soft_state(viona_state, minor); + if (ss == NULL) { + return (ENXIO); + } + + VERIFY0(viona_ioc_delete(ss, B_TRUE)); + VERIFY(!list_link_active(&ss->ss_node)); + ddi_soft_state_free(viona_state, minor); + id_free(viona_minors, minor); + + return (0); +} + +static int +viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv) +{ + viona_soft_state_t *ss; + void *dptr = (void *)data; + int err = 0, val; + viona_link_t *link; + + ss = ddi_get_soft_state(viona_state, getminor(dev)); + if (ss == NULL) { + return (ENXIO); + } + + switch (cmd) { + case VNA_IOC_CREATE: + return (viona_ioc_create(ss, dptr, md, cr)); + case VNA_IOC_DELETE: + return (viona_ioc_delete(ss, B_FALSE)); + default: + break; + } + + mutex_enter(&ss->ss_lock); + if ((link = ss->ss_link) == NULL || link->l_destroyed || + vmm_drv_release_reqd(link->l_vm_hold)) { + mutex_exit(&ss->ss_lock); + return (ENXIO); + } + + switch (cmd) { + case VNA_IOC_GET_FEATURES: + val = VIONA_S_HOSTCAPS | link->l_features_hw; + if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) { + err = EFAULT; + } + break; + case VNA_IOC_SET_FEATURES: + if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) { + err = EFAULT; + break; + } + val &= (VIONA_S_HOSTCAPS | link->l_features_hw); + + if ((val & VIRTIO_NET_F_CSUM) == 0) + val &= ~VIRTIO_NET_F_HOST_TSO4; + + if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0) + val &= ~VIRTIO_NET_F_GUEST_TSO4; + + link->l_features = val; + break; + case VNA_IOC_RING_INIT: + err = viona_ioc_ring_init(link, dptr, md); + break; + case VNA_IOC_RING_RESET: + err = viona_ioc_ring_reset(link, (uint_t)data); + break; + case VNA_IOC_RING_KICK: + err = viona_ioc_ring_kick(link, (uint_t)data); + break; + case VNA_IOC_RING_SET_MSI: + err = viona_ioc_ring_set_msi(link, dptr, md); + break; + case VNA_IOC_RING_INTR_CLR: + err = viona_ioc_ring_intr_clear(link, 
(uint_t)data); + break; + case VNA_IOC_INTR_POLL: + err = viona_ioc_intr_poll(link, dptr, md, rv); + break; + case VNA_IOC_SET_NOTIFY_IOP: + err = viona_ioc_set_notify_ioport(link, (uint_t)data); + break; + default: + err = ENOTTY; + break; + } + + mutex_exit(&ss->ss_lock); + return (err); +} + +static int +viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + viona_soft_state_t *ss; + viona_link_t *link; + + ss = ddi_get_soft_state(viona_state, getminor(dev)); + if (ss == NULL) { + return (ENXIO); + } + + mutex_enter(&ss->ss_lock); + if ((link = ss->ss_link) == NULL || link->l_destroyed) { + mutex_exit(&ss->ss_lock); + return (ENXIO); + } + + *reventsp = 0; + if ((events & POLLRDBAND) != 0) { + for (uint_t i = 0; i < VIONA_VQ_MAX; i++) { + if (link->l_vrings[i].vr_intr_enabled != 0) { + *reventsp |= POLLRDBAND; + break; + } + } + } + if ((*reventsp == 0 && !anyyet) || (events & POLLET)) { + *phpp = &link->l_pollhead; + } + mutex_exit(&ss->ss_lock); + + return (0); +} + +static void +viona_get_mac_capab(viona_link_t *link) +{ + mac_handle_t mh = link->l_mh; + uint32_t cap = 0; + mac_capab_lso_t lso_cap; + + link->l_features_hw = 0; + if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) { + /* + * Only report HW checksum ability if the underlying MAC + * resource is capable of populating the L4 header. + */ + if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) { + link->l_features_hw |= VIRTIO_NET_F_CSUM; + } + link->l_cap_csum = cap; + } + + if ((link->l_features_hw & VIRTIO_NET_F_CSUM) && + mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) { + /* + * Virtio doesn't allow for negotiating a maximum LSO + * packet size. We have to assume that the guest may + * send a maximum length IP packet. Make sure the + * underlying MAC can handle an LSO of this size. 
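+ * Concretely: VIRTIO_NET_F_HOST_TSO4 is only offered when the MAC reports
+ * basic TCP/IPv4 LSO support with a maximum of at least IP_MAXPACKET
+ * (65535) bytes, which is what the check below enforces.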
+ */ + if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) && + lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET) + link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4; + } +} + +static int +viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr) +{ + vioc_create_t kvc; + viona_link_t *link = NULL; + char cli_name[MAXNAMELEN]; + int err = 0; + file_t *fp; + vmm_hold_t *hold = NULL; + viona_neti_t *nip = NULL; + zoneid_t zid; + + ASSERT(MUTEX_NOT_HELD(&ss->ss_lock)); + + if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) { + return (EFAULT); + } + + zid = crgetzoneid(cr); + nip = viona_neti_lookup_by_zid(zid); + if (nip == NULL) { + return (EIO); + } + + if (!nip->vni_nethook.vnh_hooked) { + viona_neti_rele(nip); + return (EIO); + } + + mutex_enter(&ss->ss_lock); + if (ss->ss_link != NULL) { + mutex_exit(&ss->ss_lock); + viona_neti_rele(nip); + return (EEXIST); + } + + if ((fp = getf(kvc.c_vmfd)) == NULL) { + err = EBADF; + goto bail; + } + err = vmm_drv_hold(fp, cr, &hold); + releasef(kvc.c_vmfd); + if (err != 0) { + goto bail; + } + + link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP); + link->l_linkid = kvc.c_linkid; + link->l_vm_hold = hold; + + err = mac_open_by_linkid(link->l_linkid, &link->l_mh); + if (err != 0) { + goto bail; + } + + viona_get_mac_capab(link); + + (void) snprintf(cli_name, sizeof (cli_name), "%s-%d", VIONA_CLI_NAME, + link->l_linkid); + err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0); + if (err != 0) { + goto bail; + } + + viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_RX]); + viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_TX]); + + if ((err = viona_rx_set(link)) != 0) { + viona_ring_free(&link->l_vrings[VIONA_VQ_RX]); + viona_ring_free(&link->l_vrings[VIONA_VQ_TX]); + goto bail; + } + + link->l_neti = nip; + ss->ss_link = link; + mutex_exit(&ss->ss_lock); + + mutex_enter(&nip->vni_lock); + list_insert_tail(&nip->vni_dev_list, ss); + mutex_exit(&nip->vni_lock); + + return (0); + +bail: + if (link != NULL) { + if (link->l_mch != NULL) { + mac_client_close(link->l_mch, 0); + } + if (link->l_mh != NULL) { + mac_close(link->l_mh); + } + kmem_free(link, sizeof (viona_link_t)); + } + if (hold != NULL) { + vmm_drv_rele(hold); + } + viona_neti_rele(nip); + + mutex_exit(&ss->ss_lock); + return (err); +} + +static int +viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close) +{ + viona_link_t *link; + viona_neti_t *nip = NULL; + + mutex_enter(&ss->ss_lock); + if ((link = ss->ss_link) == NULL) { + /* Link destruction already complete */ + mutex_exit(&ss->ss_lock); + return (0); + } + + if (link->l_destroyed) { + /* + * Link destruction has been started by another thread, but has + * not completed. This condition should be impossible to + * encounter when performing the on-close destroy of the link, + * since racing ioctl accessors must necessarily be absent. + */ + VERIFY(!on_close); + mutex_exit(&ss->ss_lock); + return (EAGAIN); + } + /* + * The link deletion cannot fail after this point, continuing until its + * successful completion is reached. + */ + link->l_destroyed = B_TRUE; + + /* + * Tear down the IO port hook so it cannot be used to kick any of the + * rings which are about to be reset and stopped. + */ + VERIFY0(viona_ioc_set_notify_ioport(link, 0)); + mutex_exit(&ss->ss_lock); + + /* + * Return the rings to their reset state, ignoring any possible + * interruptions from signals. 
+ */ + VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_RX], B_FALSE)); + VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_TX], B_FALSE)); + + mutex_enter(&ss->ss_lock); + if (link->l_mch != NULL) { + /* Unhook the receive callbacks and close out the client */ + viona_rx_clear(link); + mac_client_close(link->l_mch, 0); + } + if (link->l_mh != NULL) { + mac_close(link->l_mh); + } + if (link->l_vm_hold != NULL) { + vmm_drv_rele(link->l_vm_hold); + link->l_vm_hold = NULL; + } + + nip = link->l_neti; + link->l_neti = NULL; + + viona_ring_free(&link->l_vrings[VIONA_VQ_RX]); + viona_ring_free(&link->l_vrings[VIONA_VQ_TX]); + pollhead_clean(&link->l_pollhead); + ss->ss_link = NULL; + mutex_exit(&ss->ss_lock); + + mutex_enter(&nip->vni_lock); + list_remove(&nip->vni_dev_list, ss); + mutex_exit(&nip->vni_lock); + + viona_neti_rele(nip); + + kmem_free(link, sizeof (viona_link_t)); + return (0); +} + +static int +viona_ioc_ring_init(viona_link_t *link, void *udata, int md) +{ + vioc_ring_init_t kri; + int err; + + if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) { + return (EFAULT); + } + + err = viona_ring_init(link, kri.ri_index, kri.ri_qsize, kri.ri_qaddr); + + return (err); +} + +static int +viona_ioc_ring_reset(viona_link_t *link, uint_t idx) +{ + viona_vring_t *ring; + + if (idx >= VIONA_VQ_MAX) { + return (EINVAL); + } + ring = &link->l_vrings[idx]; + + return (viona_ring_reset(ring, B_TRUE)); +} + +static int +viona_ioc_ring_kick(viona_link_t *link, uint_t idx) +{ + viona_vring_t *ring; + int err; + + if (idx >= VIONA_VQ_MAX) { + return (EINVAL); + } + ring = &link->l_vrings[idx]; + + mutex_enter(&ring->vr_lock); + switch (ring->vr_state) { + case VRS_SETUP: + /* + * An early kick to a ring which is starting its worker thread + * is fine. Once that thread is active, it will process the + * start-up request immediately. 
+ */ + /* FALLTHROUGH */ + case VRS_INIT: + ring->vr_state_flags |= VRSF_REQ_START; + /* FALLTHROUGH */ + case VRS_RUN: + cv_broadcast(&ring->vr_cv); + err = 0; + break; + default: + err = EBUSY; + break; + } + mutex_exit(&ring->vr_lock); + + return (err); +} + +static int +viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md) +{ + vioc_ring_msi_t vrm; + viona_vring_t *ring; + + if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) { + return (EFAULT); + } + if (vrm.rm_index >= VIONA_VQ_MAX) { + return (EINVAL); + } + + ring = &link->l_vrings[vrm.rm_index]; + mutex_enter(&ring->vr_lock); + ring->vr_msi_addr = vrm.rm_addr; + ring->vr_msi_msg = vrm.rm_msg; + mutex_exit(&ring->vr_lock); + + return (0); +} + +static int +viona_notify_wcb(void *arg, uintptr_t ioport, uint_t sz, uint64_t val) +{ + viona_link_t *link = (viona_link_t *)arg; + uint16_t vq = (uint16_t)val; + + if (ioport != link->l_notify_ioport || sz != sizeof (uint16_t)) { + return (EINVAL); + } + return (viona_ioc_ring_kick(link, vq)); +} + +static int +viona_ioc_set_notify_ioport(viona_link_t *link, uint_t ioport) +{ + int err = 0; + + if (link->l_notify_ioport != 0) { + vmm_drv_ioport_unhook(link->l_vm_hold, &link->l_notify_cookie); + link->l_notify_ioport = 0; + } + + if (ioport != 0) { + err = vmm_drv_ioport_hook(link->l_vm_hold, ioport, NULL, + viona_notify_wcb, (void *)link, &link->l_notify_cookie); + if (err == 0) { + link->l_notify_ioport = ioport; + } + } + return (err); +} + +static int +viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx) +{ + if (idx >= VIONA_VQ_MAX) { + return (EINVAL); + } + + link->l_vrings[idx].vr_intr_enabled = 0; + return (0); +} + +static int +viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv) +{ + uint_t cnt = 0; + vioc_intr_poll_t vip; + + for (uint_t i = 0; i < VIONA_VQ_MAX; i++) { + uint_t val = link->l_vrings[i].vr_intr_enabled; + + vip.vip_status[i] = val; + if (val != 0) { + cnt++; + } + } + + if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0) { + return (EFAULT); + } + *rv = (int)cnt; + return (0); +} diff --git a/usr/src/uts/i86pc/io/viona/viona_ring.c b/usr/src/uts/i86pc/io/viona/viona_ring.c new file mode 100644 index 0000000000..5ba6fad963 --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_ring.c @@ -0,0 +1,638 @@ +/* + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + + +#include <sys/disp.h> + +#include "viona_impl.h" + +#define VRING_ALIGN 4096 +#define VRING_MAX_LEN 32768 + +static boolean_t viona_ring_map(viona_vring_t *); +static void viona_ring_unmap(viona_vring_t *); +static kthread_t *viona_create_worker(viona_vring_t *); + +static void * +viona_gpa2kva(viona_vring_t *ring, uint64_t gpa, size_t len) +{ + ASSERT3P(ring->vr_lease, !=, NULL); + + return (vmm_drv_gpa2kva(ring->vr_lease, gpa, len)); +} + +static boolean_t +viona_ring_lease_expire_cb(void *arg) +{ + viona_vring_t *ring = arg; + + cv_broadcast(&ring->vr_cv); + + /* The lease will be broken asynchronously. */ + return (B_FALSE); +} + +static void +viona_ring_lease_drop(viona_vring_t *ring) +{ + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + if (ring->vr_lease != NULL) { + vmm_hold_t *hold = ring->vr_link->l_vm_hold; + + ASSERT(hold != NULL); + + /* + * Without an active lease, the ring mappings cannot be + * considered valid. + */ + viona_ring_unmap(ring); + + vmm_drv_lease_break(hold, ring->vr_lease); + ring->vr_lease = NULL; + } +} + +boolean_t +viona_ring_lease_renew(viona_vring_t *ring) +{ + vmm_hold_t *hold = ring->vr_link->l_vm_hold; + + ASSERT(hold != NULL); + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + viona_ring_lease_drop(ring); + + /* + * Lease renewal will fail if the VM has requested that all holds be + * cleaned up. + */ + ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb, + ring); + if (ring->vr_lease != NULL) { + /* A ring undergoing renewal will need valid guest mappings */ + if (ring->vr_pa != 0 && ring->vr_size != 0) { + /* + * If new mappings cannot be established, consider the + * lease renewal a failure. 
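+ * Callers treat a failed renewal as fatal for the ring: viona_ring_init()
+ * fails its ioctl with EBUSY, and the worker threads bail out and return
+ * the ring to its reset state.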
+ */ + if (!viona_ring_map(ring)) { + viona_ring_lease_drop(ring); + return (B_FALSE); + } + } + } + return (ring->vr_lease != NULL); +} + +void +viona_ring_alloc(viona_link_t *link, viona_vring_t *ring) +{ + ring->vr_link = link; + mutex_init(&ring->vr_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&ring->vr_cv, NULL, CV_DRIVER, NULL); + mutex_init(&ring->vr_a_mutex, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ring->vr_u_mutex, NULL, MUTEX_DRIVER, NULL); +} + +static void +viona_ring_misc_free(viona_vring_t *ring) +{ + const uint_t qsz = ring->vr_size; + + viona_tx_ring_free(ring, qsz); +} + +void +viona_ring_free(viona_vring_t *ring) +{ + mutex_destroy(&ring->vr_lock); + cv_destroy(&ring->vr_cv); + mutex_destroy(&ring->vr_a_mutex); + mutex_destroy(&ring->vr_u_mutex); + ring->vr_link = NULL; +} + +int +viona_ring_init(viona_link_t *link, uint16_t idx, uint16_t qsz, uint64_t pa) +{ + viona_vring_t *ring; + kthread_t *t; + int err = 0; + + if (idx >= VIONA_VQ_MAX) { + return (EINVAL); + } + if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) { + return (EINVAL); + } + + ring = &link->l_vrings[idx]; + mutex_enter(&ring->vr_lock); + if (ring->vr_state != VRS_RESET) { + mutex_exit(&ring->vr_lock); + return (EBUSY); + } + VERIFY(ring->vr_state_flags == 0); + + ring->vr_lease = NULL; + if (!viona_ring_lease_renew(ring)) { + err = EBUSY; + goto fail; + } + + ring->vr_size = qsz; + ring->vr_mask = (ring->vr_size - 1); + ring->vr_pa = pa; + if (!viona_ring_map(ring)) { + err = EINVAL; + goto fail; + } + + /* Initialize queue indexes */ + ring->vr_cur_aidx = 0; + + if (idx == VIONA_VQ_TX) { + viona_tx_ring_alloc(ring, qsz); + } + + /* Zero out MSI-X configuration */ + ring->vr_msi_addr = 0; + ring->vr_msi_msg = 0; + + /* Clear the stats */ + bzero(&ring->vr_stats, sizeof (ring->vr_stats)); + + t = viona_create_worker(ring); + if (t == NULL) { + err = ENOMEM; + goto fail; + } + ring->vr_worker_thread = t; + ring->vr_state = VRS_SETUP; + cv_broadcast(&ring->vr_cv); + mutex_exit(&ring->vr_lock); + return (0); + +fail: + viona_ring_lease_drop(ring); + viona_ring_misc_free(ring); + ring->vr_size = 0; + ring->vr_mask = 0; + mutex_exit(&ring->vr_lock); + return (err); +} + +int +viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals) +{ + mutex_enter(&ring->vr_lock); + if (ring->vr_state == VRS_RESET) { + mutex_exit(&ring->vr_lock); + return (0); + } + + if ((ring->vr_state_flags & VRSF_REQ_STOP) == 0) { + ring->vr_state_flags |= VRSF_REQ_STOP; + cv_broadcast(&ring->vr_cv); + } + while (ring->vr_state != VRS_RESET) { + if (!heed_signals) { + cv_wait(&ring->vr_cv, &ring->vr_lock); + } else { + int rs; + + rs = cv_wait_sig(&ring->vr_cv, &ring->vr_lock); + if (rs <= 0 && ring->vr_state != VRS_RESET) { + mutex_exit(&ring->vr_lock); + return (EINTR); + } + } + } + viona_ring_lease_drop(ring); + mutex_exit(&ring->vr_lock); + return (0); +} + +static boolean_t +viona_ring_map(viona_vring_t *ring) +{ + uint64_t pos = ring->vr_pa; + const uint16_t qsz = ring->vr_size; + + ASSERT3U(qsz, !=, 0); + ASSERT3U(pos, !=, 0); + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + const size_t desc_sz = qsz * sizeof (struct virtio_desc); + ring->vr_descr = viona_gpa2kva(ring, pos, desc_sz); + if (ring->vr_descr == NULL) { + goto fail; + } + pos += desc_sz; + + const size_t avail_sz = (qsz + 3) * sizeof (uint16_t); + ring->vr_avail_flags = viona_gpa2kva(ring, pos, avail_sz); + if (ring->vr_avail_flags == NULL) { + goto fail; + } + ring->vr_avail_idx = ring->vr_avail_flags + 1; + ring->vr_avail_ring = ring->vr_avail_flags + 
2; + ring->vr_avail_used_event = ring->vr_avail_ring + qsz; + pos += avail_sz; + + const size_t used_sz = (qsz * sizeof (struct virtio_used)) + + (sizeof (uint16_t) * 3); + pos = P2ROUNDUP(pos, VRING_ALIGN); + ring->vr_used_flags = viona_gpa2kva(ring, pos, used_sz); + if (ring->vr_used_flags == NULL) { + goto fail; + } + ring->vr_used_idx = ring->vr_used_flags + 1; + ring->vr_used_ring = (struct virtio_used *)(ring->vr_used_flags + 2); + ring->vr_used_avail_event = (uint16_t *)(ring->vr_used_ring + qsz); + + return (B_TRUE); + +fail: + viona_ring_unmap(ring); + return (B_FALSE); +} + +static void +viona_ring_unmap(viona_vring_t *ring) +{ + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + ring->vr_descr = NULL; + ring->vr_avail_flags = NULL; + ring->vr_avail_idx = NULL; + ring->vr_avail_ring = NULL; + ring->vr_avail_used_event = NULL; + ring->vr_used_flags = NULL; + ring->vr_used_idx = NULL; + ring->vr_used_ring = NULL; + ring->vr_used_avail_event = NULL; +} + +void +viona_intr_ring(viona_vring_t *ring) +{ + uint64_t addr; + + mutex_enter(&ring->vr_lock); + /* Deliver the interrupt directly, if so configured. */ + if ((addr = ring->vr_msi_addr) != 0) { + uint64_t msg = ring->vr_msi_msg; + + mutex_exit(&ring->vr_lock); + (void) vmm_drv_msi(ring->vr_lease, addr, msg); + return; + } + mutex_exit(&ring->vr_lock); + + if (atomic_cas_uint(&ring->vr_intr_enabled, 0, 1) == 0) { + pollwakeup(&ring->vr_link->l_pollhead, POLLRDBAND); + } +} + +static void +viona_worker(void *arg) +{ + viona_vring_t *ring = (viona_vring_t *)arg; + viona_link_t *link = ring->vr_link; + proc_t *p = ttoproc(curthread); + + mutex_enter(&ring->vr_lock); + VERIFY3U(ring->vr_state, ==, VRS_SETUP); + + /* Bail immediately if ring shutdown or process exit was requested */ + if (VRING_NEED_BAIL(ring, p)) { + goto cleanup; + } + + /* Report worker thread as alive and notify creator */ + ring->vr_state = VRS_INIT; + cv_broadcast(&ring->vr_cv); + + while (ring->vr_state_flags == 0) { + /* + * Keeping lease renewals timely while waiting for the ring to + * be started is important for avoiding deadlocks. + */ + if (vmm_drv_lease_expired(ring->vr_lease)) { + if (!viona_ring_lease_renew(ring)) { + goto cleanup; + } + } + + (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock); + + if (VRING_NEED_BAIL(ring, p)) { + goto cleanup; + } + } + + ASSERT((ring->vr_state_flags & VRSF_REQ_START) != 0); + ring->vr_state = VRS_RUN; + ring->vr_state_flags &= ~VRSF_REQ_START; + + /* Ensure ring lease is valid first */ + if (vmm_drv_lease_expired(ring->vr_lease)) { + if (!viona_ring_lease_renew(ring)) { + goto cleanup; + } + } + + /* Process actual work */ + if (ring == &link->l_vrings[VIONA_VQ_RX]) { + viona_worker_rx(ring, link); + } else if (ring == &link->l_vrings[VIONA_VQ_TX]) { + viona_worker_tx(ring, link); + } else { + panic("unexpected ring: %p", (void *)ring); + } + + VERIFY3U(ring->vr_state, ==, VRS_STOP); + +cleanup: + if (ring->vr_txdesb != NULL) { + /* + * Transmit activity must be entirely concluded before the + * associated descriptors can be cleaned up. 
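+ * In the TX case, viona_worker_tx() has already drained the outstanding
+ * count via viona_tx_wait_outstanding() before returning here, so it must
+ * be zero by the time this VERIFY runs.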
+ */ + VERIFY(ring->vr_xfer_outstanding == 0); + } + viona_ring_misc_free(ring); + + viona_ring_lease_drop(ring); + ring->vr_cur_aidx = 0; + ring->vr_state = VRS_RESET; + ring->vr_state_flags = 0; + ring->vr_worker_thread = NULL; + cv_broadcast(&ring->vr_cv); + mutex_exit(&ring->vr_lock); + + mutex_enter(&ttoproc(curthread)->p_lock); + lwp_exit(); +} + +static kthread_t * +viona_create_worker(viona_vring_t *ring) +{ + k_sigset_t hold_set; + proc_t *p = curproc; + kthread_t *t; + klwp_t *lwp; + + ASSERT(MUTEX_HELD(&ring->vr_lock)); + ASSERT(ring->vr_state == VRS_RESET); + + sigfillset(&hold_set); + lwp = lwp_create(viona_worker, (void *)ring, 0, p, TS_STOPPED, + minclsyspri - 1, &hold_set, curthread->t_cid, 0); + if (lwp == NULL) { + return (NULL); + } + + t = lwptot(lwp); + mutex_enter(&p->p_lock); + t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD; + lwp_create_done(t); + mutex_exit(&p->p_lock); + + return (t); +} + +int +vq_popchain(viona_vring_t *ring, struct iovec *iov, uint_t niov, + uint16_t *cookie) +{ + uint_t i, ndesc, idx, head, next; + struct virtio_desc vdir; + void *buf; + + ASSERT(iov != NULL); + ASSERT(niov > 0 && niov < INT_MAX); + + mutex_enter(&ring->vr_a_mutex); + idx = ring->vr_cur_aidx; + ndesc = (uint16_t)((unsigned)*ring->vr_avail_idx - (unsigned)idx); + + if (ndesc == 0) { + mutex_exit(&ring->vr_a_mutex); + return (0); + } + if (ndesc > ring->vr_size) { + /* + * Despite the fact that the guest has provided an 'avail_idx' + * which indicates that an impossible number of descriptors are + * available, continue on and attempt to process the next one. + * + * The transgression will not escape the probe or stats though. + */ + VIONA_PROBE2(ndesc_too_high, viona_vring_t *, ring, + uint16_t, ndesc); + VIONA_RING_STAT_INCR(ring, ndesc_too_high); + } + + head = ring->vr_avail_ring[idx & ring->vr_mask]; + next = head; + + for (i = 0; i < niov; next = vdir.vd_next) { + if (next >= ring->vr_size) { + VIONA_PROBE2(bad_idx, viona_vring_t *, ring, + uint16_t, next); + VIONA_RING_STAT_INCR(ring, bad_idx); + goto bail; + } + + vdir = ring->vr_descr[next]; + if ((vdir.vd_flags & VRING_DESC_F_INDIRECT) == 0) { + if (vdir.vd_len == 0) { + VIONA_PROBE2(desc_bad_len, + viona_vring_t *, ring, + uint32_t, vdir.vd_len); + VIONA_RING_STAT_INCR(ring, desc_bad_len); + goto bail; + } + buf = viona_gpa2kva(ring, vdir.vd_addr, vdir.vd_len); + if (buf == NULL) { + VIONA_PROBE_BAD_RING_ADDR(ring, vdir.vd_addr); + VIONA_RING_STAT_INCR(ring, bad_ring_addr); + goto bail; + } + iov[i].iov_base = buf; + iov[i].iov_len = vdir.vd_len; + i++; + } else { + const uint_t nindir = vdir.vd_len / 16; + volatile struct virtio_desc *vindir; + + if ((vdir.vd_len & 0xf) || nindir == 0) { + VIONA_PROBE2(indir_bad_len, + viona_vring_t *, ring, + uint32_t, vdir.vd_len); + VIONA_RING_STAT_INCR(ring, indir_bad_len); + goto bail; + } + vindir = viona_gpa2kva(ring, vdir.vd_addr, vdir.vd_len); + if (vindir == NULL) { + VIONA_PROBE_BAD_RING_ADDR(ring, vdir.vd_addr); + VIONA_RING_STAT_INCR(ring, bad_ring_addr); + goto bail; + } + next = 0; + for (;;) { + struct virtio_desc vp; + + /* + * A copy of the indirect descriptor is made + * here, rather than simply using a reference + * pointer. This prevents malicious or + * erroneous guest writes to the descriptor + * from fooling the flags/bounds verification + * through a race. 
+ */ + vp = vindir[next]; + if (vp.vd_flags & VRING_DESC_F_INDIRECT) { + VIONA_PROBE1(indir_bad_nest, + viona_vring_t *, ring); + VIONA_RING_STAT_INCR(ring, + indir_bad_nest); + goto bail; + } else if (vp.vd_len == 0) { + VIONA_PROBE2(desc_bad_len, + viona_vring_t *, ring, + uint32_t, vp.vd_len); + VIONA_RING_STAT_INCR(ring, + desc_bad_len); + goto bail; + } + buf = viona_gpa2kva(ring, vp.vd_addr, + vp.vd_len); + if (buf == NULL) { + VIONA_PROBE_BAD_RING_ADDR(ring, + vp.vd_addr); + VIONA_RING_STAT_INCR(ring, + bad_ring_addr); + goto bail; + } + iov[i].iov_base = buf; + iov[i].iov_len = vp.vd_len; + i++; + + if ((vp.vd_flags & VRING_DESC_F_NEXT) == 0) + break; + if (i >= niov) { + goto loopy; + } + + next = vp.vd_next; + if (next >= nindir) { + VIONA_PROBE3(indir_bad_next, + viona_vring_t *, ring, + uint16_t, next, + uint_t, nindir); + VIONA_RING_STAT_INCR(ring, + indir_bad_next); + goto bail; + } + } + } + if ((vdir.vd_flags & VRING_DESC_F_NEXT) == 0) { + *cookie = head; + ring->vr_cur_aidx++; + mutex_exit(&ring->vr_a_mutex); + return (i); + } + } + +loopy: + VIONA_PROBE1(too_many_desc, viona_vring_t *, ring); + VIONA_RING_STAT_INCR(ring, too_many_desc); +bail: + mutex_exit(&ring->vr_a_mutex); + return (-1); +} + +void +vq_pushchain(viona_vring_t *ring, uint32_t len, uint16_t cookie) +{ + volatile struct virtio_used *vu; + uint_t uidx; + + mutex_enter(&ring->vr_u_mutex); + + uidx = *ring->vr_used_idx; + vu = &ring->vr_used_ring[uidx++ & ring->vr_mask]; + vu->vu_idx = cookie; + vu->vu_tlen = len; + membar_producer(); + *ring->vr_used_idx = uidx; + + mutex_exit(&ring->vr_u_mutex); +} + +void +vq_pushchain_many(viona_vring_t *ring, uint_t num_bufs, used_elem_t *elem) +{ + volatile struct virtio_used *vu; + uint_t uidx, i; + + mutex_enter(&ring->vr_u_mutex); + + uidx = *ring->vr_used_idx; + if (num_bufs == 1) { + vu = &ring->vr_used_ring[uidx++ & ring->vr_mask]; + vu->vu_idx = elem[0].id; + vu->vu_tlen = elem[0].len; + } else { + for (i = 0; i < num_bufs; i++) { + vu = &ring->vr_used_ring[(uidx + i) & ring->vr_mask]; + vu->vu_idx = elem[i].id; + vu->vu_tlen = elem[i].len; + } + uidx = uidx + num_bufs; + } + membar_producer(); + *ring->vr_used_idx = uidx; + + mutex_exit(&ring->vr_u_mutex); +} diff --git a/usr/src/uts/i86pc/io/viona/viona_rx.c b/usr/src/uts/i86pc/io/viona/viona_rx.c new file mode 100644 index 0000000000..1ccbaa63f1 --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_rx.c @@ -0,0 +1,718 @@ +/* + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/strsubr.h> + +#include <sys/dlpi.h> +#include <sys/pattr.h> +#include <sys/vlan.h> + +#include "viona_impl.h" + + + +#define VTNET_MAXSEGS 32 + +/* Min. octets in an ethernet frame minus FCS */ +#define MIN_BUF_SIZE 60 +#define NEED_VLAN_PAD_SIZE (MIN_BUF_SIZE - VLAN_TAGSZ) + +static mblk_t *viona_vlan_pad_mp; + +void +viona_rx_init(void) +{ + mblk_t *mp; + + ASSERT(viona_vlan_pad_mp == NULL); + + /* Create mblk for padding when VLAN tags are stripped */ + mp = allocb_wait(VLAN_TAGSZ, BPRI_HI, STR_NOSIG, NULL); + bzero(mp->b_rptr, VLAN_TAGSZ); + mp->b_wptr += VLAN_TAGSZ; + viona_vlan_pad_mp = mp; +} + +void +viona_rx_fini(void) +{ + mblk_t *mp; + + /* Clean up the VLAN padding mblk */ + mp = viona_vlan_pad_mp; + viona_vlan_pad_mp = NULL; + VERIFY(mp != NULL && mp->b_cont == NULL); + freemsg(mp); +} + +void +viona_worker_rx(viona_vring_t *ring, viona_link_t *link) +{ + proc_t *p = ttoproc(curthread); + + (void) thread_vsetname(curthread, "viona_rx_%p", ring); + + ASSERT(MUTEX_HELD(&ring->vr_lock)); + ASSERT3U(ring->vr_state, ==, VRS_RUN); + + *ring->vr_used_flags |= VRING_USED_F_NO_NOTIFY; + + do { + if (vmm_drv_lease_expired(ring->vr_lease)) { + /* + * Set the renewal flag, causing incoming traffic to be + * dropped, and issue an RX barrier to ensure any + * threads in the RX callbacks will have finished. + * The vr_lock cannot be held across the barrier as it + * poses a deadlock risk. + */ + ring->vr_state_flags |= VRSF_RENEW; + mutex_exit(&ring->vr_lock); + mac_rx_barrier(link->l_mch); + mutex_enter(&ring->vr_lock); + + if (!viona_ring_lease_renew(ring)) { + break; + } + ring->vr_state_flags &= ~VRSF_RENEW; + } + + /* + * For now, there is little to do in the RX worker as inbound + * data is delivered by MAC via the RX callbacks. If tap-like + * functionality is added later, this would be a convenient + * place to inject frames into the guest. + */ + (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock); + } while (!VRING_NEED_BAIL(ring, p)); + + ring->vr_state = VRS_STOP; + + /* + * The RX ring is stopping, before we start tearing it down it + * is imperative that we perform an RX barrier so that + * incoming packets are dropped at viona_rx_classified(). 
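+ * (Both viona_rx_classified() and viona_rx_mcast() check for a ring state
+ * other than VRS_RUN and free the inbound chain in that case, so once the
+ * barrier returns no further frames can be delivered to this ring.)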
+ */ + mutex_exit(&ring->vr_lock); + mac_rx_barrier(link->l_mch); + mutex_enter(&ring->vr_lock); + + *ring->vr_used_flags &= ~VRING_USED_F_NO_NOTIFY; +} + +static size_t +viona_copy_mblk(const mblk_t *mp, size_t seek, caddr_t buf, size_t len, + boolean_t *end) +{ + size_t copied = 0; + size_t off = 0; + + /* Seek past already-consumed data */ + while (seek > 0 && mp != NULL) { + const size_t chunk = MBLKL(mp); + + if (chunk > seek) { + off = seek; + break; + } + mp = mp->b_cont; + seek -= chunk; + } + + while (mp != NULL) { + const size_t chunk = MBLKL(mp) - off; + const size_t to_copy = MIN(chunk, len); + + bcopy(mp->b_rptr + off, buf, to_copy); + copied += to_copy; + buf += to_copy; + len -= to_copy; + + /* + * If all the remaining data in the mblk_t was copied, move on + * to the next one in the chain. Any seek offset applied to + * the first mblk copy is zeroed out for subsequent operations. + */ + if (chunk == to_copy) { + mp = mp->b_cont; + off = 0; + } +#ifdef DEBUG + else { + /* + * The only valid reason for the copy to consume less + * than the entire contents of the mblk_t is because + * the output buffer has been filled. + */ + ASSERT0(len); + } +#endif + + /* Go no further if the buffer has been filled */ + if (len == 0) { + break; + } + + } + *end = (mp == NULL); + return (copied); +} + +static int +viona_recv_plain(viona_vring_t *ring, const mblk_t *mp, size_t msz) +{ + struct iovec iov[VTNET_MAXSEGS]; + uint16_t cookie; + int n; + const size_t hdr_sz = sizeof (struct virtio_net_hdr); + struct virtio_net_hdr *hdr; + size_t len, copied = 0; + caddr_t buf = NULL; + boolean_t end = B_FALSE; + const uint32_t features = ring->vr_link->l_features; + + ASSERT(msz >= MIN_BUF_SIZE); + + n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie); + if (n <= 0) { + /* Without available buffers, the frame must be dropped. */ + return (ENOSPC); + } + if (iov[0].iov_len < hdr_sz) { + /* + * There is little to do if there is not even space available + * for the sole header. Zero the buffer and bail out as a last + * act of desperation. + */ + bzero(iov[0].iov_base, iov[0].iov_len); + goto bad_frame; + } + + /* Grab the address of the header before anything else */ + hdr = (struct virtio_net_hdr *)iov[0].iov_base; + + /* + * If there is any space remaining in the first buffer after writing + * the header, fill it with frame data. + */ + if (iov[0].iov_len > hdr_sz) { + buf = (caddr_t)iov[0].iov_base + hdr_sz; + len = iov[0].iov_len - hdr_sz; + + copied += viona_copy_mblk(mp, copied, buf, len, &end); + } + + /* Copy any remaining data into subsequent buffers, if present */ + for (int i = 1; i < n && !end; i++) { + buf = (caddr_t)iov[i].iov_base; + len = iov[i].iov_len; + + copied += viona_copy_mblk(mp, copied, buf, len, &end); + } + + /* Was the expected amount of data copied? 
*/ + if (copied != msz) { + VIONA_PROBE5(too_short, viona_vring_t *, ring, + uint16_t, cookie, mblk_t *, mp, size_t, copied, + size_t, msz); + VIONA_RING_STAT_INCR(ring, too_short); + goto bad_frame; + } + + /* Populate (read: zero) the header and account for it in the size */ + bzero(hdr, hdr_sz); + copied += hdr_sz; + + /* Add chksum bits, if needed */ + if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) { + uint32_t cksum_flags; + + if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) && + ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) { + hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4; + hdr->vrh_gso_size = DB_LSOMSS(mp); + } + + mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL, + &cksum_flags); + if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) { + hdr->vrh_flags |= VIRTIO_NET_HDR_F_DATA_VALID; + } + } + + /* Release this chain */ + vq_pushchain(ring, copied, cookie); + return (0); + +bad_frame: + VIONA_PROBE3(bad_rx_frame, viona_vring_t *, ring, uint16_t, cookie, + mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, bad_rx_frame); + + vq_pushchain(ring, MAX(copied, MIN_BUF_SIZE + hdr_sz), cookie); + return (EINVAL); +} + +static int +viona_recv_merged(viona_vring_t *ring, const mblk_t *mp, size_t msz) +{ + struct iovec iov[VTNET_MAXSEGS]; + used_elem_t uelem[VTNET_MAXSEGS]; + int n, i = 0, buf_idx = 0, err = 0; + uint16_t cookie; + caddr_t buf; + size_t len, copied = 0, chunk = 0; + struct virtio_net_mrgrxhdr *hdr = NULL; + const size_t hdr_sz = sizeof (struct virtio_net_mrgrxhdr); + boolean_t end = B_FALSE; + const uint32_t features = ring->vr_link->l_features; + + ASSERT(msz >= MIN_BUF_SIZE); + + n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie); + if (n <= 0) { + /* Without available buffers, the frame must be dropped. */ + VIONA_PROBE2(no_space, viona_vring_t *, ring, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, no_space); + return (ENOSPC); + } + if (iov[0].iov_len < hdr_sz) { + /* + * There is little to do if there is not even space available + * for the sole header. Zero the buffer and bail out as a last + * act of desperation. + */ + bzero(iov[0].iov_base, iov[0].iov_len); + uelem[0].id = cookie; + uelem[0].len = iov[0].iov_len; + err = EINVAL; + goto done; + } + + /* Grab the address of the header and do initial population */ + hdr = (struct virtio_net_mrgrxhdr *)iov[0].iov_base; + bzero(hdr, hdr_sz); + hdr->vrh_bufs = 1; + + /* + * If there is any space remaining in the first buffer after writing + * the header, fill it with frame data. + */ + if (iov[0].iov_len > hdr_sz) { + buf = iov[0].iov_base + hdr_sz; + len = iov[0].iov_len - hdr_sz; + + chunk += viona_copy_mblk(mp, copied, buf, len, &end); + copied += chunk; + } + i = 1; + + do { + while (i < n && !end) { + buf = iov[i].iov_base; + len = iov[i].iov_len; + + chunk += viona_copy_mblk(mp, copied, buf, len, &end); + copied += chunk; + i++; + } + + uelem[buf_idx].id = cookie; + uelem[buf_idx].len = chunk; + + /* + * Try to grab another buffer from the ring if the mblk has not + * yet been entirely copied out. + */ + if (!end) { + if (buf_idx == (VTNET_MAXSEGS - 1)) { + /* + * Our arbitrary limit on the number of buffers + * to offer for merge has already been reached. + */ + err = EOVERFLOW; + break; + } + n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie); + if (n <= 0) { + /* + * Without more immediate space to perform the + * copying, there is little choice left but to + * drop the packet. 
+ */ + err = EMSGSIZE; + break; + } + chunk = 0; + i = 0; + buf_idx++; + /* + * Keep the header up-to-date with the number of + * buffers, but never reference its value since the + * guest could meddle with it. + */ + hdr->vrh_bufs++; + } + } while (!end && copied < msz); + + /* Account for the header size in the first buffer */ + uelem[0].len += hdr_sz; + + /* + * If no other errors were encounted during the copy, was the expected + * amount of data transfered? + */ + if (err == 0 && copied != msz) { + VIONA_PROBE5(too_short, viona_vring_t *, ring, + uint16_t, cookie, mblk_t *, mp, size_t, copied, + size_t, msz); + VIONA_RING_STAT_INCR(ring, too_short); + err = EINVAL; + } + + /* Add chksum bits, if needed */ + if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) { + uint32_t cksum_flags; + + if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) && + ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) { + hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4; + hdr->vrh_gso_size = DB_LSOMSS(mp); + } + + mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL, + &cksum_flags); + if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) { + hdr->vrh_flags |= VIRTIO_NET_HDR_F_DATA_VALID; + } + } + +done: + switch (err) { + case 0: + /* Success can fall right through to ring delivery */ + break; + + case EMSGSIZE: + VIONA_PROBE3(rx_merge_underrun, viona_vring_t *, ring, + uint16_t, cookie, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, rx_merge_underrun); + break; + + case EOVERFLOW: + VIONA_PROBE3(rx_merge_overrun, viona_vring_t *, ring, + uint16_t, cookie, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, rx_merge_overrun); + break; + + default: + VIONA_PROBE3(bad_rx_frame, viona_vring_t *, ring, + uint16_t, cookie, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, bad_rx_frame); + } + vq_pushchain_many(ring, buf_idx + 1, uelem); + return (err); +} + +static void +viona_rx_common(viona_vring_t *ring, mblk_t *mp, boolean_t is_loopback) +{ + viona_link_t *link = ring->vr_link; + mblk_t *mprx = NULL, **mprx_prevp = &mprx; + mblk_t *mpdrop = NULL, **mpdrop_prevp = &mpdrop; + const boolean_t do_merge = + ((link->l_features & VIRTIO_NET_F_MRG_RXBUF) != 0); + + size_t nrx = 0, ndrop = 0; + + while (mp != NULL) { + mblk_t *next = mp->b_next; + mblk_t *pad = NULL; + size_t size = msgsize(mp); + int err = 0; + + mp->b_next = NULL; + + /* + * We treat both a 'drop' response and errors the same here + * and put the packet on the drop chain. As packets may be + * subject to different actions in ipf (which do not all + * return the same set of error values), an error processing + * one packet doesn't mean the next packet will also generate + * an error. + */ + if (VNETHOOK_INTERESTED_IN(link->l_neti) && + viona_hook(link, ring, &mp, B_FALSE) != 0) { + if (mp != NULL) { + *mpdrop_prevp = mp; + mpdrop_prevp = &mp->b_next; + } else { + /* + * If the hook consumer (e.g. ipf) already + * freed the mblk_t, update the drop count now. + */ + ndrop++; + } + mp = next; + continue; + } + + /* + * Ethernet frames are expected to be padded out in order to + * meet the minimum size. + * + * A special case is made for frames which are short by + * VLAN_TAGSZ, having been stripped of their VLAN tag while + * traversing MAC. A preallocated (and recycled) mblk is used + * for that specific condition. + * + * All other frames that fall short on length will have custom + * zero-padding allocated appended to them. 
+ */ + if (size == NEED_VLAN_PAD_SIZE) { + ASSERT(MBLKL(viona_vlan_pad_mp) == VLAN_TAGSZ); + ASSERT(viona_vlan_pad_mp->b_cont == NULL); + + for (pad = mp; pad->b_cont != NULL; pad = pad->b_cont) + ; + + pad->b_cont = viona_vlan_pad_mp; + size += VLAN_TAGSZ; + } else if (size < MIN_BUF_SIZE) { + const size_t pad_size = MIN_BUF_SIZE - size; + mblk_t *zero_mp; + + zero_mp = allocb(pad_size, BPRI_MED); + if (zero_mp == NULL) { + err = ENOMEM; + goto pad_drop; + } + + VIONA_PROBE3(rx_pad_short, viona_vring_t *, ring, + mblk_t *, mp, size_t, pad_size); + VIONA_RING_STAT_INCR(ring, rx_pad_short); + zero_mp->b_wptr += pad_size; + bzero(zero_mp->b_rptr, pad_size); + linkb(mp, zero_mp); + size += pad_size; + } + + if (do_merge) { + err = viona_recv_merged(ring, mp, size); + } else { + err = viona_recv_plain(ring, mp, size); + } + + /* + * The VLAN padding mblk is meant for continual reuse, so + * remove it from the chain to prevent it from being freed. + * + * Custom allocated padding does not require this treatment and + * is freed normally. + */ + if (pad != NULL) { + pad->b_cont = NULL; + } + +pad_drop: + /* + * While an error during rx processing + * (viona_recv_{merged,plain}) does not free mp on error, + * hook processing might or might not free mp. Handle either + * scenario -- if mp is not yet free, it is queued up and + * freed after the guest has been notified. If mp is + * already NULL, just proceed on. + */ + if (err != 0) { + *mpdrop_prevp = mp; + mpdrop_prevp = &mp->b_next; + + /* + * If the available ring is empty, do not bother + * attempting to deliver any more frames. Count the + * rest as dropped too. + */ + if (err == ENOSPC) { + mp->b_next = next; + break; + } + } else { + /* Chain successful mblks to be freed later */ + *mprx_prevp = mp; + mprx_prevp = &mp->b_next; + nrx++; + } + mp = next; + } + + membar_enter(); + if ((*ring->vr_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { + viona_intr_ring(ring); + } + + /* Free successfully received frames */ + if (mprx != NULL) { + freemsgchain(mprx); + } + + /* Free dropped frames, also tallying them */ + mp = mpdrop; + while (mp != NULL) { + mblk_t *next = mp->b_next; + + mp->b_next = NULL; + freemsg(mp); + mp = next; + ndrop++; + } + VIONA_PROBE3(rx, viona_link_t *, link, size_t, nrx, size_t, ndrop); +} + +static void +viona_rx_classified(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t is_loopback) +{ + viona_vring_t *ring = (viona_vring_t *)arg; + + /* Drop traffic if ring is inactive or renewing its lease */ + if (ring->vr_state != VRS_RUN || + (ring->vr_state_flags & VRSF_RENEW) != 0) { + freemsgchain(mp); + return; + } + + viona_rx_common(ring, mp, is_loopback); +} + +static void +viona_rx_mcast(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t is_loopback) +{ + viona_vring_t *ring = (viona_vring_t *)arg; + mac_handle_t mh = ring->vr_link->l_mh; + mblk_t *mp_mcast_only = NULL; + mblk_t **mpp = &mp_mcast_only; + + /* Drop traffic if ring is inactive or renewing its lease */ + if (ring->vr_state != VRS_RUN || + (ring->vr_state_flags & VRSF_RENEW) != 0) { + freemsgchain(mp); + return; + } + + /* + * In addition to multicast traffic, broadcast packets will also arrive + * via the MAC_CLIENT_PROMISC_MULTI handler. The mac_rx_set() callback + * for fully-classified traffic has already delivered that broadcast + * traffic, so it should be suppressed here, rather than duplicating it + * to the guest. 
+ */ + while (mp != NULL) { + mblk_t *mp_next; + mac_header_info_t mhi; + int err; + + mp_next = mp->b_next; + mp->b_next = NULL; + + /* Determine the packet type */ + err = mac_vlan_header_info(mh, mp, &mhi); + if (err != 0) { + mblk_t *pull; + + /* + * It is possible that gathering of the header + * information was impeded by a leading mblk_t which + * was of inadequate length to reference the needed + * fields. Try again, in case that could be solved + * with a pull-up. + */ + pull = msgpullup(mp, sizeof (struct ether_vlan_header)); + if (pull == NULL) { + err = ENOMEM; + } else { + err = mac_vlan_header_info(mh, pull, &mhi); + freemsg(pull); + } + + if (err != 0) { + VIONA_RING_STAT_INCR(ring, rx_mcast_check); + } + } + + /* Chain up matching packets while discarding others */ + if (err == 0 && mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) { + *mpp = mp; + mpp = &mp->b_next; + } else { + freemsg(mp); + } + + mp = mp_next; + } + + if (mp_mcast_only != NULL) { + viona_rx_common(ring, mp_mcast_only, is_loopback); + } +} + +int +viona_rx_set(viona_link_t *link) +{ + viona_vring_t *ring = &link->l_vrings[VIONA_VQ_RX]; + int err; + + mac_rx_set(link->l_mch, viona_rx_classified, ring); + err = mac_promisc_add(link->l_mch, MAC_CLIENT_PROMISC_MULTI, + viona_rx_mcast, ring, &link->l_mph, + MAC_PROMISC_FLAGS_NO_TX_LOOP | MAC_PROMISC_FLAGS_VLAN_TAG_STRIP); + if (err != 0) { + mac_rx_clear(link->l_mch); + } + + return (err); +} + +void +viona_rx_clear(viona_link_t *link) +{ + mac_promisc_remove(link->l_mph); + mac_rx_clear(link->l_mch); +} diff --git a/usr/src/uts/i86pc/io/viona/viona_tx.c b/usr/src/uts/i86pc/io/viona/viona_tx.c new file mode 100644 index 0000000000..5dc645723c --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_tx.c @@ -0,0 +1,756 @@ +/* + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + + +#include <sys/types.h> +#include <sys/smt.h> +#include <sys/strsubr.h> + +#include <sys/pattr.h> +#include <sys/dlpi.h> +#include <inet/ip.h> +#include <inet/ip_impl.h> + +#include "viona_impl.h" + +#define BNXE_NIC_DRIVER "bnxe" + +/* + * copy tx mbufs from virtio ring to avoid necessitating a wait for packet + * transmission to free resources. + */ +kmutex_t viona_force_copy_lock; +static enum viona_force_copy { + VFC_UNINITALIZED = 0, + VFC_COPY_UNEEDED = 1, + VFC_COPY_REQUIRED = 2, +} viona_force_copy_state = VFC_UNINITALIZED; + +struct viona_desb { + frtn_t d_frtn; + viona_vring_t *d_ring; + uint_t d_ref; + uint32_t d_len; + uint16_t d_cookie; + uchar_t *d_headers; +}; + +static void viona_tx(viona_link_t *, viona_vring_t *); +static void viona_desb_release(viona_desb_t *); + +/* + * Return the number of available descriptors in the vring taking care of the + * 16-bit index wraparound. + * + * Note: If the number of apparently available descriptors is larger than the + * ring size (due to guest misbehavior), this check will still report the + * positive count of descriptors. + */ +static inline uint_t +viona_vr_num_avail(viona_vring_t *ring) +{ + uint16_t ndesc; + + /* + * We're just computing (a-b) in GF(216). + * + * The only glitch here is that in standard C, uint16_t promotes to + * (signed) int when int has more than 16 bits (almost always now). + * A cast back to unsigned is necessary for proper operation. + */ + ndesc = (unsigned)*ring->vr_avail_idx - (unsigned)ring->vr_cur_aidx; + + return (ndesc); +} + +static void +viona_tx_wait_outstanding(viona_vring_t *ring) +{ + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + while (ring->vr_xfer_outstanding != 0) { + /* + * Paying heed to signals is counterproductive here. This is a + * very tight loop if pending transfers take an extended amount + * of time to be reclaimed while the host process is exiting. + */ + cv_wait(&ring->vr_cv, &ring->vr_lock); + } +} + +/* + * Check if full TX packet copying is needed. This should not be called from + * viona attach()/detach() context. + */ +static boolean_t +viona_tx_copy_needed(void) +{ + boolean_t result; + + mutex_enter(&viona_force_copy_lock); + if (viona_force_copy_state == VFC_UNINITALIZED) { + major_t bnxe_major; + + /* + * The original code for viona featured an explicit check for + * the bnxe driver which, when found present, necessitated that + * all transmissions be copied into their own mblks instead of + * passing guest memory to the underlying device. + * + * The motivations for this are unclear, but until it can be + * proven unnecessary, the check lives on. 
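+ *
+ * When copying is forced, viona_tx_ring_alloc() skips allocating the desb
+ * handles (vr_txdesb remains NULL) and viona_tx() copies frame data into
+ * freshly allocated mblks rather than loaning guest pages to MAC via
+ * desballoc().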
+ */ + viona_force_copy_state = VFC_COPY_UNEEDED; + if ((bnxe_major = ddi_name_to_major(BNXE_NIC_DRIVER)) + != DDI_MAJOR_T_NONE) { + if (ddi_hold_installed_driver(bnxe_major) != NULL) { + viona_force_copy_state = VFC_COPY_REQUIRED; + ddi_rele_driver(bnxe_major); + } + } + } + result = (viona_force_copy_state == VFC_COPY_REQUIRED); + mutex_exit(&viona_force_copy_lock); + + return (result); +} + +void +viona_tx_ring_alloc(viona_vring_t *ring, const uint16_t qsz) +{ + /* Allocate desb handles for TX ring if packet copying not disabled */ + if (!viona_tx_copy_needed()) { + viona_desb_t *dp; + + dp = kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP); + ring->vr_txdesb = dp; + for (uint_t i = 0; i < qsz; i++, dp++) { + dp->d_frtn.free_func = viona_desb_release; + dp->d_frtn.free_arg = (void *)dp; + dp->d_ring = ring; + dp->d_headers = kmem_zalloc(VIONA_MAX_HDRS_LEN, + KM_SLEEP); + } + } + + /* Allocate ring-sized iovec buffers for TX */ + ring->vr_txiov = kmem_alloc(sizeof (struct iovec) * qsz, KM_SLEEP); +} + +void +viona_tx_ring_free(viona_vring_t *ring, const uint16_t qsz) +{ + if (ring->vr_txdesb != NULL) { + viona_desb_t *dp = ring->vr_txdesb; + + for (uint_t i = 0; i < qsz; i++, dp++) { + kmem_free(dp->d_headers, VIONA_MAX_HDRS_LEN); + } + kmem_free(ring->vr_txdesb, sizeof (viona_desb_t) * qsz); + ring->vr_txdesb = NULL; + } + + if (ring->vr_txiov != NULL) { + kmem_free(ring->vr_txiov, sizeof (struct iovec) * qsz); + ring->vr_txiov = NULL; + } +} + +static void +viona_tx_done(viona_vring_t *ring, uint32_t len, uint16_t cookie) +{ + vq_pushchain(ring, len, cookie); + + membar_enter(); + if ((*ring->vr_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { + viona_intr_ring(ring); + } +} + +void +viona_worker_tx(viona_vring_t *ring, viona_link_t *link) +{ + proc_t *p = ttoproc(curthread); + + (void) thread_vsetname(curthread, "viona_tx_%p", ring); + + ASSERT(MUTEX_HELD(&ring->vr_lock)); + ASSERT3U(ring->vr_state, ==, VRS_RUN); + + mutex_exit(&ring->vr_lock); + + for (;;) { + boolean_t bail = B_FALSE; + boolean_t renew = B_FALSE; + uint_t ntx = 0; + + *ring->vr_used_flags |= VRING_USED_F_NO_NOTIFY; + while (viona_vr_num_avail(ring)) { + viona_tx(link, ring); + + /* + * It is advantageous for throughput to keep this + * transmission loop tight, but periodic breaks to + * check for other events are of value too. + */ + if (ntx++ >= ring->vr_size) + break; + } + *ring->vr_used_flags &= ~VRING_USED_F_NO_NOTIFY; + + VIONA_PROBE2(tx, viona_link_t *, link, uint_t, ntx); + + /* + * Check for available descriptors on the ring once more in + * case a late addition raced with the NO_NOTIFY flag toggle. + * + * The barrier ensures that visibility of the vr_used_flags + * store does not cross the viona_vr_num_avail() check below. + */ + membar_enter(); + bail = VRING_NEED_BAIL(ring, p); + renew = vmm_drv_lease_expired(ring->vr_lease); + if (!bail && !renew && viona_vr_num_avail(ring)) { + continue; + } + + if ((link->l_features & VIRTIO_F_RING_NOTIFY_ON_EMPTY) != 0) { + viona_intr_ring(ring); + } + + mutex_enter(&ring->vr_lock); + + while (!bail && !renew && !viona_vr_num_avail(ring)) { + (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock); + bail = VRING_NEED_BAIL(ring, p); + renew = vmm_drv_lease_expired(ring->vr_lease); + } + + if (bail) { + break; + } else if (renew) { + ring->vr_state_flags |= VRSF_RENEW; + /* + * When renewing the lease for the ring, no TX + * frames may be outstanding, as they contain + * references to guest memory. 
+ */ + viona_tx_wait_outstanding(ring); + + if (!viona_ring_lease_renew(ring)) { + break; + } + ring->vr_state_flags &= ~VRSF_RENEW; + } + mutex_exit(&ring->vr_lock); + } + + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + ring->vr_state = VRS_STOP; + viona_tx_wait_outstanding(ring); +} + +static void +viona_desb_release(viona_desb_t *dp) +{ + viona_vring_t *ring = dp->d_ring; + uint_t ref; + uint32_t len; + uint16_t cookie; + + ref = atomic_dec_uint_nv(&dp->d_ref); + if (ref > 1) { + return; + } + + /* + * The desb corresponding to this index must be ready for reuse before + * the descriptor is returned to the guest via the 'used' ring. + */ + len = dp->d_len; + cookie = dp->d_cookie; + dp->d_len = 0; + dp->d_cookie = 0; + dp->d_ref = 0; + + viona_tx_done(ring, len, cookie); + + mutex_enter(&ring->vr_lock); + if ((--ring->vr_xfer_outstanding) == 0) { + cv_broadcast(&ring->vr_cv); + } + mutex_exit(&ring->vr_lock); +} + +static boolean_t +viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr, + mblk_t *mp, uint32_t len) +{ + viona_link_t *link = ring->vr_link; + const struct ether_header *eth; + uint_t eth_len = sizeof (struct ether_header); + ushort_t ftype; + ipha_t *ipha = NULL; + uint8_t ipproto = IPPROTO_NONE; /* NONE is not exactly right, but ok */ + uint16_t flags = 0; + const uint_t csum_start = hdr->vrh_csum_start; + const uint_t csum_stuff = hdr->vrh_csum_offset + csum_start; + + /* + * Validate that the checksum offsets provided by the guest are within + * the bounds of the packet. Additionally, ensure that the checksum + * contents field is within the headers mblk copied by viona_tx(). + */ + if (csum_start >= len || csum_start < eth_len || csum_stuff >= len || + (csum_stuff + sizeof (uint16_t)) > MBLKL(mp)) { + VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, fail_hcksum); + return (B_FALSE); + } + + /* + * This is guaranteed to be safe thanks to the header copying + * done in viona_tx(). + */ + eth = (const struct ether_header *)mp->b_rptr; + ftype = ntohs(eth->ether_type); + + if (ftype == ETHERTYPE_VLAN) { + const struct ether_vlan_header *veth; + + /* punt on QinQ for now */ + eth_len = sizeof (struct ether_vlan_header); + veth = (const struct ether_vlan_header *)eth; + ftype = ntohs(veth->ether_type); + } + + if (ftype == ETHERTYPE_IP) { + ipha = (ipha_t *)(mp->b_rptr + eth_len); + + ipproto = ipha->ipha_protocol; + } else if (ftype == ETHERTYPE_IPV6) { + ip6_t *ip6h = (ip6_t *)(mp->b_rptr + eth_len); + + ipproto = ip6h->ip6_nxt; + } + + /* + * We ignore hdr_len because the spec says it can't be + * trusted. Besides, our own stack will determine the header + * boundary. + */ + if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 && + (hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 && + ftype == ETHERTYPE_IP) { + uint16_t *cksump; + uint32_t cksum; + ipaddr_t src = ipha->ipha_src; + ipaddr_t dst = ipha->ipha_dst; + + /* + * Our native IP stack doesn't set the L4 length field + * of the pseudo header when LSO is in play. Other IP + * stacks, e.g. Linux, do include the length field. + * This is a problem because the hardware expects that + * the length field is not set. When it is set it will + * cause an incorrect TCP checksum to be generated. + * The reason this works in Linux is because Linux + * corrects the pseudo-header checksum in the driver + * code. In order to get the correct HW checksum we + * need to assume the guest's IP stack gave us a bogus + * TCP partial checksum and calculate it ourselves. 
+ */ + cksump = IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha)); + cksum = IP_TCP_CSUM_COMP; + cksum += (dst >> 16) + (dst & 0xFFFF) + + (src >> 16) + (src & 0xFFFF); + cksum = (cksum & 0xFFFF) + (cksum >> 16); + *(cksump) = (cksum & 0xFFFF) + (cksum >> 16); + + /* + * Since viona is a "legacy device", the data stored + * by the driver will be in the guest's native endian + * format (see sections 2.4.3 and 5.1.6.1 of the + * VIRTIO 1.0 spec for more info). At this time the + * only guests using viona are x86 and we can assume + * little-endian. + */ + lso_info_set(mp, LE_16(hdr->vrh_gso_size), HW_LSO); + + /* + * Hardware, like ixgbe, expects the client to request + * IP header checksum offload if it's sending LSO (see + * ixgbe_get_context()). Unfortunately, virtio makes + * no allowances for negotiating IP header checksum + * and HW offload, only TCP checksum. We add the flag + * and zero-out the checksum field. This mirrors the + * behavior of our native IP stack (which does this in + * the interest of HW that expects the field to be + * zero). + */ + flags |= HCK_IPV4_HDRCKSUM; + ipha->ipha_hdr_checksum = 0; + } + + /* + * Use DB_CKSUMFLAGS instead of mac_hcksum_get() to make sure + * HW_LSO, if present, is not lost. + */ + flags |= DB_CKSUMFLAGS(mp); + + /* + * Partial checksum support from the NIC is ideal, since it most + * closely maps to the interface defined by virtio. + */ + if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 && + (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) { + /* + * MAC expects these offsets to be relative to the + * start of the L3 header rather than the L2 frame. + */ + flags |= HCK_PARTIALCKSUM; + mac_hcksum_set(mp, csum_start - eth_len, csum_stuff - eth_len, + len - eth_len, 0, flags); + return (B_TRUE); + } + + /* + * Without partial checksum support, look to the L3/L4 protocol + * information to see if the NIC can handle it. If not, the + * checksum will need to calculated inline. + */ + if (ftype == ETHERTYPE_IP) { + if ((link->l_cap_csum & HCKSUM_INET_FULL_V4) != 0 && + (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) { + uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff); + *csump = 0; + flags |= HCK_FULLCKSUM; + mac_hcksum_set(mp, 0, 0, 0, 0, flags); + return (B_TRUE); + } + + /* XXX: Implement manual fallback checksumming? */ + VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, fail_hcksum); + return (B_FALSE); + } else if (ftype == ETHERTYPE_IPV6) { + if ((link->l_cap_csum & HCKSUM_INET_FULL_V6) != 0 && + (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) { + uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff); + *csump = 0; + flags |= HCK_FULLCKSUM; + mac_hcksum_set(mp, 0, 0, 0, 0, flags); + return (B_TRUE); + } + + /* XXX: Implement manual fallback checksumming? 
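+ * (A software fallback would presumably mean walking the assembled
+ * mblk chain and computing the full L4 checksum inline before
+ * mac_tx(); for now such frames are dropped and counted as
+ * checksum-emulation failures instead.)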
*/ + VIONA_PROBE2(fail_hcksum6, viona_link_t *, link, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, fail_hcksum6); + return (B_FALSE); + } + + /* Cannot even emulate hcksum for unrecognized protocols */ + VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, fail_hcksum_proto); + return (B_FALSE); +} + +static void +viona_tx(viona_link_t *link, viona_vring_t *ring) +{ + struct iovec *iov = ring->vr_txiov; + const uint_t max_segs = ring->vr_size; + uint16_t cookie; + int i, n; + uint32_t len, base_off = 0; + uint32_t min_copy = VIONA_MAX_HDRS_LEN; + mblk_t *mp_head, *mp_tail, *mp; + viona_desb_t *dp = NULL; + mac_client_handle_t link_mch = link->l_mch; + const struct virtio_net_hdr *hdr; + + mp_head = mp_tail = NULL; + + ASSERT(iov != NULL); + + n = vq_popchain(ring, iov, max_segs, &cookie); + if (n == 0) { + VIONA_PROBE1(tx_absent, viona_vring_t *, ring); + VIONA_RING_STAT_INCR(ring, tx_absent); + return; + } else if (n < 0) { + /* + * Any error encountered in vq_popchain has already resulted in + * specific probe and statistic handling. Further action here + * is unnecessary. + */ + return; + } + + /* Grab the header and ensure it is of adequate length */ + hdr = (const struct virtio_net_hdr *)iov[0].iov_base; + len = iov[0].iov_len; + if (len < sizeof (struct virtio_net_hdr)) { + goto drop_fail; + } + + /* Make sure the packet headers are always in the first mblk. */ + if (ring->vr_txdesb != NULL) { + dp = &ring->vr_txdesb[cookie]; + + /* + * If the guest driver is operating properly, each desb slot + * should be available for use when processing a TX descriptor + * from the 'avail' ring. In the case of drivers that reuse a + * descriptor before it has been posted to the 'used' ring, the + * data is simply dropped. + */ + if (atomic_cas_uint(&dp->d_ref, 0, 1) != 0) { + dp = NULL; + goto drop_fail; + } + + dp->d_cookie = cookie; + mp_head = desballoc(dp->d_headers, VIONA_MAX_HDRS_LEN, 0, + &dp->d_frtn); + + /* Account for the successful desballoc. */ + if (mp_head != NULL) + dp->d_ref++; + } else { + mp_head = allocb(VIONA_MAX_HDRS_LEN, 0); + } + + if (mp_head == NULL) + goto drop_fail; + + mp_tail = mp_head; + + /* + * We always copy enough of the guest data to cover the + * headers. This protects us from TOCTOU attacks and allows + * message block length assumptions to be made in subsequent + * code. In many cases, this means copying more data than + * strictly necessary. That's okay, as it is the larger packets + * (such as LSO) that really benefit from desballoc(). + */ + for (i = 1; i < n; i++) { + const uint32_t to_copy = MIN(min_copy, iov[i].iov_len); + + bcopy(iov[i].iov_base, mp_head->b_wptr, to_copy); + mp_head->b_wptr += to_copy; + len += to_copy; + min_copy -= to_copy; + + /* + * We've met the minimum copy requirement. The rest of + * the guest data can be referenced. + */ + if (min_copy == 0) { + /* + * If we copied all contents of this + * descriptor then move onto the next one. + * Otherwise, record how far we are into the + * current descriptor. 
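+ *
+ * (For illustration: if the header copy ends 60 bytes into a
+ * 1500-byte descriptor, base_off becomes 60 and the loop below picks
+ * up the remaining 1440 bytes of that descriptor starting at that
+ * offset.)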
+ */ + if (iov[i].iov_len == to_copy) + i++; + else + base_off = to_copy; + + break; + } + } + + ASSERT3P(mp_head, !=, NULL); + ASSERT3P(mp_tail, !=, NULL); + + for (; i < n; i++) { + uintptr_t base = (uintptr_t)iov[i].iov_base + base_off; + uint32_t chunk = iov[i].iov_len - base_off; + + ASSERT3U(base_off, <, iov[i].iov_len); + ASSERT3U(chunk, >, 0); + + if (dp != NULL) { + mp = desballoc((uchar_t *)base, chunk, 0, &dp->d_frtn); + if (mp == NULL) { + goto drop_fail; + } + dp->d_ref++; + } else { + mp = allocb(chunk, BPRI_MED); + if (mp == NULL) { + goto drop_fail; + } + bcopy((uchar_t *)base, mp->b_wptr, chunk); + } + + base_off = 0; + len += chunk; + mp->b_wptr += chunk; + mp_tail->b_cont = mp; + mp_tail = mp; + } + + if (VNETHOOK_INTERESTED_OUT(link->l_neti)) { + /* + * The hook consumer may elect to free the mblk_t and set + * our mblk_t ** to NULL. When using a viona_desb_t + * (dp != NULL), we do not want the corresponding cleanup to + * occur during the viona_hook() call. We instead want to + * reset and recycle dp for future use. To prevent cleanup + * during the viona_hook() call, we take a ref on dp (if being + * used), and release it on success. On failure, the + * freemsgchain() call will release all the refs taken earlier + * in viona_tx() (aside from the initial ref and the one we + * take), and drop_hook will reset dp for reuse. + */ + if (dp != NULL) + dp->d_ref++; + + /* + * Pass &mp instead of &mp_head so we don't lose track of + * mp_head if the hook consumer (i.e. ipf) elects to free mp + * and set mp to NULL. + */ + mp = mp_head; + if (viona_hook(link, ring, &mp, B_TRUE) != 0) { + if (mp != NULL) + freemsgchain(mp); + goto drop_hook; + } + + if (dp != NULL) { + dp->d_ref--; + + /* + * It is possible that the hook(s) accepted the packet, + * but as part of its processing, it issued a pull-up + * which released all references to the desb. In that + * case, go back to acting like the packet is entirely + * copied (which it is). + */ + if (dp->d_ref == 1) { + dp->d_cookie = 0; + dp->d_ref = 0; + dp = NULL; + } + } + } + + /* + * Request hardware checksumming, if necessary. If the guest + * sent an LSO packet then it must have also negotiated and + * requested partial checksum; therefore the LSO logic is + * contained within viona_tx_csum(). + */ + if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 && + (hdr->vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) { + if (!viona_tx_csum(ring, hdr, mp_head, len - iov[0].iov_len)) { + goto drop_fail; + } + } + + if (dp != NULL) { + dp->d_len = len; + mutex_enter(&ring->vr_lock); + ring->vr_xfer_outstanding++; + mutex_exit(&ring->vr_lock); + } else { + /* + * If the data was cloned out of the ring, the descriptors can + * be marked as 'used' now, rather than deferring that action + * until after successful packet transmission. + */ + viona_tx_done(ring, len, cookie); + } + + /* + * We're potentially going deep into the networking layer; make sure the + * guest can't run concurrently. + */ + smt_begin_unsafe(); + mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL); + smt_end_unsafe(); + return; + +drop_fail: + /* + * On the off chance that memory is not available via the desballoc or + * allocb calls, there are few options left besides to fail and drop + * the frame on the floor. + */ + + if (dp != NULL) { + /* + * Take an additional reference on the desb handle (if present) + * so any desballoc-sourced mblks can release their hold on it + * without the handle reaching its final state and executing + * its clean-up logic. 
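+ *
+ * (With this extra hold in place, the freemsgchain() below releases
+ * only the per-mblk references, so the count settles at 2 -- the
+ * initial reference taken when the desb slot was claimed plus this
+ * one -- which is what the VERIFY() in the drop path expects before
+ * the handle is reset for reuse.)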
+ */ + dp->d_ref++; + } + + /* + * Free any already-allocated blocks and sum up the total length of the + * dropped data to be released to the used ring. + */ + freemsgchain(mp_head); + +drop_hook: + len = 0; + for (uint_t i = 0; i < n; i++) { + len += iov[i].iov_len; + } + + if (dp != NULL) { + VERIFY(dp->d_ref == 2); + + /* Clean up the desb handle, releasing the extra hold. */ + dp->d_len = 0; + dp->d_cookie = 0; + dp->d_ref = 0; + } + + VIONA_PROBE3(tx_drop, viona_vring_t *, ring, uint32_t, len, + uint16_t, cookie); + viona_tx_done(ring, len, cookie); +} diff --git a/usr/src/uts/i86pc/io/vmm/README.sync b/usr/src/uts/i86pc/io/vmm/README.sync new file mode 100644 index 0000000000..1cddfd829e --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/README.sync @@ -0,0 +1,18 @@ +The bhyve kernel module and its associated userland consumers have been updated +to the latest upstream FreeBSD sources as of: + + +commit 3b9cb80b242682690203709aaff4eafae41c138f +Author: jhb <jhb@FreeBSD.org> +Date: Mon Jun 3 23:17:35 2019 +0000 + + Emulate the AMD MSR_LS_CFG MSR used for various Ryzen errata. + + Writes are ignored and reads always return zero. + + Submitted by: José Albornoz <jojo@eljojo.net> (write-only version) + Reviewed by: Patrick Mooney, cem + MFC after: 2 weeks + Differential Revision: https://reviews.freebsd.org/D19506 + +Which corresponds to SVN revision: 348592 diff --git a/usr/src/uts/i86pc/io/vmm/amd/amdv.c b/usr/src/uts/i86pc/io/vmm/amd/amdv.c new file mode 100644 index 0000000000..c34a1e897b --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/amdv.c @@ -0,0 +1,148 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/errno.h> + +#include <machine/vmm.h> +#include "io/iommu.h" + +static int +amd_iommu_init(void) +{ + + printf("amd_iommu_init: not implemented\n"); + return (ENXIO); +} + +static void +amd_iommu_cleanup(void) +{ + + printf("amd_iommu_cleanup: not implemented\n"); +} + +static void +amd_iommu_enable(void) +{ + + printf("amd_iommu_enable: not implemented\n"); +} + +static void +amd_iommu_disable(void) +{ + + printf("amd_iommu_disable: not implemented\n"); +} + +static void * +amd_iommu_create_domain(vm_paddr_t maxaddr) +{ + + printf("amd_iommu_create_domain: not implemented\n"); + return (NULL); +} + +static void +amd_iommu_destroy_domain(void *domain) +{ + + printf("amd_iommu_destroy_domain: not implemented\n"); +} + +static uint64_t +amd_iommu_create_mapping(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, + uint64_t len) +{ + + printf("amd_iommu_create_mapping: not implemented\n"); + return (0); +} + +static uint64_t +amd_iommu_remove_mapping(void *domain, vm_paddr_t gpa, uint64_t len) +{ + + printf("amd_iommu_remove_mapping: not implemented\n"); + return (0); +} + +static void +amd_iommu_add_device(void *domain, uint16_t rid) +{ + + printf("amd_iommu_add_device: not implemented\n"); +} + +static void +amd_iommu_remove_device(void *domain, uint16_t rid) +{ + + printf("amd_iommu_remove_device: not implemented\n"); +} + +static void +amd_iommu_invalidate_tlb(void *domain) +{ + + printf("amd_iommu_invalidate_tlb: not implemented\n"); +} + +struct iommu_ops iommu_ops_amd = { + amd_iommu_init, + amd_iommu_cleanup, + amd_iommu_enable, + amd_iommu_disable, + amd_iommu_create_domain, + amd_iommu_destroy_domain, + amd_iommu_create_mapping, + amd_iommu_remove_mapping, + amd_iommu_add_device, + amd_iommu_remove_device, + amd_iommu_invalidate_tlb, +}; diff --git a/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c b/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c new file mode 100644 index 0000000000..f6b6e60363 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c @@ -0,0 +1,1461 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016, Anish Gupta (anish@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/malloc.h> +#include <sys/pcpu.h> +#include <sys/rman.h> +#include <sys/smp.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <dev/pci/pcivar.h> +#include <dev/pci/pcireg.h> + +#include <machine/resource.h> +#include <machine/vmm.h> +#include <machine/pmap.h> +#include <machine/vmparam.h> +#include <machine/pci_cfgreg.h> + +#include "pcib_if.h" + +#include "io/iommu.h" +#include "amdvi_priv.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, amdvi, CTLFLAG_RW, NULL, NULL); + +#define MOD_INC(a, s, m) (((a) + (s)) % ((m) * (s))) +#define MOD_DEC(a, s, m) (((a) - (s)) % ((m) * (s))) + +/* Print RID or device ID in PCI string format. */ +#define RID2PCI_STR(d) PCI_RID2BUS(d), PCI_RID2SLOT(d), PCI_RID2FUNC(d) + +static void amdvi_dump_cmds(struct amdvi_softc *softc); +static void amdvi_print_dev_cap(struct amdvi_softc *softc); + +MALLOC_DEFINE(M_AMDVI, "amdvi", "amdvi"); + +extern device_t *ivhd_devs; + +extern int ivhd_count; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, count, CTLFLAG_RDTUN, &ivhd_count, + 0, NULL); + +static int amdvi_enable_user = 0; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, enable, CTLFLAG_RDTUN, + &amdvi_enable_user, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi_enable", &amdvi_enable_user); + +#ifdef AMDVI_ATS_ENABLE +/* XXX: ATS is not tested. */ +static int amdvi_enable_iotlb = 1; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, iotlb_enabled, CTLFLAG_RDTUN, + &amdvi_enable_iotlb, 0, NULL); +TUNABLE_INT("hw.vmm.enable_iotlb", &amdvi_enable_iotlb); +#endif + +static int amdvi_host_ptp = 1; /* Use page tables for host. */ +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, host_ptp, CTLFLAG_RDTUN, + &amdvi_host_ptp, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi.host_ptp", &amdvi_host_ptp); + +/* Page table level used <= supported by h/w[v1=7]. */ +static int amdvi_ptp_level = 4; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, ptp_level, CTLFLAG_RDTUN, + &amdvi_ptp_level, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi.ptp_level", &amdvi_ptp_level); + +/* Disable fault event reporting. */ +static int amdvi_disable_io_fault = 0; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, disable_io_fault, CTLFLAG_RDTUN, + &amdvi_disable_io_fault, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi.disable_io_fault", &amdvi_disable_io_fault); + +static uint32_t amdvi_dom_id = 0; /* 0 is reserved for host. */ +SYSCTL_UINT(_hw_vmm_amdvi, OID_AUTO, domain_id, CTLFLAG_RD, + &amdvi_dom_id, 0, NULL); +/* + * Device table entry. + * Bus(256) x Dev(32) x Fun(8) x DTE(256 bits or 32 bytes). + * = 256 * 2 * PAGE_SIZE. + */ +static struct amdvi_dte amdvi_dte[PCI_NUM_DEV_MAX] __aligned(PAGE_SIZE); +CTASSERT(PCI_NUM_DEV_MAX == 0x10000); +CTASSERT(sizeof(amdvi_dte) == 0x200000); + +static SLIST_HEAD (, amdvi_domain) dom_head; + +static inline uint32_t +amdvi_pci_read(struct amdvi_softc *softc, int off) +{ + + return (pci_cfgregread(PCI_RID2BUS(softc->pci_rid), + PCI_RID2SLOT(softc->pci_rid), PCI_RID2FUNC(softc->pci_rid), + off, 4)); +} + +#ifdef AMDVI_ATS_ENABLE +/* XXX: Should be in pci.c */ +/* + * Check if device has ATS capability and its enabled. + * If ATS is absent or disabled, return (-1), otherwise ATS + * queue length. 
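+ *
+ * (As decoded below, the low five bits of the ATS capability register
+ * hold the invalidate queue depth, with a raw value of zero treated
+ * as 32 entries.)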
+ */ +static int +amdvi_find_ats_qlen(uint16_t devid) +{ + device_t dev; + uint32_t off, cap; + int qlen = -1; + + dev = pci_find_bsf(PCI_RID2BUS(devid), PCI_RID2SLOT(devid), + PCI_RID2FUNC(devid)); + + if (!dev) { + return (-1); + } +#define PCIM_ATS_EN BIT(31) + + if (pci_find_extcap(dev, PCIZ_ATS, &off) == 0) { + cap = pci_read_config(dev, off + 4, 4); + qlen = (cap & 0x1F); + qlen = qlen ? qlen : 32; + printf("AMD-Vi: PCI device %d.%d.%d ATS %s qlen=%d\n", + RID2PCI_STR(devid), + (cap & PCIM_ATS_EN) ? "enabled" : "Disabled", + qlen); + qlen = (cap & PCIM_ATS_EN) ? qlen : -1; + } + + return (qlen); +} + +/* + * Check if an endpoint device support device IOTLB or ATS. + */ +static inline bool +amdvi_dev_support_iotlb(struct amdvi_softc *softc, uint16_t devid) +{ + struct ivhd_dev_cfg *cfg; + int qlen, i; + bool pci_ats, ivhd_ats; + + qlen = amdvi_find_ats_qlen(devid); + if (qlen < 0) + return (false); + + KASSERT(softc, ("softc is NULL")); + cfg = softc->dev_cfg; + + ivhd_ats = false; + for (i = 0; i < softc->dev_cfg_cnt; i++) { + if ((cfg->start_id <= devid) && (cfg->end_id >= devid)) { + ivhd_ats = cfg->enable_ats; + break; + } + cfg++; + } + + pci_ats = (qlen < 0) ? false : true; + if (pci_ats != ivhd_ats) + device_printf(softc->dev, + "BIOS bug: mismatch in ATS setting for %d.%d.%d," + "ATS inv qlen = %d\n", RID2PCI_STR(devid), qlen); + + /* Ignore IVRS setting and respect PCI setting. */ + return (pci_ats); +} +#endif + +/* Enable IOTLB support for IOMMU if its supported. */ +static inline void +amdvi_hw_enable_iotlb(struct amdvi_softc *softc) +{ +#ifndef AMDVI_ATS_ENABLE + softc->iotlb = false; +#else + bool supported; + + supported = (softc->ivhd_flag & IVHD_FLAG_IOTLB) ? true : false; + + if (softc->pci_cap & AMDVI_PCI_CAP_IOTLB) { + if (!supported) + device_printf(softc->dev, "IOTLB disabled by BIOS.\n"); + + if (supported && !amdvi_enable_iotlb) { + device_printf(softc->dev, "IOTLB disabled by user.\n"); + supported = false; + } + } else + supported = false; + + softc->iotlb = supported; + +#endif +} + +static int +amdvi_init_cmd(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl = softc->ctrl; + + ctrl->cmd.len = 8; /* Use 256 command buffer entries. */ + softc->cmd_max = 1 << ctrl->cmd.len; + + softc->cmd = malloc(sizeof(struct amdvi_cmd) * + softc->cmd_max, M_AMDVI, M_WAITOK | M_ZERO); + + if ((uintptr_t)softc->cmd & PAGE_MASK) + panic("AMDVi: Command buffer not aligned on page boundary."); + + ctrl->cmd.base = vtophys(softc->cmd) / PAGE_SIZE; + /* + * XXX: Reset the h/w pointers in case IOMMU is restarting, + * h/w doesn't clear these pointers based on empirical data. + */ + ctrl->cmd_tail = 0; + ctrl->cmd_head = 0; + + return (0); +} + +/* + * Note: Update tail pointer after we have written the command since tail + * pointer update cause h/w to execute new commands, see section 3.3 + * of AMD IOMMU spec ver 2.0. + */ +/* Get the command tail pointer w/o updating it. */ +static struct amdvi_cmd * +amdvi_get_cmd_tail(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_cmd *tail; + + KASSERT(softc, ("softc is NULL")); + KASSERT(softc->cmd != NULL, ("cmd is NULL")); + + ctrl = softc->ctrl; + KASSERT(ctrl != NULL, ("ctrl is NULL")); + + tail = (struct amdvi_cmd *)((uint8_t *)softc->cmd + + ctrl->cmd_tail); + + return (tail); +} + +/* + * Update the command tail pointer which will start command execution. 
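+ *
+ * (Worked example, assuming the defaults set in amdvi_init_cmd() --
+ * cmd.len = 8, hence cmd_max = 256 -- and the 16-byte struct
+ * amdvi_cmd: MOD_INC(tail, 16, 256) == (tail + 16) % 4096, so the
+ * tail advances one entry at a time and wraps to 0 at the 4 KB end
+ * of the command buffer.)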
+ */ +static void +amdvi_update_cmd_tail(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + int size; + + size = sizeof(struct amdvi_cmd); + KASSERT(softc->cmd != NULL, ("cmd is NULL")); + + ctrl = softc->ctrl; + KASSERT(ctrl != NULL, ("ctrl is NULL")); + + ctrl->cmd_tail = MOD_INC(ctrl->cmd_tail, size, softc->cmd_max); + softc->total_cmd++; + +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "cmd_tail: %s Tail:0x%x, Head:0x%x.\n", + ctrl->cmd_tail, + ctrl->cmd_head); +#endif + +} + +/* + * Various commands supported by IOMMU. + */ + +/* Completion wait command. */ +static void +amdvi_cmd_cmp(struct amdvi_softc *softc, const uint64_t data) +{ + struct amdvi_cmd *cmd; + uint64_t pa; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + + pa = vtophys(&softc->cmp_data); + cmd->opcode = AMDVI_CMP_WAIT_OPCODE; + cmd->word0 = (pa & 0xFFFFFFF8) | + (AMDVI_CMP_WAIT_STORE); + //(AMDVI_CMP_WAIT_FLUSH | AMDVI_CMP_WAIT_STORE); + cmd->word1 = (pa >> 32) & 0xFFFFF; + cmd->addr = data; + + amdvi_update_cmd_tail(softc); +} + +/* Invalidate device table entry. */ +static void +amdvi_cmd_inv_dte(struct amdvi_softc *softc, uint16_t devid) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + cmd->opcode = AMDVI_INVD_DTE_OPCODE; + cmd->word0 = devid; + amdvi_update_cmd_tail(softc); +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidated DTE:0x%x\n", devid); +#endif +} + +/* Invalidate IOMMU page, use for invalidation of domain. */ +static void +amdvi_cmd_inv_iommu_pages(struct amdvi_softc *softc, uint16_t domain_id, + uint64_t addr, bool guest_nested, + bool pde, bool page) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + + + cmd->opcode = AMDVI_INVD_PAGE_OPCODE; + cmd->word1 = domain_id; + /* + * Invalidate all addresses for this domain. + */ + cmd->addr = addr; + cmd->addr |= pde ? AMDVI_INVD_PAGE_PDE : 0; + cmd->addr |= page ? AMDVI_INVD_PAGE_S : 0; + + amdvi_update_cmd_tail(softc); +} + +#ifdef AMDVI_ATS_ENABLE +/* Invalidate device IOTLB. */ +static void +amdvi_cmd_inv_iotlb(struct amdvi_softc *softc, uint16_t devid) +{ + struct amdvi_cmd *cmd; + int qlen; + + if (!softc->iotlb) + return; + + qlen = amdvi_find_ats_qlen(devid); + if (qlen < 0) { + panic("AMDVI: Invalid ATS qlen(%d) for device %d.%d.%d\n", + qlen, RID2PCI_STR(devid)); + } + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidate IOTLB devID 0x%x" + " Qlen:%d\n", devid, qlen); +#endif + cmd->opcode = AMDVI_INVD_IOTLB_OPCODE; + cmd->word0 = devid; + cmd->word1 = qlen; + cmd->addr = AMDVI_INVD_IOTLB_ALL_ADDR | + AMDVI_INVD_IOTLB_S; + amdvi_update_cmd_tail(softc); +} +#endif + +#ifdef notyet /* For Interrupt Remap. */ +static void +amdvi_cmd_inv_intr_map(struct amdvi_softc *softc, + uint16_t devid) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + cmd->opcode = AMDVI_INVD_INTR_OPCODE; + cmd->word0 = devid; + amdvi_update_cmd_tail(softc); +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidate INTR map of devID 0x%x\n", devid); +#endif +} +#endif + +/* Invalidate domain using INVALIDATE_IOMMU_PAGES command. 
*/ +static void +amdvi_inv_domain(struct amdvi_softc *softc, uint16_t domain_id) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + + /* + * See section 3.3.3 of IOMMU spec rev 2.0, software note + * for invalidating domain. + */ + amdvi_cmd_inv_iommu_pages(softc, domain_id, AMDVI_INVD_PAGE_ALL_ADDR, + false, true, true); + +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidate domain:0x%x\n", domain_id); + +#endif +} + +static bool +amdvi_cmp_wait(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + const uint64_t VERIFY = 0xA5A5; + volatile uint64_t *read; + int i; + bool status; + + ctrl = softc->ctrl; + read = &softc->cmp_data; + *read = 0; + amdvi_cmd_cmp(softc, VERIFY); + /* Wait for h/w to update completion data. */ + for (i = 0; i < 100 && (*read != VERIFY); i++) { + DELAY(1000); /* 1 ms */ + } + status = (VERIFY == softc->cmp_data) ? true : false; + +#ifdef AMDVI_DEBUG_CMD + if (status) + device_printf(softc->dev, "CMD completion DONE Tail:0x%x, " + "Head:0x%x, loop:%d.\n", ctrl->cmd_tail, + ctrl->cmd_head, loop); +#endif + return (status); +} + +static void +amdvi_wait(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + int i; + + KASSERT(softc, ("softc is NULL")); + + ctrl = softc->ctrl; + KASSERT(ctrl != NULL, ("ctrl is NULL")); + /* Don't wait if h/w is not enabled. */ + if ((ctrl->control & AMDVI_CTRL_EN) == 0) + return; + + for (i = 0; i < 10; i++) { + if (amdvi_cmp_wait(softc)) + return; + } + + device_printf(softc->dev, "Error: completion failed" + " tail:0x%x, head:0x%x.\n", + ctrl->cmd_tail, ctrl->cmd_head); + amdvi_dump_cmds(softc); +} + +static void +amdvi_dump_cmds(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_cmd *cmd; + int off, i; + + ctrl = softc->ctrl; + device_printf(softc->dev, "Dump all the commands:\n"); + /* + * If h/w is stuck in completion, it is the previous command, + * start dumping from previous command onward. + */ + off = MOD_DEC(ctrl->cmd_head, sizeof(struct amdvi_cmd), + softc->cmd_max); + for (i = 0; off != ctrl->cmd_tail && + i < softc->cmd_max; i++) { + cmd = (struct amdvi_cmd *)((uint8_t *)softc->cmd + off); + printf(" [CMD%d, off:0x%x] opcode= 0x%x 0x%x" + " 0x%x 0x%lx\n", i, off, cmd->opcode, + cmd->word0, cmd->word1, cmd->addr); + off = (off + sizeof(struct amdvi_cmd)) % + (softc->cmd_max * sizeof(struct amdvi_cmd)); + } +} + +static int +amdvi_init_event(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + + ctrl = softc->ctrl; + ctrl->event.len = 8; + softc->event_max = 1 << ctrl->event.len; + softc->event = malloc(sizeof(struct amdvi_event) * + softc->event_max, M_AMDVI, M_WAITOK | M_ZERO); + if ((uintptr_t)softc->event & PAGE_MASK) { + device_printf(softc->dev, "Event buffer not aligned on page."); + return (false); + } + ctrl->event.base = vtophys(softc->event) / PAGE_SIZE; + + /* Reset the pointers. 
*/ + ctrl->evt_head = 0; + ctrl->evt_tail = 0; + + return (0); +} + +static inline void +amdvi_decode_evt_flag(uint16_t flag) +{ + + flag &= AMDVI_EVENT_FLAG_MASK; + printf(" 0x%b]\n", flag, + "\020" + "\001GN" + "\002NX" + "\003US" + "\004I" + "\005PR" + "\006RW" + "\007PE" + "\010RZ" + "\011TR" + ); +} + +/* See section 2.5.4 of AMD IOMMU spec ver 2.62.*/ +static inline void +amdvi_decode_evt_flag_type(uint8_t type) +{ + + switch (AMDVI_EVENT_FLAG_TYPE(type)) { + case 0: + printf("RSVD\n"); + break; + case 1: + printf("Master Abort\n"); + break; + case 2: + printf("Target Abort\n"); + break; + case 3: + printf("Data Err\n"); + break; + default: + break; + } +} + +static void +amdvi_decode_inv_dte_evt(uint16_t devid, uint16_t domid, uint64_t addr, + uint16_t flag) +{ + + printf("\t[IO_PAGE_FAULT EVT: devId:0x%x DomId:0x%x" + " Addr:0x%lx", + devid, domid, addr); + amdvi_decode_evt_flag(flag); +} + +static void +amdvi_decode_pf_evt(uint16_t devid, uint16_t domid, uint64_t addr, + uint16_t flag) +{ + + printf("\t[IO_PAGE_FAULT EVT: devId:0x%x DomId:0x%x" + " Addr:0x%lx", + devid, domid, addr); + amdvi_decode_evt_flag(flag); +} + +static void +amdvi_decode_dte_hwerr_evt(uint16_t devid, uint16_t domid, + uint64_t addr, uint16_t flag) +{ + + printf("\t[DEV_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x" + " Addr:0x%lx", devid, domid, addr); + amdvi_decode_evt_flag(flag); + amdvi_decode_evt_flag_type(flag); +} + +static void +amdvi_decode_page_hwerr_evt(uint16_t devid, uint16_t domid, uint64_t addr, + uint16_t flag) +{ + + printf("\t[PAGE_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x" + " Addr:0x%lx", devid, domid, addr); + amdvi_decode_evt_flag(flag); + amdvi_decode_evt_flag_type(AMDVI_EVENT_FLAG_TYPE(flag)); +} + +static void +amdvi_decode_evt(struct amdvi_event *evt) +{ + struct amdvi_cmd *cmd; + + switch (evt->opcode) { + case AMDVI_EVENT_INVALID_DTE: + amdvi_decode_inv_dte_evt(evt->devid, evt->pasid_domid, + evt->addr, evt->flag); + break; + + case AMDVI_EVENT_PFAULT: + amdvi_decode_pf_evt(evt->devid, evt->pasid_domid, + evt->addr, evt->flag); + break; + + case AMDVI_EVENT_DTE_HW_ERROR: + amdvi_decode_dte_hwerr_evt(evt->devid, evt->pasid_domid, + evt->addr, evt->flag); + break; + + case AMDVI_EVENT_PAGE_HW_ERROR: + amdvi_decode_page_hwerr_evt(evt->devid, evt->pasid_domid, + evt->addr, evt->flag); + break; + + case AMDVI_EVENT_ILLEGAL_CMD: + /* FALL THROUGH */ + case AMDVI_EVENT_CMD_HW_ERROR: + printf("\t[%s EVT]\n", (evt->opcode == AMDVI_EVENT_ILLEGAL_CMD) ? 
+ "ILLEGAL CMD" : "CMD HW ERR"); + cmd = (struct amdvi_cmd *)PHYS_TO_DMAP(evt->addr); + printf("\tCMD opcode= 0x%x 0x%x 0x%x 0x%lx\n", + cmd->opcode, cmd->word0, cmd->word1, cmd->addr); + break; + + case AMDVI_EVENT_IOTLB_TIMEOUT: + printf("\t[IOTLB_INV_TIMEOUT devid:0x%x addr:0x%lx]\n", + evt->devid, evt->addr); + break; + + case AMDVI_EVENT_INVALID_DTE_REQ: + printf("\t[INV_DTE devid:0x%x addr:0x%lx type:0x%x tr:%d]\n", + evt->devid, evt->addr, evt->flag >> 9, + (evt->flag >> 8) & 1); + break; + + case AMDVI_EVENT_INVALID_PPR_REQ: + case AMDVI_EVENT_COUNTER_ZERO: + printf("AMD-Vi: v2 events.\n"); + break; + + default: + printf("Unsupported AMD-Vi event:%d\n", evt->opcode); + } +} + +static void +amdvi_print_events(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_event *event; + int i, size; + + ctrl = softc->ctrl; + size = sizeof(struct amdvi_event); + for (i = 0; i < softc->event_max; i++) { + event = &softc->event[ctrl->evt_head / size]; + if (!event->opcode) + break; + device_printf(softc->dev, "\t[Event%d: Head:0x%x Tail:0x%x]\n", + i, ctrl->evt_head, ctrl->evt_tail); + amdvi_decode_evt(event); + ctrl->evt_head = MOD_INC(ctrl->evt_head, size, + softc->event_max); + } +} + +static int +amdvi_init_dte(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + + ctrl = softc->ctrl; + ctrl->dte.base = vtophys(amdvi_dte) / PAGE_SIZE; + ctrl->dte.size = 0x1FF; /* 2MB device table. */ + + return (0); +} + +/* + * Not all capabilities of IOMMU are available in ACPI IVHD flag + * or EFR entry, read directly from device. + */ +static int +amdvi_print_pci_cap(device_t dev) +{ + struct amdvi_softc *softc; + uint32_t off, cap; + + + softc = device_get_softc(dev); + off = softc->cap_off; + + /* + * Section 3.7.1 of IOMMU sepc rev 2.0. + * Read capability from device. + */ + cap = amdvi_pci_read(softc, off); + + /* Make sure capability type[18:16] is 3. 
*/ + KASSERT((((cap >> 16) & 0x7) == 0x3), + ("Not a IOMMU capability 0x%x@0x%x", cap, off)); + + softc->pci_cap = cap >> 24; + device_printf(softc->dev, "PCI cap 0x%x@0x%x feature:%b\n", + cap, off, softc->pci_cap, + "\20\1IOTLB\2HT\3NPCache\4EFR\5CapExt"); + + return (0); +} + +static void +amdvi_event_intr(void *arg) +{ + struct amdvi_softc *softc; + struct amdvi_ctrl *ctrl; + + softc = (struct amdvi_softc *)arg; + ctrl = softc->ctrl; + device_printf(softc->dev, "EVT INTR %ld Status:0x%x" + " EVT Head:0x%x Tail:0x%x]\n", softc->event_intr_cnt++, + ctrl->status, ctrl->evt_head, ctrl->evt_tail); + printf(" [CMD Total 0x%lx] Tail:0x%x, Head:0x%x.\n", + softc->total_cmd, ctrl->cmd_tail, ctrl->cmd_head); + + amdvi_print_events(softc); + ctrl->status &= AMDVI_STATUS_EV_OF | AMDVI_STATUS_EV_INTR; +} + +static void +amdvi_free_evt_intr_res(device_t dev) +{ + + struct amdvi_softc *softc; + + softc = device_get_softc(dev); + if (softc->event_tag != NULL) { + bus_teardown_intr(dev, softc->event_res, softc->event_tag); + } + if (softc->event_res != NULL) { + bus_release_resource(dev, SYS_RES_IRQ, softc->event_rid, + softc->event_res); + } + bus_delete_resource(dev, SYS_RES_IRQ, softc->event_rid); + PCIB_RELEASE_MSI(device_get_parent(device_get_parent(dev)), + dev, 1, &softc->event_irq); +} + +static bool +amdvi_alloc_intr_resources(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + device_t dev, pcib; + device_t mmio_dev; + uint64_t msi_addr; + uint32_t msi_data; + int err; + + dev = softc->dev; + pcib = device_get_parent(device_get_parent(dev)); + mmio_dev = pci_find_bsf(PCI_RID2BUS(softc->pci_rid), + PCI_RID2SLOT(softc->pci_rid), PCI_RID2FUNC(softc->pci_rid)); + if (device_is_attached(mmio_dev)) { + device_printf(dev, + "warning: IOMMU device is claimed by another driver %s\n", + device_get_driver(mmio_dev)->name); + } + + softc->event_irq = -1; + softc->event_rid = 0; + + /* + * Section 3.7.1 of IOMMU rev 2.0. With MSI, there is only one + * interrupt. XXX: Enable MSI/X support. + */ + err = PCIB_ALLOC_MSI(pcib, dev, 1, 1, &softc->event_irq); + if (err) { + device_printf(dev, + "Couldn't find event MSI IRQ resource.\n"); + return (ENOENT); + } + + err = bus_set_resource(dev, SYS_RES_IRQ, softc->event_rid, + softc->event_irq, 1); + if (err) { + device_printf(dev, "Couldn't set event MSI resource.\n"); + return (ENXIO); + } + + softc->event_res = bus_alloc_resource_any(dev, SYS_RES_IRQ, + &softc->event_rid, RF_ACTIVE); + if (!softc->event_res) { + device_printf(dev, + "Unable to allocate event INTR resource.\n"); + return (ENOMEM); + } + + if (bus_setup_intr(dev, softc->event_res, + INTR_TYPE_MISC | INTR_MPSAFE, NULL, amdvi_event_intr, + softc, &softc->event_tag)) { + device_printf(dev, "Fail to setup event intr\n"); + bus_release_resource(softc->dev, SYS_RES_IRQ, + softc->event_rid, softc->event_res); + softc->event_res = NULL; + return (ENXIO); + } + + bus_describe_intr(dev, softc->event_res, softc->event_tag, + "fault"); + + err = PCIB_MAP_MSI(pcib, dev, softc->event_irq, &msi_addr, + &msi_data); + if (err) { + device_printf(dev, + "Event interrupt config failed, err=%d.\n", + err); + amdvi_free_evt_intr_res(softc->dev); + return (err); + } + + /* Clear interrupt status bits. */ + ctrl = softc->ctrl; + ctrl->status &= AMDVI_STATUS_EV_OF | AMDVI_STATUS_EV_INTR; + + /* Now enable MSI interrupt. 
*/ + pci_enable_msi(mmio_dev, msi_addr, msi_data); + return (0); +} + + +static void +amdvi_print_dev_cap(struct amdvi_softc *softc) +{ + struct ivhd_dev_cfg *cfg; + int i; + + cfg = softc->dev_cfg; + for (i = 0; i < softc->dev_cfg_cnt; i++) { + device_printf(softc->dev, "device [0x%x - 0x%x]" + "config:%b%s\n", cfg->start_id, cfg->end_id, + cfg->data, + "\020\001INIT\002ExtInt\003NMI" + "\007LINT0\008LINT1", + cfg->enable_ats ? "ATS enabled" : ""); + cfg++; + } +} + +static int +amdvi_handle_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct amdvi_softc *softc; + int result, type, error = 0; + + softc = (struct amdvi_softc *)arg1; + type = arg2; + + switch (type) { + case 0: + result = softc->ctrl->cmd_head; + error = sysctl_handle_int(oidp, &result, 0, + req); + break; + case 1: + result = softc->ctrl->cmd_tail; + error = sysctl_handle_int(oidp, &result, 0, + req); + break; + case 2: + result = softc->ctrl->evt_head; + error = sysctl_handle_int(oidp, &result, 0, + req); + break; + case 3: + result = softc->ctrl->evt_tail; + error = sysctl_handle_int(oidp, &result, 0, + req); + break; + + default: + device_printf(softc->dev, "Unknown sysctl:%d\n", type); + } + + return (error); +} + +static void +amdvi_add_sysctl(struct amdvi_softc *softc) +{ + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + device_t dev; + + dev = softc->dev; + ctx = device_get_sysctl_ctx(dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "event_intr_count", CTLFLAG_RD, + &softc->event_intr_cnt, "Event interrupt count"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "command_count", CTLFLAG_RD, + &softc->total_cmd, "Command submitted count"); + SYSCTL_ADD_U16(ctx, child, OID_AUTO, "pci_rid", CTLFLAG_RD, + &softc->pci_rid, 0, "IOMMU RID"); + SYSCTL_ADD_U16(ctx, child, OID_AUTO, "start_dev_rid", CTLFLAG_RD, + &softc->start_dev_rid, 0, "Start of device under this IOMMU"); + SYSCTL_ADD_U16(ctx, child, OID_AUTO, "end_dev_rid", CTLFLAG_RD, + &softc->end_dev_rid, 0, "End of device under this IOMMU"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_head", + CTLTYPE_UINT | CTLFLAG_RD, softc, 0, + amdvi_handle_sysctl, "IU", "Command head"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_tail", + CTLTYPE_UINT | CTLFLAG_RD, softc, 1, + amdvi_handle_sysctl, "IU", "Command tail"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_head", + CTLTYPE_UINT | CTLFLAG_RD, softc, 2, + amdvi_handle_sysctl, "IU", "Command head"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_tail", + CTLTYPE_UINT | CTLFLAG_RD, softc, 3, + amdvi_handle_sysctl, "IU", "Command tail"); +} + +int +amdvi_setup_hw(struct amdvi_softc *softc) +{ + device_t dev; + int status; + + dev = softc->dev; + + amdvi_hw_enable_iotlb(softc); + + amdvi_print_dev_cap(softc); + + if ((status = amdvi_print_pci_cap(dev)) != 0) { + device_printf(dev, "PCI capability.\n"); + return (status); + } + if ((status = amdvi_init_cmd(softc)) != 0) { + device_printf(dev, "Couldn't configure command buffer.\n"); + return (status); + } + if ((status = amdvi_init_event(softc)) != 0) { + device_printf(dev, "Couldn't configure event buffer.\n"); + return (status); + } + if ((status = amdvi_init_dte(softc)) != 0) { + device_printf(dev, "Couldn't configure device table.\n"); + return (status); + } + if ((status = amdvi_alloc_intr_resources(softc)) != 0) { + return (status); + } + amdvi_add_sysctl(softc); + return (0); +} + +int +amdvi_teardown_hw(struct amdvi_softc *softc) +{ + device_t dev; + + dev = softc->dev; + + /* + * Called after disable, h/w 
is stopped by now, free all the resources. + */ + amdvi_free_evt_intr_res(dev); + + if (softc->cmd) + free(softc->cmd, M_AMDVI); + + if (softc->event) + free(softc->event, M_AMDVI); + + return (0); +} + +/*********** bhyve interfaces *********************/ +static int +amdvi_init(void) +{ + if (!ivhd_count) { + return (EIO); + } + if (!amdvi_enable_user && ivhd_count) { + printf("bhyve: Found %d AMD-Vi/IOMMU device(s), " + "use hw.vmm.amdvi.enable=1 to enable pass-through.\n", + ivhd_count); + return (EINVAL); + } + return (0); +} + +static void +amdvi_cleanup(void) +{ + /* Nothing. */ +} + +static uint16_t +amdvi_domainId(void) +{ + + /* + * If we hit maximum domain limit, rollover leaving host + * domain(0). + * XXX: make sure that this domain is not used. + */ + if (amdvi_dom_id == AMDVI_MAX_DOMAIN) + amdvi_dom_id = 1; + + return ((uint16_t)amdvi_dom_id++); +} + +static void +amdvi_do_inv_domain(uint16_t domain_id, bool create) +{ + struct amdvi_softc *softc; + int i; + + for (i = 0; i < ivhd_count; i++) { + softc = device_get_softc(ivhd_devs[i]); + KASSERT(softc, ("softc is NULL")); + /* + * If not present pages are cached, invalidate page after + * creating domain. + */ +#if 0 + if (create && ((softc->pci_cap & AMDVI_PCI_CAP_NPCACHE) == 0)) + continue; +#endif + amdvi_inv_domain(softc, domain_id); + amdvi_wait(softc); + } +} + +static void * +amdvi_create_domain(vm_paddr_t maxaddr) +{ + struct amdvi_domain *dom; + + dom = malloc(sizeof(struct amdvi_domain), M_AMDVI, M_ZERO | M_WAITOK); + dom->id = amdvi_domainId(); + //dom->maxaddr = maxaddr; +#ifdef AMDVI_DEBUG_CMD + printf("Created domain #%d\n", dom->id); +#endif + /* + * Host domain(#0) don't create translation table. + */ + if (dom->id || amdvi_host_ptp) + dom->ptp = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO); + + dom->ptp_level = amdvi_ptp_level; + + amdvi_do_inv_domain(dom->id, true); + SLIST_INSERT_HEAD(&dom_head, dom, next); + + return (dom); +} + +static void +amdvi_free_ptp(uint64_t *ptp, int level) +{ + int i; + + if (level < 1) + return; + + for (i = 0; i < NPTEPG ; i++) { + if ((ptp[i] & AMDVI_PT_PRESENT) == 0) + continue; + /* XXX: Add super-page or PTE mapping > 4KB. */ +#ifdef notyet + /* Super-page mapping. 
*/ + if (AMDVI_PD_SUPER(ptp[i])) + continue; +#endif + + amdvi_free_ptp((uint64_t *)PHYS_TO_DMAP(ptp[i] + & AMDVI_PT_MASK), level - 1); + + } + + free(ptp, M_AMDVI); +} + +static void +amdvi_destroy_domain(void *arg) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; + KASSERT(domain, ("domain is NULL")); +#ifdef AMDVI_DEBUG_CMD + printf("Destroying domain %d\n", domain->id); +#endif + if (domain->ptp) + amdvi_free_ptp(domain->ptp, domain->ptp_level); + + amdvi_do_inv_domain(domain->id, false); + SLIST_REMOVE(&dom_head, domain, amdvi_domain, next); + free(domain, M_AMDVI); +} + +static uint64_t +amdvi_set_pt(uint64_t *pt, int level, vm_paddr_t gpa, + vm_paddr_t hpa, uint64_t pg_size, bool create) +{ + uint64_t *page, pa; + int shift, index; + const int PT_SHIFT = 9; + const int PT_INDEX_MASK = (1 << PT_SHIFT) - 1; /* Based on PT_SHIFT */ + + if (!pg_size) + return (0); + + if (hpa & (pg_size - 1)) { + printf("HPA is not size aligned.\n"); + return (0); + } + if (gpa & (pg_size - 1)) { + printf("HPA is not size aligned.\n"); + return (0); + } + shift = PML4SHIFT; + while ((shift > PAGE_SHIFT) && (pg_size < (1UL << shift))) { + index = (gpa >> shift) & PT_INDEX_MASK; + + if ((pt[index] == 0) && create) { + page = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO); + pa = vtophys(page); + pt[index] = pa | AMDVI_PT_PRESENT | AMDVI_PT_RW | + ((level - 1) << AMDVI_PD_LEVEL_SHIFT); + } +#ifdef AMDVI_DEBUG_PTE + if ((gpa % 0x1000000) == 0) + printf("[level%d, shift = %d]PTE:0x%lx\n", + level, shift, pt[index]); +#endif +#define PTE2PA(x) ((uint64_t)(x) & AMDVI_PT_MASK) + pa = PTE2PA(pt[index]); + pt = (uint64_t *)PHYS_TO_DMAP(pa); + shift -= PT_SHIFT; + level--; + } + + /* Leaf entry. */ + index = (gpa >> shift) & PT_INDEX_MASK; + + if (create) { + pt[index] = hpa | AMDVI_PT_RW | AMDVI_PT_PRESENT; + } else + pt[index] = 0; + +#ifdef AMDVI_DEBUG_PTE + if ((gpa % 0x1000000) == 0) + printf("[Last level%d, shift = %d]PTE:0x%lx\n", + level, shift, pt[index]); +#endif + return (1ULL << shift); +} + +static uint64_t +amdvi_update_mapping(struct amdvi_domain *domain, vm_paddr_t gpa, + vm_paddr_t hpa, uint64_t size, bool create) +{ + uint64_t mapped, *ptp, len; + int level; + + KASSERT(domain, ("domain is NULL")); + level = domain->ptp_level; + KASSERT(level, ("Page table level is 0")); + + ptp = domain->ptp; + KASSERT(ptp, ("PTP is NULL")); + mapped = 0; + while (mapped < size) { + len = amdvi_set_pt(ptp, level, gpa + mapped, hpa + mapped, + PAGE_SIZE, create); + if (!len) { + printf("Error: Couldn't map HPA:0x%lx GPA:0x%lx\n", + hpa, gpa); + return (0); + } + mapped += len; + } + + return (mapped); +} + +static uint64_t +amdvi_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, + uint64_t len) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; + + if (domain->id && !domain->ptp) { + printf("ptp is NULL"); + return (-1); + } + + /* + * If host domain is created w/o page table, skip IOMMU page + * table set-up. + */ + if (domain->ptp) + return (amdvi_update_mapping(domain, gpa, hpa, len, true)); + else + return (len); +} + +static uint64_t +amdvi_destroy_mapping(void *arg, vm_paddr_t gpa, uint64_t len) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; + /* + * If host domain is created w/o page table, skip IOMMU page + * table set-up. 
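+ *
+ * (With no page table present -- the host domain case -- nothing is
+ * actually unmapped and the full length is simply reported back to
+ * the caller.)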
+ */ + if (domain->ptp) + return (amdvi_update_mapping(domain, gpa, 0, len, false)); + return + (len); +} + +static struct amdvi_softc * +amdvi_find_iommu(uint16_t devid) +{ + struct amdvi_softc *softc; + int i; + + for (i = 0; i < ivhd_count; i++) { + softc = device_get_softc(ivhd_devs[i]); + if ((devid >= softc->start_dev_rid) && + (devid <= softc->end_dev_rid)) + return (softc); + } + + /* + * XXX: BIOS bug, device not in IVRS table, assume its from first IOMMU. + */ + printf("BIOS bug device(%d.%d.%d) doesn't have IVHD entry.\n", + RID2PCI_STR(devid)); + + return (device_get_softc(ivhd_devs[0])); +} + +/* + * Set-up device table entry. + * IOMMU spec Rev 2.0, section 3.2.2.2, some of the fields must + * be set concurrently, e.g. read and write bits. + */ +static void +amdvi_set_dte(struct amdvi_domain *domain, uint16_t devid, bool enable) +{ + struct amdvi_softc *softc; + struct amdvi_dte* temp; + + KASSERT(domain, ("domain is NULL for pci_rid:0x%x\n", devid)); + + softc = amdvi_find_iommu(devid); + KASSERT(softc, ("softc is NULL for pci_rid:0x%x\n", devid)); + + temp = &amdvi_dte[devid]; + +#ifdef AMDVI_ATS_ENABLE + /* If IOMMU and device support IOTLB, enable it. */ + if (amdvi_dev_support_iotlb(softc, devid) && softc->iotlb) + temp->iotlb_enable = 1; +#endif + + /* Avoid duplicate I/O faults. */ + temp->sup_second_io_fault = 1; + temp->sup_all_io_fault = amdvi_disable_io_fault; + + temp->dt_valid = 1; + temp->domain_id = domain->id; + + if (enable) { + if (domain->ptp) { + temp->pt_base = vtophys(domain->ptp) >> 12; + temp->pt_level = amdvi_ptp_level; + } + /* + * XXX: Page table valid[TV] bit must be set even if host domain + * page tables are not enabled. + */ + temp->pt_valid = 1; + temp->read_allow = 1; + temp->write_allow = 1; + } +} + +static void +amdvi_inv_device(uint16_t devid) +{ + struct amdvi_softc *softc; + + softc = amdvi_find_iommu(devid); + KASSERT(softc, ("softc is NULL")); + + amdvi_cmd_inv_dte(softc, devid); +#ifdef AMDVI_ATS_ENABLE + if (amdvi_dev_support_iotlb(softc, devid)) + amdvi_cmd_inv_iotlb(softc, devid); +#endif + amdvi_wait(softc); +} + +static void +amdvi_add_device(void *arg, uint16_t devid) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; + KASSERT(domain != NULL, ("domain is NULL")); +#ifdef AMDVI_DEBUG_CMD + printf("Assigning device(%d.%d.%d) to domain:%d\n", + RID2PCI_STR(devid), domain->id); +#endif + amdvi_set_dte(domain, devid, true); + amdvi_inv_device(devid); +} + +static void +amdvi_remove_device(void *arg, uint16_t devid) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; +#ifdef AMDVI_DEBUG_CMD + printf("Remove device(0x%x) from domain:%d\n", + devid, domain->id); +#endif + amdvi_set_dte(domain, devid, false); + amdvi_inv_device(devid); +} + +static void +amdvi_enable(void) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_softc *softc; + uint64_t val; + int i; + + for (i = 0; i < ivhd_count; i++) { + softc = device_get_softc(ivhd_devs[i]); + KASSERT(softc, ("softc is NULL\n")); + ctrl = softc->ctrl; + KASSERT(ctrl, ("ctrl is NULL\n")); + + val = ( AMDVI_CTRL_EN | + AMDVI_CTRL_CMD | + AMDVI_CTRL_ELOG | + AMDVI_CTRL_ELOGINT | + AMDVI_CTRL_INV_TO_1S); + + if (softc->ivhd_flag & IVHD_FLAG_COH) + val |= AMDVI_CTRL_COH; + if (softc->ivhd_flag & IVHD_FLAG_HTT) + val |= AMDVI_CTRL_HTT; + if (softc->ivhd_flag & IVHD_FLAG_RPPW) + val |= AMDVI_CTRL_RPPW; + if (softc->ivhd_flag & IVHD_FLAG_PPW) + val |= AMDVI_CTRL_PPW; + if (softc->ivhd_flag & IVHD_FLAG_ISOC) + val |= AMDVI_CTRL_ISOC; + + ctrl->control = 
val; + } +} + +static void +amdvi_disable(void) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_softc *softc; + int i; + + for (i = 0; i < ivhd_count; i++) { + softc = device_get_softc(ivhd_devs[i]); + KASSERT(softc, ("softc is NULL\n")); + ctrl = softc->ctrl; + KASSERT(ctrl, ("ctrl is NULL\n")); + + ctrl->control = 0; + } +} + +static void +amdvi_inv_tlb(void *arg) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; + KASSERT(domain, ("domain is NULL")); + amdvi_do_inv_domain(domain->id, false); +} + +struct iommu_ops iommu_ops_amd = { + amdvi_init, + amdvi_cleanup, + amdvi_enable, + amdvi_disable, + amdvi_create_domain, + amdvi_destroy_domain, + amdvi_create_mapping, + amdvi_destroy_mapping, + amdvi_add_device, + amdvi_remove_device, + amdvi_inv_tlb +}; diff --git a/usr/src/uts/i86pc/io/vmm/amd/amdvi_priv.h b/usr/src/uts/i86pc/io/vmm/amd/amdvi_priv.h new file mode 100644 index 0000000000..6ee6c36632 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/amdvi_priv.h @@ -0,0 +1,431 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Anish Gupta (anish@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _AMDVI_PRIV_H_ +#define _AMDVI_PRIV_H_ + +#include <contrib/dev/acpica/include/acpi.h> + +#define BIT(n) (1ULL << (n)) +/* Return value of bits[n:m] where n and (n >= ) m are bit positions. */ +#define REG_BITS(x, n, m) (((x) >> (m)) & \ + ((1 << (((n) - (m)) + 1)) - 1)) + +/* + * IOMMU PCI capability. + */ +#define AMDVI_PCI_CAP_IOTLB BIT(0) /* IOTLB is supported. */ +#define AMDVI_PCI_CAP_HT BIT(1) /* HyperTransport tunnel support. */ +#define AMDVI_PCI_CAP_NPCACHE BIT(2) /* Not present page cached. */ +#define AMDVI_PCI_CAP_EFR BIT(3) /* Extended features. */ +#define AMDVI_PCI_CAP_EXT BIT(4) /* Miscellaneous information reg. */ + +/* + * IOMMU extended features. + */ +#define AMDVI_EX_FEA_PREFSUP BIT(0) /* Prefetch command support. */ +#define AMDVI_EX_FEA_PPRSUP BIT(1) /* PPR support */ +#define AMDVI_EX_FEA_XTSUP BIT(2) /* Reserved */ +#define AMDVI_EX_FEA_NXSUP BIT(3) /* No-execute. */ +#define AMDVI_EX_FEA_GTSUP BIT(4) /* Guest translation support. */ +#define AMDVI_EX_FEA_EFRW BIT(5) /* Reserved */ +#define AMDVI_EX_FEA_IASUP BIT(6) /* Invalidate all command supp. 
*/ +#define AMDVI_EX_FEA_GASUP BIT(7) /* Guest APIC or AVIC support. */ +#define AMDVI_EX_FEA_HESUP BIT(8) /* Hardware Error. */ +#define AMDVI_EX_FEA_PCSUP BIT(9) /* Performance counters support. */ +/* XXX: add more EFER bits. */ + +/* + * Device table entry or DTE + * NOTE: Must be 256-bits/32 bytes aligned. + */ +struct amdvi_dte { + uint32_t dt_valid:1; /* Device Table valid. */ + uint32_t pt_valid:1; /* Page translation valid. */ + uint16_t :7; /* Reserved[8:2] */ + uint8_t pt_level:3; /* Paging level, 0 to disable. */ + uint64_t pt_base:40; /* Page table root pointer. */ + uint8_t :3; /* Reserved[54:52] */ + uint8_t gv_valid:1; /* Revision 2, GVA to SPA. */ + uint8_t gv_level:2; /* Revision 2, GLX level. */ + uint8_t gv_cr3_lsb:3; /* Revision 2, GCR3[14:12] */ + uint8_t read_allow:1; /* I/O read enabled. */ + uint8_t write_allow:1; /* I/O write enabled. */ + uint8_t :1; /* Reserved[63] */ + uint16_t domain_id:16; /* Domain ID */ + uint16_t gv_cr3_lsb2:16; /* Revision 2, GCR3[30:15] */ + uint8_t iotlb_enable:1; /* Device support IOTLB */ + uint8_t sup_second_io_fault:1; /* Suppress subsequent I/O faults. */ + uint8_t sup_all_io_fault:1; /* Suppress all I/O page faults. */ + uint8_t IOctl:2; /* Port I/O control. */ + uint8_t iotlb_cache_disable:1; /* IOTLB cache hints. */ + uint8_t snoop_disable:1; /* Snoop disable. */ + uint8_t allow_ex:1; /* Allow exclusion. */ + uint8_t sysmgmt:2; /* System management message.*/ + uint8_t :1; /* Reserved[106] */ + uint32_t gv_cr3_msb:21; /* Revision 2, GCR3[51:31] */ + uint8_t intmap_valid:1; /* Interrupt map valid. */ + uint8_t intmap_len:4; /* Interrupt map table length. */ + uint8_t intmap_ign:1; /* Ignore unmapped interrupts. */ + uint64_t intmap_base:46; /* IntMap base. */ + uint8_t :4; /* Reserved[183:180] */ + uint8_t init_pass:1; /* INIT pass through or PT */ + uint8_t extintr_pass:1; /* External Interrupt PT */ + uint8_t nmi_pass:1; /* NMI PT */ + uint8_t :1; /* Reserved[187] */ + uint8_t intr_ctrl:2; /* Interrupt control */ + uint8_t lint0_pass:1; /* LINT0 PT */ + uint8_t lint1_pass:1; /* LINT1 PT */ + uint64_t :64; /* Reserved[255:192] */ +} __attribute__((__packed__)); +CTASSERT(sizeof(struct amdvi_dte) == 32); + +/* + * IOMMU command entry. + */ +struct amdvi_cmd { + uint32_t word0; + uint32_t word1:28; + uint8_t opcode:4; + uint64_t addr; +} __attribute__((__packed__)); + +/* Command opcodes. */ +#define AMDVI_CMP_WAIT_OPCODE 0x1 /* Completion wait. */ +#define AMDVI_INVD_DTE_OPCODE 0x2 /* Invalidate device table entry. */ +#define AMDVI_INVD_PAGE_OPCODE 0x3 /* Invalidate pages. */ +#define AMDVI_INVD_IOTLB_OPCODE 0x4 /* Invalidate IOTLB pages. */ +#define AMDVI_INVD_INTR_OPCODE 0x5 /* Invalidate Interrupt table. */ +#define AMDVI_PREFETCH_PAGES_OPCODE 0x6 /* Prefetch IOMMU pages. */ +#define AMDVI_COMP_PPR_OPCODE 0x7 /* Complete PPR request. */ +#define AMDVI_INV_ALL_OPCODE 0x8 /* Invalidate all. */ + +/* Completion wait attributes. */ +#define AMDVI_CMP_WAIT_STORE BIT(0) /* Write back data. */ +#define AMDVI_CMP_WAIT_INTR BIT(1) /* Completion wait interrupt. */ +#define AMDVI_CMP_WAIT_FLUSH BIT(2) /* Flush queue. */ + +/* Invalidate page. */ +#define AMDVI_INVD_PAGE_S BIT(0) /* Invalidation size. */ +#define AMDVI_INVD_PAGE_PDE BIT(1) /* Invalidate PDE. */ +#define AMDVI_INVD_PAGE_GN_GVA BIT(2) /* GPA or GVA. */ + +#define AMDVI_INVD_PAGE_ALL_ADDR (0x7FFFFFFFFFFFFULL << 12) + +/* Invalidate IOTLB. */ +#define AMDVI_INVD_IOTLB_S BIT(0) /* Invalidation size 4k or addr */ +#define AMDVI_INVD_IOTLB_GN_GVA BIT(2) /* GPA or GVA. 
*/ + +#define AMDVI_INVD_IOTLB_ALL_ADDR (0x7FFFFFFFFFFFFULL << 12) +/* XXX: add more command entries. */ + +/* + * IOMMU event entry. + */ +struct amdvi_event { + uint16_t devid; + uint16_t pasid_hi; + uint16_t pasid_domid; /* PASID low or DomainID */ + uint16_t flag:12; + uint8_t opcode:4; + uint64_t addr; +} __attribute__((__packed__)); +CTASSERT(sizeof(struct amdvi_event) == 16); + +/* Various event types. */ +#define AMDVI_EVENT_INVALID_DTE 0x1 +#define AMDVI_EVENT_PFAULT 0x2 +#define AMDVI_EVENT_DTE_HW_ERROR 0x3 +#define AMDVI_EVENT_PAGE_HW_ERROR 0x4 +#define AMDVI_EVENT_ILLEGAL_CMD 0x5 +#define AMDVI_EVENT_CMD_HW_ERROR 0x6 +#define AMDVI_EVENT_IOTLB_TIMEOUT 0x7 +#define AMDVI_EVENT_INVALID_DTE_REQ 0x8 +#define AMDVI_EVENT_INVALID_PPR_REQ 0x9 +#define AMDVI_EVENT_COUNTER_ZERO 0xA + +#define AMDVI_EVENT_FLAG_MASK 0x1FF /* Mask for event flags. */ +#define AMDVI_EVENT_FLAG_TYPE(x) (((x) >> 9) & 0x3) + +/* + * IOMMU control block. + */ +struct amdvi_ctrl { + struct { + uint16_t size:9; + uint16_t :3; + uint64_t base:40; /* Devtable register base. */ + uint16_t :12; + } dte; + struct { + uint16_t :12; + uint64_t base:40; + uint8_t :4; + uint8_t len:4; + uint8_t :4; + } cmd; + struct { + uint16_t :12; + uint64_t base:40; + uint8_t :4; + uint8_t len:4; + uint8_t :4; + } event; + uint16_t control :13; + uint64_t :51; + struct { + uint8_t enable:1; + uint8_t allow:1; + uint16_t :10; + uint64_t base:40; + uint16_t :12; + uint16_t :12; + uint64_t limit:40; + uint16_t :12; + } excl; + /* + * Revision 2 only. + */ + uint64_t ex_feature; + struct { + uint16_t :12; + uint64_t base:40; + uint8_t :4; + uint8_t len:4; + uint8_t :4; + } ppr; + uint64_t first_event; + uint64_t second_event; + uint64_t event_status; + /* Revision 2 only, end. */ + uint8_t pad1[0x1FA8]; /* Padding. */ + uint32_t cmd_head:19; + uint64_t :45; + uint32_t cmd_tail:19; + uint64_t :45; + uint32_t evt_head:19; + uint64_t :45; + uint32_t evt_tail:19; + uint64_t :45; + uint32_t status:19; + uint64_t :45; + uint64_t pad2; + uint8_t :4; + uint16_t ppr_head:15; + uint64_t :45; + uint8_t :4; + uint16_t ppr_tail:15; + uint64_t :45; + uint8_t pad3[0x1FC0]; /* Padding. */ + + /* XXX: More for rev2. */ +} __attribute__((__packed__)); +CTASSERT(offsetof(struct amdvi_ctrl, pad1)== 0x58); +CTASSERT(offsetof(struct amdvi_ctrl, pad2)== 0x2028); +CTASSERT(offsetof(struct amdvi_ctrl, pad3)== 0x2040); + +#define AMDVI_MMIO_V1_SIZE (4 * PAGE_SIZE) /* v1 size */ +/* + * AMF IOMMU v2 size including event counters + */ +#define AMDVI_MMIO_V2_SIZE (8 * PAGE_SIZE) + +CTASSERT(sizeof(struct amdvi_ctrl) == 0x4000); +CTASSERT(sizeof(struct amdvi_ctrl) == AMDVI_MMIO_V1_SIZE); + +/* IVHD flag */ +#define IVHD_FLAG_HTT BIT(0) /* Hypertransport Tunnel. */ +#define IVHD_FLAG_PPW BIT(1) /* Pass posted write. */ +#define IVHD_FLAG_RPPW BIT(2) /* Response pass posted write. */ +#define IVHD_FLAG_ISOC BIT(3) /* Isoc support. */ +#define IVHD_FLAG_IOTLB BIT(4) /* IOTLB support. */ +#define IVHD_FLAG_COH BIT(5) /* Coherent control, default 1 */ +#define IVHD_FLAG_PFS BIT(6) /* Prefetch IOMMU pages. */ +#define IVHD_FLAG_PPRS BIT(7) /* Peripheral page support. */ + +/* IVHD device entry data setting. */ +#define IVHD_DEV_LINT0_PASS BIT(6) /* LINT0 interrupts. */ +#define IVHD_DEV_LINT1_PASS BIT(7) /* LINT1 interrupts. */ + +/* Bit[5:4] for System Mgmt. Bit3 is reserved. */ +#define IVHD_DEV_INIT_PASS BIT(0) /* INIT */ +#define IVHD_DEV_EXTINTR_PASS BIT(1) /* ExtInt */ +#define IVHD_DEV_NMI_PASS BIT(2) /* NMI */ + +/* IVHD 8-byte extended data settings. 
*/ +#define IVHD_DEV_EXT_ATS_DISABLE BIT(31) /* Disable ATS */ + +/* IOMMU control register. */ +#define AMDVI_CTRL_EN BIT(0) /* IOMMU enable. */ +#define AMDVI_CTRL_HTT BIT(1) /* Hypertransport tunnel enable. */ +#define AMDVI_CTRL_ELOG BIT(2) /* Event log enable. */ +#define AMDVI_CTRL_ELOGINT BIT(3) /* Event log interrupt. */ +#define AMDVI_CTRL_COMINT BIT(4) /* Completion wait interrupt. */ +#define AMDVI_CTRL_PPW BIT(8) +#define AMDVI_CTRL_RPPW BIT(9) +#define AMDVI_CTRL_COH BIT(10) +#define AMDVI_CTRL_ISOC BIT(11) +#define AMDVI_CTRL_CMD BIT(12) /* Command buffer enable. */ +#define AMDVI_CTRL_PPRLOG BIT(13) +#define AMDVI_CTRL_PPRINT BIT(14) +#define AMDVI_CTRL_PPREN BIT(15) +#define AMDVI_CTRL_GTE BIT(16) /* Guest translation enable. */ +#define AMDVI_CTRL_GAE BIT(17) /* Guest APIC enable. */ + +/* Invalidation timeout. */ +#define AMDVI_CTRL_INV_NO_TO 0 /* No timeout. */ +#define AMDVI_CTRL_INV_TO_1ms 1 /* 1 ms */ +#define AMDVI_CTRL_INV_TO_10ms 2 /* 10 ms */ +#define AMDVI_CTRL_INV_TO_100ms 3 /* 100 ms */ +#define AMDVI_CTRL_INV_TO_1S 4 /* 1 second */ +#define AMDVI_CTRL_INV_TO_10S 5 /* 10 seconds */ +#define AMDVI_CTRL_INV_TO_100S 6 /* 100 seconds */ + +/* + * Max number of PCI devices. + * 256 buses x 32 slots/devices x 8 functions. + */ +#define PCI_NUM_DEV_MAX 0x10000 + +/* Maximum number of domains supported by IOMMU. */ +#define AMDVI_MAX_DOMAIN (BIT(16) - 1) + +/* + * IOMMU Page Table attributes. + */ +#define AMDVI_PT_PRESENT BIT(0) +#define AMDVI_PT_COHERENT BIT(60) +#define AMDVI_PT_READ BIT(61) +#define AMDVI_PT_WRITE BIT(62) + +#define AMDVI_PT_RW (AMDVI_PT_READ | AMDVI_PT_WRITE) +#define AMDVI_PT_MASK 0xFFFFFFFFFF000UL /* Only [51:12] for PA */ + +#define AMDVI_PD_LEVEL_SHIFT 9 +#define AMDVI_PD_SUPER(x) (((x) >> AMDVI_PD_LEVEL_SHIFT) == 7) +/* + * IOMMU Status, offset 0x2020 + */ +#define AMDVI_STATUS_EV_OF BIT(0) /* Event overflow. */ +#define AMDVI_STATUS_EV_INTR BIT(1) /* Event interrupt. */ +/* Completion wait command completed. */ +#define AMDVI_STATUS_CMP BIT(2) + +#define IVRS_CTRL_RID 1 /* MMIO RID */ + +/* ACPI IVHD */ +struct ivhd_dev_cfg { + uint32_t start_id; + uint32_t end_id; + uint8_t data; /* Device configuration. */ + bool enable_ats; /* ATS enabled for the device. */ + int ats_qlen; /* ATS invalidation queue depth. */ +}; + +struct amdvi_domain { + uint64_t *ptp; /* Highest level page table */ + int ptp_level; /* Level of page tables */ + u_int id; /* Domain id */ + SLIST_ENTRY (amdvi_domain) next; +}; + +/* + * I/O Virtualization Hardware Definition Block (IVHD) type 0x10 (legacy) + * uses the ACPI_IVRS_HARDWARE define in contrib/dev/acpica/include/actbl2.h. + * The new IVHD types 0x11 and 0x40 as defined in the AMD IOMMU spec[48882] are missing + * from the ACPI code. These new types add an extra field, EFR (Extended Feature Register). + * XXX: Use the definition from ACPI when it is available. + */ +typedef struct acpi_ivrs_hardware_efr_sup +{ + ACPI_IVRS_HEADER Header; + UINT16 CapabilityOffset; /* Offset for IOMMU control fields */ + UINT64 BaseAddress; /* IOMMU control registers */ + UINT16 PciSegmentGroup; + UINT16 Info; /* MSI number and unit ID */ + UINT32 Attr; /* IOMMU Feature */ + UINT64 ExtFR; /* IOMMU Extended Feature */ + UINT64 Reserved; /* v1 feature or v2 attribute */ +} __attribute__ ((__packed__)) ACPI_IVRS_HARDWARE_EFRSUP; +CTASSERT(sizeof(ACPI_IVRS_HARDWARE_EFRSUP) == 40); + +/* + * Different types of IVHD. + * XXX: Use AcpiIvrsType once new IVHD types are available. + */ +enum IvrsType +{ + IVRS_TYPE_HARDWARE_LEGACY = 0x10, /* Legacy without EFR support. 
*/ + IVRS_TYPE_HARDWARE_EFR = 0x11, /* With EFR support. */ + IVRS_TYPE_HARDWARE_MIXED = 0x40, /* Mixed with EFR support. */ +}; + +/* + * AMD IOMMU softc. + */ +struct amdvi_softc { + struct amdvi_ctrl *ctrl; /* Control area. */ + device_t dev; /* IOMMU device. */ + enum IvrsType ivhd_type; /* IOMMU IVHD type. */ + bool iotlb; /* IOTLB supported by IOMMU */ + struct amdvi_cmd *cmd; /* Command descriptor area. */ + int cmd_max; /* Max number of commands. */ + uint64_t cmp_data; /* Command completion write back. */ + struct amdvi_event *event; /* Event descriptor area. */ + struct resource *event_res; /* Event interrupt resource. */ + void *event_tag; /* Event interrupt tag. */ + int event_max; /* Max number of events. */ + int event_irq; + int event_rid; + /* ACPI various flags. */ + uint32_t ivhd_flag; /* ACPI IVHD flag. */ + uint32_t ivhd_feature; /* ACPI v1 Reserved or v2 attribute. */ + uint64_t ext_feature; /* IVHD EFR */ + /* PCI related. */ + uint16_t cap_off; /* PCI Capability offset. */ + uint8_t pci_cap; /* PCI capability. */ + uint16_t pci_seg; /* IOMMU PCI domain/segment. */ + uint16_t pci_rid; /* PCI BDF of IOMMU */ + /* Device range under this IOMMU. */ + uint16_t start_dev_rid; /* First device under this IOMMU. */ + uint16_t end_dev_rid; /* Last device under this IOMMU. */ + + /* BIOS provided device configuration for end points. */ + struct ivhd_dev_cfg dev_cfg[10]; + int dev_cfg_cnt; + + /* Software statistics. */ + uint64_t event_intr_cnt; /* Total event INTR count. */ + uint64_t total_cmd; /* Total number of commands. */ +}; + +int amdvi_setup_hw(struct amdvi_softc *softc); +int amdvi_teardown_hw(struct amdvi_softc *softc); +#endif /* _AMDVI_PRIV_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c b/usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c new file mode 100644 index 0000000000..370c20fb01 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c @@ -0,0 +1,735 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016, Anish Gupta (anish@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_acpi.h" +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/malloc.h> + +#include <machine/vmparam.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <contrib/dev/acpica/include/acpi.h> +#include <contrib/dev/acpica/include/accommon.h> +#include <dev/acpica/acpivar.h> + +#include "io/iommu.h" +#include "amdvi_priv.h" + +device_t *ivhd_devs; /* IVHD or AMD-Vi device list. */ +int ivhd_count; /* Number of IVHD header. */ +/* + * Cached IVHD header list. + * Single entry for each IVHD, filtered the legacy one. + */ +ACPI_IVRS_HARDWARE *ivhd_hdrs[10]; + +extern int amdvi_ptp_level; /* Page table levels. */ + +typedef int (*ivhd_iter_t)(ACPI_IVRS_HEADER *ptr, void *arg); +/* + * Iterate IVRS table for IVHD and IVMD device type. + */ +static void +ivrs_hdr_iterate_tbl(ivhd_iter_t iter, void *arg) +{ + ACPI_TABLE_IVRS *ivrs; + ACPI_IVRS_HEADER *ivrs_hdr, *end; + ACPI_STATUS status; + + status = AcpiGetTable(ACPI_SIG_IVRS, 1, (ACPI_TABLE_HEADER **)&ivrs); + if (ACPI_FAILURE(status)) + return; + + if (ivrs->Header.Length == 0) { + return; + } + + ivrs_hdr = (ACPI_IVRS_HEADER *)(ivrs + 1); + end = (ACPI_IVRS_HEADER *)((char *)ivrs + ivrs->Header.Length); + + while (ivrs_hdr < end) { + if ((uint8_t *)ivrs_hdr + ivrs_hdr->Length > (uint8_t *)end) { + printf("AMD-Vi:IVHD/IVMD is corrupted, length : %d\n", + ivrs_hdr->Length); + break; + } + + switch (ivrs_hdr->Type) { + case IVRS_TYPE_HARDWARE_LEGACY: /* Legacy */ + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + if (!iter(ivrs_hdr, arg)) + return; + break; + + case ACPI_IVRS_TYPE_MEMORY1: + case ACPI_IVRS_TYPE_MEMORY2: + case ACPI_IVRS_TYPE_MEMORY3: + if (!iter(ivrs_hdr, arg)) + return; + + break; + + default: + printf("AMD-Vi:Not IVHD/IVMD type(%d)", ivrs_hdr->Type); + + } + + ivrs_hdr = (ACPI_IVRS_HEADER *)((uint8_t *)ivrs_hdr + + ivrs_hdr->Length); + } +} + +static bool +ivrs_is_ivhd(UINT8 type) +{ + + switch(type) { + case IVRS_TYPE_HARDWARE_LEGACY: + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + return (true); + + default: + return (false); + } +} + +/* Count the number of AMD-Vi devices in the system. */ +static int +ivhd_count_iter(ACPI_IVRS_HEADER * ivrs_he, void *arg) +{ + + if (ivrs_is_ivhd(ivrs_he->Type)) + ivhd_count++; + + return (1); +} + +struct find_ivrs_hdr_args { + int i; + ACPI_IVRS_HEADER *ptr; +}; + +static int +ivrs_hdr_find_iter(ACPI_IVRS_HEADER * ivrs_hdr, void *args) +{ + struct find_ivrs_hdr_args *fi; + + fi = (struct find_ivrs_hdr_args *)args; + if (ivrs_is_ivhd(ivrs_hdr->Type)) { + if (fi->i == 0) { + fi->ptr = ivrs_hdr; + return (0); + } + fi->i--; + } + + return (1); +} + +static ACPI_IVRS_HARDWARE * +ivhd_find_by_index(int idx) +{ + struct find_ivrs_hdr_args fi; + + fi.i = idx; + fi.ptr = NULL; + + ivrs_hdr_iterate_tbl(ivrs_hdr_find_iter, &fi); + + return ((ACPI_IVRS_HARDWARE *)fi.ptr); +} + +static void +ivhd_dev_add_entry(struct amdvi_softc *softc, uint32_t start_id, + uint32_t end_id, uint8_t cfg, bool ats) +{ + struct ivhd_dev_cfg *dev_cfg; + + /* If device doesn't have special data, don't add it. */ + if (!cfg) + return; + + dev_cfg = &softc->dev_cfg[softc->dev_cfg_cnt++]; + dev_cfg->start_id = start_id; + dev_cfg->end_id = end_id; + dev_cfg->data = cfg; + dev_cfg->enable_ats = ats; +} + +/* + * Record device attributes as suggested by BIOS. 
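+ *
+ * A sketch of what the parser below does, for orientation (derived from the
+ * code itself rather than an authoritative restatement of the IVRS spec):
+ * an ALL entry establishes a default DataSetting, SELECT entries record a
+ * single device ID, and START/END pairs are recorded as a range once the
+ * END entry is seen. Entry types below 0x40 are 4 bytes wide and types
+ * below 0x80 are 8 bytes wide, so the walk advances roughly as
+ *
+ *	if (de->Type < 0x40)
+ *		p += sizeof (ACPI_IVRS_DEVICE4);
+ *	else if (de->Type < 0x80)
+ *		p += sizeof (ACPI_IVRS_DEVICE8A);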
+ */ +static int +ivhd_dev_parse(ACPI_IVRS_HARDWARE* ivhd, struct amdvi_softc *softc) +{ + ACPI_IVRS_DE_HEADER *de; + uint8_t *p, *end; + int range_start_id = 0, range_end_id = 0; + uint32_t *extended; + uint8_t all_data = 0, range_data = 0; + bool range_enable_ats = false, enable_ats; + + softc->start_dev_rid = ~0; + softc->end_dev_rid = 0; + + switch (ivhd->Header.Type) { + case IVRS_TYPE_HARDWARE_LEGACY: + p = (uint8_t *)ivhd + sizeof(ACPI_IVRS_HARDWARE); + break; + + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + p = (uint8_t *)ivhd + sizeof(ACPI_IVRS_HARDWARE_EFRSUP); + break; + + default: + device_printf(softc->dev, + "unknown type: 0x%x\n", ivhd->Header.Type); + return (-1); + } + + end = (uint8_t *)ivhd + ivhd->Header.Length; + + while (p < end) { + de = (ACPI_IVRS_DE_HEADER *)p; + softc->start_dev_rid = MIN(softc->start_dev_rid, de->Id); + softc->end_dev_rid = MAX(softc->end_dev_rid, de->Id); + switch (de->Type) { + case ACPI_IVRS_TYPE_ALL: + all_data = de->DataSetting; + break; + + case ACPI_IVRS_TYPE_SELECT: + case ACPI_IVRS_TYPE_ALIAS_SELECT: + case ACPI_IVRS_TYPE_EXT_SELECT: + enable_ats = false; + if (de->Type == ACPI_IVRS_TYPE_EXT_SELECT) { + extended = (uint32_t *)(de + 1); + enable_ats = + (*extended & IVHD_DEV_EXT_ATS_DISABLE) ? + false : true; + } + ivhd_dev_add_entry(softc, de->Id, de->Id, + de->DataSetting | all_data, enable_ats); + break; + + case ACPI_IVRS_TYPE_START: + case ACPI_IVRS_TYPE_ALIAS_START: + case ACPI_IVRS_TYPE_EXT_START: + range_start_id = de->Id; + range_data = de->DataSetting; + if (de->Type == ACPI_IVRS_TYPE_EXT_START) { + extended = (uint32_t *)(de + 1); + range_enable_ats = + (*extended & IVHD_DEV_EXT_ATS_DISABLE) ? + false : true; + } + break; + + case ACPI_IVRS_TYPE_END: + range_end_id = de->Id; + ivhd_dev_add_entry(softc, range_start_id, range_end_id, + range_data | all_data, range_enable_ats); + range_start_id = range_end_id = 0; + range_data = 0; + all_data = 0; + break; + + case ACPI_IVRS_TYPE_PAD4: + break; + + case ACPI_IVRS_TYPE_SPECIAL: + /* HPET or IOAPIC */ + break; + default: + if ((de->Type < 5) || + (de->Type >= ACPI_IVRS_TYPE_PAD8)) + device_printf(softc->dev, + "Unknown dev entry:0x%x\n", de->Type); + } + + if (softc->dev_cfg_cnt > + (sizeof(softc->dev_cfg) / sizeof(softc->dev_cfg[0]))) { + device_printf(softc->dev, + "WARN Too many device entries.\n"); + return (EINVAL); + } + if (de->Type < 0x40) + p += sizeof(ACPI_IVRS_DEVICE4); + else if (de->Type < 0x80) + p += sizeof(ACPI_IVRS_DEVICE8A); + else { + printf("Variable size IVHD type 0x%x not supported\n", + de->Type); + break; + } + } + + KASSERT((softc->end_dev_rid >= softc->start_dev_rid), + ("Device end[0x%x] < start[0x%x.\n", + softc->end_dev_rid, softc->start_dev_rid)); + + return (0); +} + +static bool +ivhd_is_newer(ACPI_IVRS_HEADER *old, ACPI_IVRS_HEADER *new) +{ + /* + * Newer IVRS header type take precedence. 
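+ * That is, if the same IOMMU (matching DeviceId) is described both by a
+ * legacy type 0x10 header and by a type 0x11 or 0x40 header, the caller is
+ * expected to keep the non-legacy header and drop the 0x10 one, since only
+ * the newer formats carry the Extended Feature Register.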
+ */ + if ((old->DeviceId == new->DeviceId) && + (old->Type == IVRS_TYPE_HARDWARE_LEGACY) && + ((new->Type == IVRS_TYPE_HARDWARE_EFR) || + (new->Type == IVRS_TYPE_HARDWARE_MIXED))) { + return (true); + } + + return (false); +} + +static void +ivhd_identify(driver_t *driver, device_t parent) +{ + ACPI_TABLE_IVRS *ivrs; + ACPI_IVRS_HARDWARE *ivhd; + ACPI_STATUS status; + int i, count = 0; + uint32_t ivrs_ivinfo; + + if (acpi_disabled("ivhd")) + return; + + status = AcpiGetTable(ACPI_SIG_IVRS, 1, (ACPI_TABLE_HEADER **)&ivrs); + if (ACPI_FAILURE(status)) + return; + + if (ivrs->Header.Length == 0) { + return; + } + + ivrs_ivinfo = ivrs->Info; + printf("AMD-Vi: IVRS Info VAsize = %d PAsize = %d GVAsize = %d" + " flags:%b\n", + REG_BITS(ivrs_ivinfo, 21, 15), REG_BITS(ivrs_ivinfo, 14, 8), + REG_BITS(ivrs_ivinfo, 7, 5), REG_BITS(ivrs_ivinfo, 22, 22), + "\020\001EFRSup"); + + ivrs_hdr_iterate_tbl(ivhd_count_iter, NULL); + if (!ivhd_count) + return; + + for (i = 0; i < ivhd_count; i++) { + ivhd = ivhd_find_by_index(i); + KASSERT(ivhd, ("ivhd%d is NULL\n", i)); + ivhd_hdrs[i] = ivhd; + } + + /* + * Scan for presence of legacy and non-legacy device type + * for same AMD-Vi device and override the old one. + */ + for (i = ivhd_count - 1 ; i > 0 ; i--){ + if (ivhd_is_newer(&ivhd_hdrs[i-1]->Header, + &ivhd_hdrs[i]->Header)) { + ivhd_hdrs[i-1] = ivhd_hdrs[i]; + ivhd_count--; + } + } + + ivhd_devs = malloc(sizeof(device_t) * ivhd_count, M_DEVBUF, + M_WAITOK | M_ZERO); + for (i = 0; i < ivhd_count; i++) { + ivhd = ivhd_hdrs[i]; + KASSERT(ivhd, ("ivhd%d is NULL\n", i)); + + /* + * Use a high order to ensure that this driver is probed after + * the Host-PCI bridge and the root PCI bus. + */ + ivhd_devs[i] = BUS_ADD_CHILD(parent, + ACPI_DEV_BASE_ORDER + 10 * 10, "ivhd", i); + + /* + * XXX: In case device was not destroyed before, add will fail. + * locate the old device instance. + */ + if (ivhd_devs[i] == NULL) { + ivhd_devs[i] = device_find_child(parent, "ivhd", i); + if (ivhd_devs[i] == NULL) { + printf("AMD-Vi: cant find ivhd%d\n", i); + break; + } + } + count++; + } + + /* + * Update device count in case failed to attach. + */ + ivhd_count = count; +} + +static int +ivhd_probe(device_t dev) +{ + ACPI_IVRS_HARDWARE *ivhd; + int unit; + + if (acpi_get_handle(dev) != NULL) + return (ENXIO); + + unit = device_get_unit(dev); + KASSERT((unit < ivhd_count), + ("ivhd unit %d > count %d", unit, ivhd_count)); + ivhd = ivhd_hdrs[unit]; + KASSERT(ivhd, ("ivhd is NULL")); + + switch (ivhd->Header.Type) { + case IVRS_TYPE_HARDWARE_EFR: + device_set_desc(dev, "AMD-Vi/IOMMU ivhd with EFR"); + break; + + case IVRS_TYPE_HARDWARE_MIXED: + device_set_desc(dev, "AMD-Vi/IOMMU ivhd in mixed format"); + break; + + case IVRS_TYPE_HARDWARE_LEGACY: + default: + device_set_desc(dev, "AMD-Vi/IOMMU ivhd"); + break; + } + + return (BUS_PROBE_NOWILDCARD); +} + +static void +ivhd_print_flag(device_t dev, enum IvrsType ivhd_type, uint8_t flag) +{ + /* + * IVHD lgeacy type has two extra high bits in flag which has + * been moved to EFR for non-legacy device. 
+ */ + switch (ivhd_type) { + case IVRS_TYPE_HARDWARE_LEGACY: + device_printf(dev, "Flag:%b\n", flag, + "\020" + "\001HtTunEn" + "\002PassPW" + "\003ResPassPW" + "\004Isoc" + "\005IotlbSup" + "\006Coherent" + "\007PreFSup" + "\008PPRSup"); + break; + + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + device_printf(dev, "Flag:%b\n", flag, + "\020" + "\001HtTunEn" + "\002PassPW" + "\003ResPassPW" + "\004Isoc" + "\005IotlbSup" + "\006Coherent"); + break; + + default: + device_printf(dev, "Can't decode flag of ivhd type :0x%x\n", + ivhd_type); + break; + } +} + +/* + * Feature in legacy IVHD type(0x10) and attribute in newer type(0x11 and 0x40). + */ +static void +ivhd_print_feature(device_t dev, enum IvrsType ivhd_type, uint32_t feature) +{ + switch (ivhd_type) { + case IVRS_TYPE_HARDWARE_LEGACY: + device_printf(dev, "Features(type:0x%x) HATS = %d GATS = %d" + " MsiNumPPR = %d PNBanks= %d PNCounters= %d\n", + ivhd_type, + REG_BITS(feature, 31, 30), + REG_BITS(feature, 29, 28), + REG_BITS(feature, 27, 23), + REG_BITS(feature, 22, 17), + REG_BITS(feature, 16, 13)); + device_printf(dev, "max PASID = %d GLXSup = %d Feature:%b\n", + REG_BITS(feature, 12, 8), + REG_BITS(feature, 4, 3), + feature, + "\020" + "\002NXSup" + "\003GTSup" + "\004<b4>" + "\005IASup" + "\006GASup" + "\007HESup"); + break; + + /* Fewer features or attributes are reported in non-legacy type. */ + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + device_printf(dev, "Features(type:0x%x) MsiNumPPR = %d" + " PNBanks= %d PNCounters= %d\n", + ivhd_type, + REG_BITS(feature, 27, 23), + REG_BITS(feature, 22, 17), + REG_BITS(feature, 16, 13)); + break; + + default: /* Other ivhd type features are not decoded. */ + device_printf(dev, "Can't decode ivhd type :0x%x\n", ivhd_type); + } +} + +/* Print extended features of IOMMU. */ +static void +ivhd_print_ext_feature(device_t dev, uint64_t ext_feature) +{ + uint32_t ext_low, ext_high; + + if (!ext_feature) + return; + + ext_low = ext_feature; + device_printf(dev, "Extended features[31:0]:%b " + "HATS = 0x%x GATS = 0x%x " + "GLXSup = 0x%x SmiFSup = 0x%x SmiFRC = 0x%x " + "GAMSup = 0x%x DualPortLogSup = 0x%x DualEventLogSup = 0x%x\n", + (int)ext_low, + "\020" + "\001PreFSup" + "\002PPRSup" + "\003<b2>" + "\004NXSup" + "\005GTSup" + "\006<b5>" + "\007IASup" + "\008GASup" + "\009HESup" + "\010PCSup", + REG_BITS(ext_low, 11, 10), + REG_BITS(ext_low, 13, 12), + REG_BITS(ext_low, 15, 14), + REG_BITS(ext_low, 17, 16), + REG_BITS(ext_low, 20, 18), + REG_BITS(ext_low, 23, 21), + REG_BITS(ext_low, 25, 24), + REG_BITS(ext_low, 29, 28)); + + ext_high = ext_feature >> 32; + device_printf(dev, "Extended features[62:32]:%b " + "Max PASID: 0x%x DevTblSegSup = 0x%x " + "MarcSup = 0x%x\n", + (int)(ext_high), + "\020" + "\006USSup" + "\009PprOvrflwEarlySup" + "\010PPRAutoRspSup" + "\013BlKStopMrkSup" + "\014PerfOptSup" + "\015MsiCapMmioSup" + "\017GIOSup" + "\018HASup" + "\019EPHSup" + "\020AttrFWSup" + "\021HDSup" + "\023InvIotlbSup", + REG_BITS(ext_high, 5, 0), + REG_BITS(ext_high, 8, 7), + REG_BITS(ext_high, 11, 10)); +} + +static int +ivhd_print_cap(struct amdvi_softc *softc, ACPI_IVRS_HARDWARE * ivhd) +{ + device_t dev; + int max_ptp_level; + + dev = softc->dev; + + ivhd_print_flag(dev, softc->ivhd_type, softc->ivhd_flag); + ivhd_print_feature(dev, softc->ivhd_type, softc->ivhd_feature); + ivhd_print_ext_feature(dev, softc->ext_feature); + max_ptp_level = 7; + /* Make sure device support minimum page level as requested by user. 
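+ * (max_ptp_level is hard-wired to 7 above rather than derived from the
+ * HATS field printed earlier; amdvi_ptp_level is the tunable declared
+ * extern near the top of this file, whose definition is not part of this
+ * hunk.)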
*/ + if (max_ptp_level < amdvi_ptp_level) { + device_printf(dev, "insufficient PTP level:%d\n", + max_ptp_level); + return (EINVAL); + } else { + device_printf(softc->dev, "supported paging level:%d, will use only: %d\n", + max_ptp_level, amdvi_ptp_level); + } + + device_printf(softc->dev, "device range: 0x%x - 0x%x\n", + softc->start_dev_rid, softc->end_dev_rid); + + return (0); +} + +static int +ivhd_attach(device_t dev) +{ + ACPI_IVRS_HARDWARE *ivhd; + ACPI_IVRS_HARDWARE_EFRSUP *ivhd_efr; + struct amdvi_softc *softc; + int status, unit; + + unit = device_get_unit(dev); + KASSERT((unit < ivhd_count), + ("ivhd unit %d > count %d", unit, ivhd_count)); + /* Make sure its same device for which attach is called. */ + KASSERT((ivhd_devs[unit] == dev), + ("Not same device old %p new %p", ivhd_devs[unit], dev)); + + softc = device_get_softc(dev); + softc->dev = dev; + ivhd = ivhd_hdrs[unit]; + KASSERT(ivhd, ("ivhd is NULL")); + + softc->ivhd_type = ivhd->Header.Type; + softc->pci_seg = ivhd->PciSegmentGroup; + softc->pci_rid = ivhd->Header.DeviceId; + softc->ivhd_flag = ivhd->Header.Flags; + /* + * On lgeacy IVHD type(0x10), it is documented as feature + * but in newer type it is attribute. + */ + softc->ivhd_feature = ivhd->Reserved; + /* + * PCI capability has more capabilities that are not part of IVRS. + */ + softc->cap_off = ivhd->CapabilityOffset; + +#ifdef notyet + /* IVHD Info bit[4:0] is event MSI/X number. */ + softc->event_msix = ivhd->Info & 0x1F; +#endif + switch (ivhd->Header.Type) { + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + ivhd_efr = (ACPI_IVRS_HARDWARE_EFRSUP *)ivhd; + softc->ext_feature = ivhd_efr->ExtFR; + break; + + } + + softc->ctrl = (struct amdvi_ctrl *) PHYS_TO_DMAP(ivhd->BaseAddress); + status = ivhd_dev_parse(ivhd, softc); + if (status != 0) { + device_printf(dev, + "endpoint device parsing error=%d\n", status); + } + + status = ivhd_print_cap(softc, ivhd); + if (status != 0) { + return (status); + } + + status = amdvi_setup_hw(softc); + if (status != 0) { + device_printf(dev, "couldn't be initialised, error=%d\n", + status); + return (status); + } + + return (0); +} + +static int +ivhd_detach(device_t dev) +{ + struct amdvi_softc *softc; + + softc = device_get_softc(dev); + + amdvi_teardown_hw(softc); + + /* + * XXX: delete the device. + * don't allow detach, return EBUSY. + */ + return (0); +} + +static int +ivhd_suspend(device_t dev) +{ + + return (0); +} + +static int +ivhd_resume(device_t dev) +{ + + return (0); +} + +static device_method_t ivhd_methods[] = { + DEVMETHOD(device_identify, ivhd_identify), + DEVMETHOD(device_probe, ivhd_probe), + DEVMETHOD(device_attach, ivhd_attach), + DEVMETHOD(device_detach, ivhd_detach), + DEVMETHOD(device_suspend, ivhd_suspend), + DEVMETHOD(device_resume, ivhd_resume), + DEVMETHOD_END +}; + +static driver_t ivhd_driver = { + "ivhd", + ivhd_methods, + sizeof(struct amdvi_softc), +}; + +static devclass_t ivhd_devclass; + +/* + * Load this module at the end after PCI re-probing to configure interrupt. + */ +DRIVER_MODULE_ORDERED(ivhd, acpi, ivhd_driver, ivhd_devclass, 0, 0, + SI_ORDER_ANY); +MODULE_DEPEND(ivhd, acpi, 1, 1, 1); +MODULE_DEPEND(ivhd, pci, 1, 1, 1); diff --git a/usr/src/uts/i86pc/io/vmm/amd/npt.c b/usr/src/uts/i86pc/io/vmm/amd/npt.c new file mode 100644 index 0000000000..e61464a964 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/npt.c @@ -0,0 +1,87 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_extern.h> + +#include "npt.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, npt, CTLFLAG_RW, NULL, NULL); + +static int npt_flags; +SYSCTL_INT(_hw_vmm_npt, OID_AUTO, pmap_flags, CTLFLAG_RD, + &npt_flags, 0, NULL); + +#define NPT_IPIMASK 0xFF + +/* + * AMD nested page table init. + */ +int +svm_npt_init(int ipinum) +{ + int enable_superpage = 1; + + npt_flags = ipinum & NPT_IPIMASK; + TUNABLE_INT_FETCH("hw.vmm.npt.enable_superpage", &enable_superpage); + if (enable_superpage) + npt_flags |= PMAP_PDE_SUPERPAGE; + + return (0); +} + +static int +npt_pinit(pmap_t pmap) +{ + + return (pmap_pinit_type(pmap, PT_RVI, npt_flags)); +} + +struct vmspace * +svm_npt_alloc(vm_offset_t min, vm_offset_t max) +{ + + return (vmspace_alloc(min, max, npt_pinit)); +} + +void +svm_npt_free(struct vmspace *vmspace) +{ + + vmspace_free(vmspace); +} diff --git a/usr/src/uts/i86pc/io/vmm/amd/npt.h b/usr/src/uts/i86pc/io/vmm/amd/npt.h new file mode 100644 index 0000000000..35530d7833 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/npt.h @@ -0,0 +1,38 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SVM_NPT_H_ +#define _SVM_NPT_H_ + +int svm_npt_init(int ipinum); +struct vmspace *svm_npt_alloc(vm_offset_t min, vm_offset_t max); +void svm_npt_free(struct vmspace *vmspace); + +#endif /* _SVM_NPT_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/offsets.in b/usr/src/uts/i86pc/io/vmm/amd/offsets.in new file mode 100644 index 0000000000..f8d2a716d7 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/offsets.in @@ -0,0 +1,36 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ +#include <sys/types.h> + +#include "amd/svm.h" + +svm_regctx + sctx_rbx SCTX_RBX + sctx_rcx SCTX_RCX + sctx_rbp SCTX_RBP + sctx_rdx SCTX_RDX + sctx_rdi SCTX_RDI + sctx_rsi SCTX_RSI + sctx_r8 SCTX_R8 + sctx_r9 SCTX_R9 + sctx_r10 SCTX_R10 + sctx_r11 SCTX_R11 + sctx_r12 SCTX_R12 + sctx_r13 SCTX_R13 + sctx_r14 SCTX_R14 + sctx_r15 SCTX_R15 + +/* Pull in definition for MSR_GSBASE */ +\#include <machine/specialreg.h> diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm.c b/usr/src/uts/i86pc/io/vmm/amd/svm.c new file mode 100644 index 0000000000..25dc3a63fa --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/svm.c @@ -0,0 +1,2446 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright 2018 Joyent, Inc. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/smp.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/pcpu.h> +#include <sys/proc.h> +#include <sys/sysctl.h> + +#ifndef __FreeBSD__ +#include <sys/x86_archext.h> +#include <sys/trap.h> +#endif + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/cpufunc.h> +#include <machine/psl.h> +#include <machine/md_var.h> +#include <machine/reg.h> +#include <machine/specialreg.h> +#include <machine/smp.h> +#include <machine/vmm.h> +#include <machine/vmm_dev.h> +#include <machine/vmm_instruction_emul.h> + +#include "vmm_lapic.h" +#include "vmm_stat.h" +#include "vmm_ktr.h" +#include "vmm_ioport.h" +#include "vatpic.h" +#include "vlapic.h" +#include "vlapic_priv.h" + +#include "x86.h" +#include "vmcb.h" +#include "svm.h" +#include "svm_softc.h" +#include "svm_msr.h" +#include "npt.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW, NULL, NULL); + +/* + * SVM CPUID function 0x8000_000A, edx bit decoding. + */ +#define AMD_CPUID_SVM_NP BIT(0) /* Nested paging or RVI */ +#define AMD_CPUID_SVM_LBR BIT(1) /* Last branch virtualization */ +#define AMD_CPUID_SVM_SVML BIT(2) /* SVM lock */ +#define AMD_CPUID_SVM_NRIP_SAVE BIT(3) /* Next RIP is saved */ +#define AMD_CPUID_SVM_TSC_RATE BIT(4) /* TSC rate control. */ +#define AMD_CPUID_SVM_VMCB_CLEAN BIT(5) /* VMCB state caching */ +#define AMD_CPUID_SVM_FLUSH_BY_ASID BIT(6) /* Flush by ASID */ +#define AMD_CPUID_SVM_DECODE_ASSIST BIT(7) /* Decode assist */ +#define AMD_CPUID_SVM_PAUSE_INC BIT(10) /* Pause intercept filter. */ +#define AMD_CPUID_SVM_PAUSE_FTH BIT(12) /* Pause filter threshold */ +#define AMD_CPUID_SVM_AVIC BIT(13) /* AVIC present */ + +#define VMCB_CACHE_DEFAULT (VMCB_CACHE_ASID | \ + VMCB_CACHE_IOPM | \ + VMCB_CACHE_I | \ + VMCB_CACHE_TPR | \ + VMCB_CACHE_CR2 | \ + VMCB_CACHE_CR | \ + VMCB_CACHE_DR | \ + VMCB_CACHE_DT | \ + VMCB_CACHE_SEG | \ + VMCB_CACHE_NP) + +static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT; +SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean, + 0, NULL); + +static MALLOC_DEFINE(M_SVM, "svm", "svm"); +static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic"); + +#ifdef __FreeBSD__ +/* Per-CPU context area. */ +extern struct pcpu __pcpu[]; +#endif + +static uint32_t svm_feature = ~0U; /* AMD SVM features. */ +SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RDTUN, &svm_feature, 0, + "SVM features advertised by CPUID.8000000AH:EDX"); + +static int disable_npf_assist; +SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN, + &disable_npf_assist, 0, NULL); + +#ifdef __FreeBSD__ +/* Maximum ASIDs supported by the processor */ +static uint32_t nasid; +SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RDTUN, &nasid, 0, + "Number of ASIDs supported by this processor"); + +/* Current ASID generation for each host cpu */ +static struct asid asid[MAXCPU]; + +/* + * SVM host state saved area of size 4KB for each core. 
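+ * The physical address of each per-CPU page is handed to the processor
+ * through MSR_VM_HSAVE_PA in svm_enable() below:
+ *
+ *	wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave[curcpu]));
+ *
+ * which is why the array is declared page-sized and page-aligned.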
+ */ +static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); +#endif /* __FreeBSD__ */ + +static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery"); +static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry"); +static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window"); + +static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val); + +static __inline int +flush_by_asid(void) +{ + + return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID); +} + +static __inline int +decode_assist(void) +{ + + return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST); +} + +#ifdef __FreeBSD__ +static void +svm_disable(void *arg __unused) +{ + uint64_t efer; + + efer = rdmsr(MSR_EFER); + efer &= ~EFER_SVM; + wrmsr(MSR_EFER, efer); +} + +/* + * Disable SVM on all CPUs. + */ +static int +svm_cleanup(void) +{ + + smp_rendezvous(NULL, svm_disable, NULL, NULL); + return (0); +} + +/* + * Verify that all the features required by bhyve are available. + */ +static int +check_svm_features(void) +{ + u_int regs[4]; + + /* CPUID Fn8000_000A is for SVM */ + do_cpuid(0x8000000A, regs); + svm_feature &= regs[3]; + + /* + * The number of ASIDs can be configured to be less than what is + * supported by the hardware but not more. + */ + if (nasid == 0 || nasid > regs[1]) + nasid = regs[1]; + KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid)); + + /* bhyve requires the Nested Paging feature */ + if (!(svm_feature & AMD_CPUID_SVM_NP)) { + printf("SVM: Nested Paging feature not available.\n"); + return (ENXIO); + } + + /* bhyve requires the NRIP Save feature */ + if (!(svm_feature & AMD_CPUID_SVM_NRIP_SAVE)) { + printf("SVM: NRIP Save feature not available.\n"); + return (ENXIO); + } + + return (0); +} + +static void +svm_enable(void *arg __unused) +{ + uint64_t efer; + + efer = rdmsr(MSR_EFER); + efer |= EFER_SVM; + wrmsr(MSR_EFER, efer); + + wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave[curcpu])); +} + +/* + * Return 1 if SVM is enabled on this processor and 0 otherwise. + */ +static int +svm_available(void) +{ + uint64_t msr; + +#ifdef __FreeBSD__ + /* Section 15.4 Enabling SVM from APM2. */ + if ((amd_feature2 & AMDID2_SVM) == 0) { + printf("SVM: not available.\n"); + return (0); + } +#else + if (!is_x86_feature(x86_featureset, X86FSET_SVM)) { + cmn_err(CE_WARN, "processor does not support SVM operation\n"); + return (0); + } +#endif + + msr = rdmsr(MSR_VM_CR); + if ((msr & VM_CR_SVMDIS) != 0) { +#ifdef __FreeBSD__ + printf("SVM: disabled by BIOS.\n"); +#else + cmn_err(CE_WARN, "SVM disabled by BIOS.\n"); +#endif + return (0); + } + + return (1); +} + +static int +svm_init(int ipinum) +{ + int error, cpu; + + if (!svm_available()) + return (ENXIO); + + error = check_svm_features(); + if (error) + return (error); + + vmcb_clean &= VMCB_CACHE_DEFAULT; + + for (cpu = 0; cpu < MAXCPU; cpu++) { + /* + * Initialize the host ASIDs to their "highest" valid values. + * + * The next ASID allocation will rollover both 'gen' and 'num' + * and start off the sequence at {1,1}. 
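+ *
+ * For illustration only (the allocator itself is not part of this hunk,
+ * so the exact code may differ): the rollover is expected to behave like
+ *
+ *	if (++asid[cpu].num >= nasid) {
+ *		asid[cpu].num = 1;
+ *		if (++asid[cpu].gen == 0)
+ *			asid[cpu].gen = 1;
+ *	}
+ *
+ * so the first allocation after this init lands on { gen = 1, num = 1 },
+ * keeping ASID 0 (the host's) and generation 0 reserved.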
+ */ + asid[cpu].gen = ~0UL; + asid[cpu].num = nasid - 1; + } + + svm_msr_init(); + svm_npt_init(ipinum); + + /* Enable SVM on all CPUs */ + smp_rendezvous(NULL, svm_enable, NULL, NULL); + + return (0); +} + +static void +svm_restore(void) +{ + + svm_enable(NULL); +} +#else /* __FreeBSD__ */ +static int +svm_cleanup(void) +{ + /* This is taken care of by the hma registration */ + return (0); +} + +static int +svm_init(int ipinum) +{ + vmcb_clean &= VMCB_CACHE_DEFAULT; + + svm_msr_init(); + svm_npt_init(ipinum); + + return (0); +} + +static void +svm_restore(void) +{ + /* No-op on illumos */ +} +#endif /* __FreeBSD__ */ + +/* Pentium compatible MSRs */ +#define MSR_PENTIUM_START 0 +#define MSR_PENTIUM_END 0x1FFF +/* AMD 6th generation and Intel compatible MSRs */ +#define MSR_AMD6TH_START 0xC0000000UL +#define MSR_AMD6TH_END 0xC0001FFFUL +/* AMD 7th and 8th generation compatible MSRs */ +#define MSR_AMD7TH_START 0xC0010000UL +#define MSR_AMD7TH_END 0xC0011FFFUL + +/* + * Get the index and bit position for a MSR in permission bitmap. + * Two bits are used for each MSR: lower bit for read and higher bit for write. + */ +static int +svm_msr_index(uint64_t msr, int *index, int *bit) +{ + uint32_t base, off; + + *index = -1; + *bit = (msr % 4) * 2; + base = 0; + + if (msr <= MSR_PENTIUM_END) { + *index = msr / 4; + return (0); + } + + base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1); + if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) { + off = (msr - MSR_AMD6TH_START); + *index = (off + base) / 4; + return (0); + } + + base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1); + if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) { + off = (msr - MSR_AMD7TH_START); + *index = (off + base) / 4; + return (0); + } + + return (EINVAL); +} + +/* + * Allow vcpu to read or write the 'msr' without trapping into the hypervisor. + */ +static void +svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write) +{ + int index, bit, error; + + error = svm_msr_index(msr, &index, &bit); + KASSERT(error == 0, ("%s: invalid msr %#lx", __func__, msr)); + KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE, + ("%s: invalid index %d for msr %#lx", __func__, index, msr)); + KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d " + "msr %#lx", __func__, bit, msr)); + + if (read) + perm_bitmap[index] &= ~(1UL << bit); + + if (write) + perm_bitmap[index] &= ~(2UL << bit); +} + +static void +svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr) +{ + + svm_msr_perm(perm_bitmap, msr, true, true); +} + +static void +svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr) +{ + + svm_msr_perm(perm_bitmap, msr, true, false); +} + +static __inline int +svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask) +{ + struct vmcb_ctrl *ctrl; + + KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + return (ctrl->intercept[idx] & bitmask ? 
1 : 0); +} + +static __inline void +svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask, + int enabled) +{ + struct vmcb_ctrl *ctrl; + uint32_t oldval; + + KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + oldval = ctrl->intercept[idx]; + + if (enabled) + ctrl->intercept[idx] |= bitmask; + else + ctrl->intercept[idx] &= ~bitmask; + + if (ctrl->intercept[idx] != oldval) { + svm_set_dirty(sc, vcpu, VMCB_CACHE_I); + VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified " + "from %#x to %#x", idx, oldval, ctrl->intercept[idx]); + } +} + +static __inline void +svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) +{ + + svm_set_intercept(sc, vcpu, off, bitmask, 0); +} + +static __inline void +svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) +{ + + svm_set_intercept(sc, vcpu, off, bitmask, 1); +} + +static void +vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa, + uint64_t msrpm_base_pa, uint64_t np_pml4) +{ + struct vmcb_ctrl *ctrl; + struct vmcb_state *state; + uint32_t mask; + int n; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + state = svm_get_vmcb_state(sc, vcpu); + + ctrl->iopm_base_pa = iopm_base_pa; + ctrl->msrpm_base_pa = msrpm_base_pa; + + /* Enable nested paging */ + ctrl->np_enable = 1; + ctrl->n_cr3 = np_pml4; + + /* + * Intercept accesses to the control registers that are not shadowed + * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8. + */ + for (n = 0; n < 16; n++) { + mask = (BIT(n) << 16) | BIT(n); + if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8) + svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); + else + svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); + } + + + /* + * Intercept everything when tracing guest exceptions otherwise + * just intercept machine check exception. + */ + if (vcpu_trace_exceptions(sc->vm, vcpu)) { + for (n = 0; n < 32; n++) { + /* + * Skip unimplemented vectors in the exception bitmap. + */ + if (n == 2 || n == 9) { + continue; + } + svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n)); + } + } else { + svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC)); + } + + /* Intercept various events (for e.g. I/O, MSR and CPUID accesses) */ + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_FERR_FREEZE); + + svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR); + svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT); + + /* + * From section "Canonicalization and Consistency Checks" in APMv2 + * the VMRUN intercept bit must be set to pass the consistency check. + */ + svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN); + + /* + * The ASID will be set to a non-zero value just before VMRUN. 
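+ * ASID 0 belongs to the host, so the zero stored here is only a
+ * placeholder; the run path is expected to install a real guest ASID
+ * (and mark the VMCB_CACHE_ASID state dirty) before the first VMRUN.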
+ */ + ctrl->asid = 0; + + /* + * Section 15.21.1, Interrupt Masking in EFLAGS + * Section 15.21.2, Virtualizing APIC.TPR + * + * This must be set for %rflag and %cr8 isolation of guest and host. + */ + ctrl->v_intr_masking = 1; + + /* Enable Last Branch Record aka LBR for debugging */ + ctrl->lbr_virt_en = 1; + state->dbgctl = BIT(0); + + /* EFER_SVM must always be set when the guest is executing */ + state->efer = EFER_SVM; + + /* Set up the PAT to power-on state */ + state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK) | + PAT_VALUE(1, PAT_WRITE_THROUGH) | + PAT_VALUE(2, PAT_UNCACHED) | + PAT_VALUE(3, PAT_UNCACHEABLE) | + PAT_VALUE(4, PAT_WRITE_BACK) | + PAT_VALUE(5, PAT_WRITE_THROUGH) | + PAT_VALUE(6, PAT_UNCACHED) | + PAT_VALUE(7, PAT_UNCACHEABLE); + + /* Set up DR6/7 to power-on state */ + state->dr6 = DBREG_DR6_RESERVED1; + state->dr7 = DBREG_DR7_RESERVED1; +} + +/* + * Initialize a virtual machine. + */ +static void * +svm_vminit(struct vm *vm, pmap_t pmap) +{ + struct svm_softc *svm_sc; + struct svm_vcpu *vcpu; + vm_paddr_t msrpm_pa, iopm_pa, pml4_pa; + int i; + uint16_t maxcpus; + + svm_sc = malloc(sizeof (*svm_sc), M_SVM, M_WAITOK | M_ZERO); + if (((uintptr_t)svm_sc & PAGE_MASK) != 0) + panic("malloc of svm_softc not aligned on page boundary"); + + svm_sc->msr_bitmap = contigmalloc(SVM_MSR_BITMAP_SIZE, M_SVM, + M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0); + if (svm_sc->msr_bitmap == NULL) + panic("contigmalloc of SVM MSR bitmap failed"); + svm_sc->iopm_bitmap = contigmalloc(SVM_IO_BITMAP_SIZE, M_SVM, + M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0); + if (svm_sc->iopm_bitmap == NULL) + panic("contigmalloc of SVM IO bitmap failed"); + + svm_sc->vm = vm; + svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4); + + /* + * Intercept read and write accesses to all MSRs. + */ + memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE); + + /* + * Access to the following MSRs is redirected to the VMCB when the + * guest is executing. Therefore it is safe to allow the guest to + * read/write these MSRs directly without hypervisor involvement. + */ + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE); + + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT); + + svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC); + + /* + * Intercept writes to make sure that the EFER_SVM bit is not cleared. + */ + svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER); + + /* Intercept access to all I/O ports. */ + memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE); + + iopm_pa = vtophys(svm_sc->iopm_bitmap); + msrpm_pa = vtophys(svm_sc->msr_bitmap); + pml4_pa = svm_sc->nptp; + maxcpus = vm_get_maxcpus(svm_sc->vm); + for (i = 0; i < maxcpus; i++) { + vcpu = svm_get_vcpu(svm_sc, i); + vcpu->nextrip = ~0; + vcpu->lastcpu = NOCPU; + vcpu->vmcb_pa = vtophys(&vcpu->vmcb); + vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa); + svm_msr_guest_init(svm_sc, i); + } + return (svm_sc); +} + +/* + * Collateral for a generic SVM VM-exit. 
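+ * This just packages the raw exit code and the two EXITINFO values so an
+ * intercept can be punted to userspace; for example, svm_write_efer()
+ * below reports a guest attempt to set EFER.LMSLE as
+ * vm_exit_svm(vme, VMCB_EXIT_MSR, 1, 0).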
+ */ +static void +vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2) +{ + + vme->exitcode = VM_EXITCODE_SVM; + vme->u.svm.exitcode = code; + vme->u.svm.exitinfo1 = info1; + vme->u.svm.exitinfo2 = info2; +} + +static int +svm_cpl(struct vmcb_state *state) +{ + + /* + * From APMv2: + * "Retrieve the CPL from the CPL field in the VMCB, not + * from any segment DPL" + */ + return (state->cpl); +} + +static enum vm_cpu_mode +svm_vcpu_mode(struct vmcb *vmcb) +{ + struct vmcb_segment seg; + struct vmcb_state *state; + int error; + + state = &vmcb->state; + + if (state->efer & EFER_LMA) { + error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); + KASSERT(error == 0, ("%s: vmcb_seg(cs) error %d", __func__, + error)); + + /* + * Section 4.8.1 for APM2, check if Code Segment has + * Long attribute set in descriptor. + */ + if (seg.attrib & VMCB_CS_ATTRIB_L) + return (CPU_MODE_64BIT); + else + return (CPU_MODE_COMPATIBILITY); + } else if (state->cr0 & CR0_PE) { + return (CPU_MODE_PROTECTED); + } else { + return (CPU_MODE_REAL); + } +} + +static enum vm_paging_mode +svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer) +{ + + if ((cr0 & CR0_PG) == 0) + return (PAGING_MODE_FLAT); + if ((cr4 & CR4_PAE) == 0) + return (PAGING_MODE_32); + if (efer & EFER_LME) + return (PAGING_MODE_64); + else + return (PAGING_MODE_PAE); +} + +/* + * ins/outs utility routines + */ +static uint64_t +svm_inout_str_index(struct svm_regctx *regs, int in) +{ + uint64_t val; + + val = in ? regs->sctx_rdi : regs->sctx_rsi; + + return (val); +} + +static uint64_t +svm_inout_str_count(struct svm_regctx *regs, int rep) +{ + uint64_t val; + + val = rep ? regs->sctx_rcx : 1; + + return (val); +} + +static void +svm_inout_str_seginfo(struct svm_softc *svm_sc, int vcpu, int64_t info1, + int in, struct vm_inout_str *vis) +{ + int error, s; + + if (in) { + vis->seg_name = VM_REG_GUEST_ES; + } else { + /* The segment field has standard encoding */ + s = (info1 >> 10) & 0x7; + vis->seg_name = vm_segment_name(s); + } + + error = vmcb_getdesc(svm_sc, vcpu, vis->seg_name, &vis->seg_desc); + KASSERT(error == 0, ("%s: svm_getdesc error %d", __func__, error)); +} + +static int +svm_inout_str_addrsize(uint64_t info1) +{ + uint32_t size; + + size = (info1 >> 7) & 0x7; + switch (size) { + case 1: + return (2); /* 16 bit */ + case 2: + return (4); /* 32 bit */ + case 4: + return (8); /* 64 bit */ + default: + panic("%s: invalid size encoding %d", __func__, size); + } +} + +static void +svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging) +{ + struct vmcb_state *state; + + state = &vmcb->state; + paging->cr3 = state->cr3; + paging->cpl = svm_cpl(state); + paging->cpu_mode = svm_vcpu_mode(vmcb); + paging->paging_mode = svm_paging_mode(state->cr0, state->cr4, + state->efer); +} + +#define UNHANDLED 0 + +/* + * Handle guest I/O intercept. + */ +static int +svm_handle_io(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) +{ + struct vmcb_ctrl *ctrl; + struct vmcb_state *state; + struct svm_regctx *regs; + struct vm_inout_str *vis; + uint64_t info1; + int inout_string; + + state = svm_get_vmcb_state(svm_sc, vcpu); + ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); + regs = svm_get_guest_regctx(svm_sc, vcpu); + + info1 = ctrl->exitinfo1; + inout_string = info1 & BIT(2) ? 1 : 0; + + /* + * The effective segment number in EXITINFO1[12:10] is populated + * only if the processor has the DecodeAssist capability. + * + * XXX this is not specified explicitly in APMv2 but can be verified + * empirically. 
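+ *
+ * For reference, the decode performed below treats EXITINFO1 for an IOIO
+ * intercept roughly as: bit 0 = direction (IN), bit 2 = string, bit 3 =
+ * REP prefix, bits[6:4] = operand size in bytes, bits[31:16] = port
+ * number. A plain one-byte "in" from port 0x71, for instance, would show
+ * up as exitinfo1 == 0x00710011.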
+ */ + if (inout_string && !decode_assist()) + return (UNHANDLED); + + vmexit->exitcode = VM_EXITCODE_INOUT; + vmexit->u.inout.in = (info1 & BIT(0)) ? 1 : 0; + vmexit->u.inout.string = inout_string; + vmexit->u.inout.rep = (info1 & BIT(3)) ? 1 : 0; + vmexit->u.inout.bytes = (info1 >> 4) & 0x7; + vmexit->u.inout.port = (uint16_t)(info1 >> 16); + vmexit->u.inout.eax = (uint32_t)(state->rax); + + if (inout_string) { + vmexit->exitcode = VM_EXITCODE_INOUT_STR; + vis = &vmexit->u.inout_str; + svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &vis->paging); + vis->rflags = state->rflags; + vis->cr0 = state->cr0; + vis->index = svm_inout_str_index(regs, vmexit->u.inout.in); + vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep); + vis->addrsize = svm_inout_str_addrsize(info1); + svm_inout_str_seginfo(svm_sc, vcpu, info1, + vmexit->u.inout.in, vis); + } + + return (UNHANDLED); +} + +static int +npf_fault_type(uint64_t exitinfo1) +{ + + if (exitinfo1 & VMCB_NPF_INFO1_W) + return (VM_PROT_WRITE); + else if (exitinfo1 & VMCB_NPF_INFO1_ID) + return (VM_PROT_EXECUTE); + else + return (VM_PROT_READ); +} + +static bool +svm_npf_emul_fault(uint64_t exitinfo1) +{ + + if (exitinfo1 & VMCB_NPF_INFO1_ID) { + return (false); + } + + if (exitinfo1 & VMCB_NPF_INFO1_GPT) { + return (false); + } + + if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) { + return (false); + } + + return (true); +} + +static void +svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit) +{ + struct vm_guest_paging *paging; + struct vmcb_segment seg; + struct vmcb_ctrl *ctrl; + char *inst_bytes; + int error, inst_len; + + ctrl = &vmcb->ctrl; + paging = &vmexit->u.inst_emul.paging; + + vmexit->exitcode = VM_EXITCODE_INST_EMUL; + vmexit->u.inst_emul.gpa = gpa; + vmexit->u.inst_emul.gla = VIE_INVALID_GLA; + svm_paging_info(vmcb, paging); + + error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); + KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error)); + + switch(paging->cpu_mode) { + case CPU_MODE_REAL: + vmexit->u.inst_emul.cs_base = seg.base; + vmexit->u.inst_emul.cs_d = 0; + break; + case CPU_MODE_PROTECTED: + case CPU_MODE_COMPATIBILITY: + vmexit->u.inst_emul.cs_base = seg.base; + + /* + * Section 4.8.1 of APM2, Default Operand Size or D bit. + */ + vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ? + 1 : 0; + break; + default: + vmexit->u.inst_emul.cs_base = 0; + vmexit->u.inst_emul.cs_d = 0; + break; + } + + /* + * Copy the instruction bytes into 'vie' if available. + */ + if (decode_assist() && !disable_npf_assist) { + inst_len = ctrl->inst_len; + inst_bytes = (char *)ctrl->inst_bytes; + } else { + inst_len = 0; + inst_bytes = NULL; + } + vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len); +} + +#ifdef KTR +static const char * +intrtype_to_str(int intr_type) +{ + switch (intr_type) { + case VMCB_EVENTINJ_TYPE_INTR: + return ("hwintr"); + case VMCB_EVENTINJ_TYPE_NMI: + return ("nmi"); + case VMCB_EVENTINJ_TYPE_INTn: + return ("swintr"); + case VMCB_EVENTINJ_TYPE_EXCEPTION: + return ("exception"); + default: + panic("%s: unknown intr_type %d", __func__, intr_type); + } +} +#endif + +/* + * Inject an event to vcpu as described in section 15.20, "Event injection". 
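+ *
+ * The EVENTINJ value assembled below follows the APMv2 layout: bits[7:0]
+ * vector, bits[10:8] type, an error-code-valid bit, a valid bit and the
+ * error code in bits[63:32]. Injecting #GP(0), for example, amounts to
+ *
+ *	eventinj = IDT_GP | (VMCB_EVENTINJ_TYPE_EXCEPTION << 8) |
+ *	    VMCB_EVENTINJ_EC_VALID | VMCB_EVENTINJ_VALID;
+ *
+ * with the zero error code in the upper 32 bits.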
+ */ +static void +svm_eventinject(struct svm_softc *sc, int vcpu, int intr_type, int vector, + uint32_t error, bool ec_valid) +{ + struct vmcb_ctrl *ctrl; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + + KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, + ("%s: event already pending %#lx", __func__, ctrl->eventinj)); + + KASSERT(vector >=0 && vector <= 255, ("%s: invalid vector %d", + __func__, vector)); + + switch (intr_type) { + case VMCB_EVENTINJ_TYPE_INTR: + case VMCB_EVENTINJ_TYPE_NMI: + case VMCB_EVENTINJ_TYPE_INTn: + break; + case VMCB_EVENTINJ_TYPE_EXCEPTION: + if (vector >= 0 && vector <= 31 && vector != 2) + break; + /* FALLTHROUGH */ + default: + panic("%s: invalid intr_type/vector: %d/%d", __func__, + intr_type, vector); + } + ctrl->eventinj = vector | (intr_type << 8) | VMCB_EVENTINJ_VALID; + if (ec_valid) { + ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID; + ctrl->eventinj |= (uint64_t)error << 32; + VCPU_CTR3(sc->vm, vcpu, "Injecting %s at vector %d errcode %#x", + intrtype_to_str(intr_type), vector, error); + } else { + VCPU_CTR2(sc->vm, vcpu, "Injecting %s at vector %d", + intrtype_to_str(intr_type), vector); + } +} + +static void +svm_update_virqinfo(struct svm_softc *sc, int vcpu) +{ + struct vm *vm; + struct vlapic *vlapic; + struct vmcb_ctrl *ctrl; + + vm = sc->vm; + vlapic = vm_lapic(vm, vcpu); + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + + /* Update %cr8 in the emulated vlapic */ + vlapic_set_cr8(vlapic, ctrl->v_tpr); + + /* Virtual interrupt injection is not used. */ + KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid " + "v_intr_vector %d", __func__, ctrl->v_intr_vector)); +} + +static void +svm_save_intinfo(struct svm_softc *svm_sc, int vcpu) +{ + struct vmcb_ctrl *ctrl; + uint64_t intinfo; + + ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); + intinfo = ctrl->exitintinfo; + if (!VMCB_EXITINTINFO_VALID(intinfo)) + return; + + /* + * From APMv2, Section "Intercepts during IDT interrupt delivery" + * + * If a #VMEXIT happened during event delivery then record the event + * that was being delivered. 
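+ * A typical case: an interrupt is being delivered through the guest IDT
+ * and the walk takes a nested page fault. EXITINTINFO then carries the
+ * original event, and passing it to vm_exit_intinfo() below arranges for
+ * it to be re-injected on a later entry so it is not lost.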
+ */ + VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n", + intinfo, VMCB_EXITINTINFO_VECTOR(intinfo)); + vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1); + vm_exit_intinfo(svm_sc->vm, vcpu, intinfo); +} + +static __inline int +vintr_intercept_enabled(struct svm_softc *sc, int vcpu) +{ + + return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_VINTR)); +} + +static __inline void +enable_intr_window_exiting(struct svm_softc *sc, int vcpu) +{ + struct vmcb_ctrl *ctrl; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + + if (ctrl->v_irq && ctrl->v_intr_vector == 0) { + KASSERT(ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__)); + KASSERT(vintr_intercept_enabled(sc, vcpu), + ("%s: vintr intercept should be enabled", __func__)); + return; + } + + VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting"); + ctrl->v_irq = 1; + ctrl->v_ign_tpr = 1; + ctrl->v_intr_vector = 0; + svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); +} + +static __inline void +disable_intr_window_exiting(struct svm_softc *sc, int vcpu) +{ + struct vmcb_ctrl *ctrl; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + + if (!ctrl->v_irq && ctrl->v_intr_vector == 0) { + KASSERT(!vintr_intercept_enabled(sc, vcpu), + ("%s: vintr intercept should be disabled", __func__)); + return; + } + + VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting"); + ctrl->v_irq = 0; + ctrl->v_intr_vector = 0; + svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); + svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); +} + +static int +svm_modify_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t val) +{ + struct vmcb_ctrl *ctrl; + int oldval, newval; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + oldval = ctrl->intr_shadow; + newval = val ? 1 : 0; + if (newval != oldval) { + ctrl->intr_shadow = newval; + VCPU_CTR1(sc->vm, vcpu, "Setting intr_shadow to %d", newval); + } + return (0); +} + +static int +svm_get_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t *val) +{ + struct vmcb_ctrl *ctrl; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + *val = ctrl->intr_shadow; + return (0); +} + +/* + * Once an NMI is injected it blocks delivery of further NMIs until the handler + * executes an IRET. The IRET intercept is enabled when an NMI is injected to + * to track when the vcpu is done handling the NMI. + */ +static int +nmi_blocked(struct svm_softc *sc, int vcpu) +{ + int blocked; + + blocked = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_IRET); + return (blocked); +} + +static void +enable_nmi_blocking(struct svm_softc *sc, int vcpu) +{ + + KASSERT(!nmi_blocked(sc, vcpu), ("vNMI already blocked")); + VCPU_CTR0(sc->vm, vcpu, "vNMI blocking enabled"); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); +} + +static void +clear_nmi_blocking(struct svm_softc *sc, int vcpu) +{ + int error; + + KASSERT(nmi_blocked(sc, vcpu), ("vNMI already unblocked")); + VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared"); + /* + * When the IRET intercept is cleared the vcpu will attempt to execute + * the "iret" when it runs next. However, it is possible to inject + * another NMI into the vcpu before the "iret" has actually executed. + * + * For e.g. if the "iret" encounters a #NPF when accessing the stack + * it will trap back into the hypervisor. If an NMI is pending for + * the vcpu it will be injected into the guest. 
+ * + * XXX this needs to be fixed + */ + svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); + + /* + * Set 'intr_shadow' to prevent an NMI from being injected on the + * immediate VMRUN. + */ + error = svm_modify_intr_shadow(sc, vcpu, 1); + KASSERT(!error, ("%s: error %d setting intr_shadow", __func__, error)); +} + +#define EFER_MBZ_BITS 0xFFFFFFFFFFFF0200UL + +static int +svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval, bool *retu) +{ + struct vm_exit *vme; + struct vmcb_state *state; + uint64_t changed, lma, oldval; + int error; + + state = svm_get_vmcb_state(sc, vcpu); + + oldval = state->efer; + VCPU_CTR2(sc->vm, vcpu, "wrmsr(efer) %#lx/%#lx", oldval, newval); + + newval &= ~0xFE; /* clear the Read-As-Zero (RAZ) bits */ + changed = oldval ^ newval; + + if (newval & EFER_MBZ_BITS) + goto gpf; + + /* APMv2 Table 14-5 "Long-Mode Consistency Checks" */ + if (changed & EFER_LME) { + if (state->cr0 & CR0_PG) + goto gpf; + } + + /* EFER.LMA = EFER.LME & CR0.PG */ + if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0) + lma = EFER_LMA; + else + lma = 0; + + if ((newval & EFER_LMA) != lma) + goto gpf; + + if (newval & EFER_NXE) { + if (!vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE)) + goto gpf; + } + + /* + * XXX bhyve does not enforce segment limits in 64-bit mode. Until + * this is fixed flag guest attempt to set EFER_LMSLE as an error. + */ + if (newval & EFER_LMSLE) { + vme = vm_exitinfo(sc->vm, vcpu); + vm_exit_svm(vme, VMCB_EXIT_MSR, 1, 0); + *retu = true; + return (0); + } + + if (newval & EFER_FFXSR) { + if (!vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR)) + goto gpf; + } + + if (newval & EFER_TCE) { + if (!vm_cpuid_capability(sc->vm, vcpu, VCC_TCE)) + goto gpf; + } + + error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval); + KASSERT(error == 0, ("%s: error %d updating efer", __func__, error)); + return (0); +gpf: + vm_inject_gp(sc->vm, vcpu); + return (0); +} + +static int +emulate_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, + bool *retu) +{ + int error; + + if (lapic_msr(num)) + error = lapic_wrmsr(sc->vm, vcpu, num, val, retu); + else if (num == MSR_EFER) + error = svm_write_efer(sc, vcpu, val, retu); + else + error = svm_wrmsr(sc, vcpu, num, val, retu); + + return (error); +} + +static int +emulate_rdmsr(struct svm_softc *sc, int vcpu, u_int num, bool *retu) +{ + struct vmcb_state *state; + struct svm_regctx *ctx; + uint64_t result; + int error; + + if (lapic_msr(num)) + error = lapic_rdmsr(sc->vm, vcpu, num, &result, retu); + else + error = svm_rdmsr(sc, vcpu, num, &result, retu); + + if (error == 0) { + state = svm_get_vmcb_state(sc, vcpu); + ctx = svm_get_guest_regctx(sc, vcpu); + state->rax = result & 0xffffffff; + ctx->sctx_rdx = result >> 32; + } + + return (error); +} + +#ifdef KTR +static const char * +exit_reason_to_str(uint64_t reason) +{ + static char reasonbuf[32]; + + switch (reason) { + case VMCB_EXIT_INVALID: + return ("invalvmcb"); + case VMCB_EXIT_SHUTDOWN: + return ("shutdown"); + case VMCB_EXIT_NPF: + return ("nptfault"); + case VMCB_EXIT_PAUSE: + return ("pause"); + case VMCB_EXIT_HLT: + return ("hlt"); + case VMCB_EXIT_CPUID: + return ("cpuid"); + case VMCB_EXIT_IO: + return ("inout"); + case VMCB_EXIT_MC: + return ("mchk"); + case VMCB_EXIT_INTR: + return ("extintr"); + case VMCB_EXIT_NMI: + return ("nmi"); + case VMCB_EXIT_VINTR: + return ("vintr"); + case VMCB_EXIT_MSR: + return ("msr"); + case VMCB_EXIT_IRET: + return ("iret"); + case VMCB_EXIT_MONITOR: + return ("monitor"); + case 
VMCB_EXIT_MWAIT: + return ("mwait"); + default: + snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason); + return (reasonbuf); + } +} +#endif /* KTR */ + +/* + * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs + * that are due to instruction intercepts as well as MSR and IOIO intercepts + * and exceptions caused by INT3, INTO and BOUND instructions. + * + * Return 1 if the nRIP is valid and 0 otherwise. + */ +static int +nrip_valid(uint64_t exitcode) +{ + switch (exitcode) { + case 0x00 ... 0x0F: /* read of CR0 through CR15 */ + case 0x10 ... 0x1F: /* write of CR0 through CR15 */ + case 0x20 ... 0x2F: /* read of DR0 through DR15 */ + case 0x30 ... 0x3F: /* write of DR0 through DR15 */ + case 0x43: /* INT3 */ + case 0x44: /* INTO */ + case 0x45: /* BOUND */ + case 0x65 ... 0x7C: /* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */ + case 0x80 ... 0x8D: /* VMEXIT_VMRUN ... VMEXIT_XSETBV */ + return (1); + default: + return (0); + } +} + +static int +svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) +{ + struct vmcb *vmcb; + struct vmcb_state *state; + struct vmcb_ctrl *ctrl; + struct svm_regctx *ctx; + uint64_t code, info1, info2, val; + uint32_t eax, ecx, edx; +#ifdef __FreeBSD__ + int error, errcode_valid, handled, idtvec, reflect; +#else + int error, errcode_valid = 0, handled, idtvec, reflect; +#endif + bool retu; + + ctx = svm_get_guest_regctx(svm_sc, vcpu); + vmcb = svm_get_vmcb(svm_sc, vcpu); + state = &vmcb->state; + ctrl = &vmcb->ctrl; + + handled = 0; + code = ctrl->exitcode; + info1 = ctrl->exitinfo1; + info2 = ctrl->exitinfo2; + + vmexit->exitcode = VM_EXITCODE_BOGUS; + vmexit->rip = state->rip; + vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0; + + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1); + + /* + * #VMEXIT(INVALID) needs to be handled early because the VMCB is + * in an inconsistent state and can trigger assertions that would + * never happen otherwise. + */ + if (code == VMCB_EXIT_INVALID) { + vm_exit_svm(vmexit, code, info1, info2); + return (0); + } + + KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event " + "injection valid bit is set %#lx", __func__, ctrl->eventinj)); + + KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15, + ("invalid inst_length %d: code (%#lx), info1 (%#lx), info2 (%#lx)", + vmexit->inst_length, code, info1, info2)); + + svm_update_virqinfo(svm_sc, vcpu); + svm_save_intinfo(svm_sc, vcpu); + + switch (code) { + case VMCB_EXIT_IRET: + /* + * Restart execution at "iret" but with the intercept cleared. + */ + vmexit->inst_length = 0; + clear_nmi_blocking(svm_sc, vcpu); + handled = 1; + break; + case VMCB_EXIT_VINTR: /* interrupt window exiting */ + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1); + handled = 1; + break; + case VMCB_EXIT_INTR: /* external interrupt */ + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1); + handled = 1; + break; + case VMCB_EXIT_NMI: /* external NMI */ + handled = 1; + break; + case 0x40 ... 0x5F: + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1); + reflect = 1; + idtvec = code - 0x40; + switch (idtvec) { + case IDT_MC: + /* + * Call the machine check handler by hand. Also don't + * reflect the machine check back into the guest. 
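For exits where nrip_valid() reports that the CPU latched a next-RIP, the intercepted instruction length is simply the delta to the saved %rip, which is how svm_vmexit() initializes vmexit->inst_length above. A one-line helper makes the relationship explicit (illustrative only):

/* Sketch: instruction length for exits that latch a valid next-RIP. */
static int
intercepted_inst_len(const struct vmcb_ctrl *ctrl, const struct vmcb_state *state)
{
	if (!nrip_valid(ctrl->exitcode))
		return (0);
	return ((int)(ctrl->nrip - state->rip));	/* 1..15 bytes */
}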
+ */ + reflect = 0; + VCPU_CTR0(svm_sc->vm, vcpu, "Vectoring to MCE handler"); +#ifdef __FreeBSD__ + __asm __volatile("int $18"); +#else + vmm_call_trap(T_MCE); +#endif + break; + case IDT_PF: + error = svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2, + info2); + KASSERT(error == 0, ("%s: error %d updating cr2", + __func__, error)); + /* fallthru */ + case IDT_NP: + case IDT_SS: + case IDT_GP: + case IDT_AC: + case IDT_TS: + errcode_valid = 1; + break; + + case IDT_DF: + errcode_valid = 1; + info1 = 0; + break; + + case IDT_BP: + case IDT_OF: + case IDT_BR: + /* + * The 'nrip' field is populated for INT3, INTO and + * BOUND exceptions and this also implies that + * 'inst_length' is non-zero. + * + * Reset 'inst_length' to zero so the guest %rip at + * event injection is identical to what it was when + * the exception originally happened. + */ + VCPU_CTR2(svm_sc->vm, vcpu, "Reset inst_length from %d " + "to zero before injecting exception %d", + vmexit->inst_length, idtvec); + vmexit->inst_length = 0; + /* fallthru */ + default: + errcode_valid = 0; + info1 = 0; + break; + } + KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) " + "when reflecting exception %d into guest", + vmexit->inst_length, idtvec)); + + if (reflect) { + /* Reflect the exception back into the guest */ + VCPU_CTR2(svm_sc->vm, vcpu, "Reflecting exception " + "%d/%#x into the guest", idtvec, (int)info1); + error = vm_inject_exception(svm_sc->vm, vcpu, idtvec, + errcode_valid, info1, 0); + KASSERT(error == 0, ("%s: vm_inject_exception error %d", + __func__, error)); + } + handled = 1; + break; + case VMCB_EXIT_MSR: /* MSR access. */ + eax = state->rax; + ecx = ctx->sctx_rcx; + edx = ctx->sctx_rdx; + retu = false; + + if (info1) { + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1); + val = (uint64_t)edx << 32 | eax; + VCPU_CTR2(svm_sc->vm, vcpu, "wrmsr %#x val %#lx", + ecx, val); + if (emulate_wrmsr(svm_sc, vcpu, ecx, val, &retu)) { + vmexit->exitcode = VM_EXITCODE_WRMSR; + vmexit->u.msr.code = ecx; + vmexit->u.msr.wval = val; + } else if (!retu) { + handled = 1; + } else { + KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, + ("emulate_wrmsr retu with bogus exitcode")); + } + } else { + VCPU_CTR1(svm_sc->vm, vcpu, "rdmsr %#x", ecx); + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1); + if (emulate_rdmsr(svm_sc, vcpu, ecx, &retu)) { + vmexit->exitcode = VM_EXITCODE_RDMSR; + vmexit->u.msr.code = ecx; + } else if (!retu) { + handled = 1; + } else { + KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, + ("emulate_rdmsr retu with bogus exitcode")); + } + } + break; + case VMCB_EXIT_IO: + handled = svm_handle_io(svm_sc, vcpu, vmexit); + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1); + break; + case VMCB_EXIT_CPUID: + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1); + handled = x86_emulate_cpuid(svm_sc->vm, vcpu, + (uint32_t *)&state->rax, + (uint32_t *)&ctx->sctx_rbx, + (uint32_t *)&ctx->sctx_rcx, + (uint32_t *)&ctx->sctx_rdx); + break; + case VMCB_EXIT_HLT: + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1); + vmexit->exitcode = VM_EXITCODE_HLT; + vmexit->u.hlt.rflags = state->rflags; + break; + case VMCB_EXIT_PAUSE: + vmexit->exitcode = VM_EXITCODE_PAUSE; + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1); + break; + case VMCB_EXIT_NPF: + /* EXITINFO2 contains the faulting guest physical address */ + if (info1 & VMCB_NPF_INFO1_RSV) { + VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with " + "reserved bits set: info1(%#lx) info2(%#lx)", + info1, info2); + } else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) { + vmexit->exitcode 
= VM_EXITCODE_PAGING; + vmexit->u.paging.gpa = info2; + vmexit->u.paging.fault_type = npf_fault_type(info1); + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1); + VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault " + "on gpa %#lx/%#lx at rip %#lx", + info2, info1, state->rip); + } else if (svm_npf_emul_fault(info1)) { + svm_handle_inst_emul(vmcb, info2, vmexit); + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INST_EMUL, 1); + VCPU_CTR3(svm_sc->vm, vcpu, "inst_emul fault " + "for gpa %#lx/%#lx at rip %#lx", + info2, info1, state->rip); + } + break; + case VMCB_EXIT_MONITOR: + vmexit->exitcode = VM_EXITCODE_MONITOR; + break; + case VMCB_EXIT_MWAIT: + vmexit->exitcode = VM_EXITCODE_MWAIT; + break; + default: + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1); + break; + } + + VCPU_CTR4(svm_sc->vm, vcpu, "%s %s vmexit at %#lx/%d", + handled ? "handled" : "unhandled", exit_reason_to_str(code), + vmexit->rip, vmexit->inst_length); + + if (handled) { + vmexit->rip += vmexit->inst_length; + vmexit->inst_length = 0; + state->rip = vmexit->rip; + } else { + if (vmexit->exitcode == VM_EXITCODE_BOGUS) { + /* + * If this VM exit was not claimed by anybody then + * treat it as a generic SVM exit. + */ + vm_exit_svm(vmexit, code, info1, info2); + } else { + /* + * The exitcode and collateral have been populated. + * The VM exit will be processed further in userland. + */ + } + } + return (handled); +} + +static void +svm_inj_intinfo(struct svm_softc *svm_sc, int vcpu) +{ + uint64_t intinfo; + + if (!vm_entry_intinfo(svm_sc->vm, vcpu, &intinfo)) + return; + + KASSERT(VMCB_EXITINTINFO_VALID(intinfo), ("%s: entry intinfo is not " + "valid: %#lx", __func__, intinfo)); + + svm_eventinject(svm_sc, vcpu, VMCB_EXITINTINFO_TYPE(intinfo), + VMCB_EXITINTINFO_VECTOR(intinfo), + VMCB_EXITINTINFO_EC(intinfo), + VMCB_EXITINTINFO_EC_VALID(intinfo)); + vmm_stat_incr(svm_sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1); + VCPU_CTR1(svm_sc->vm, vcpu, "Injected entry intinfo: %#lx", intinfo); +} + +/* + * Inject event to virtual cpu. + */ +static void +svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic) +{ + struct vmcb_ctrl *ctrl; + struct vmcb_state *state; + struct svm_vcpu *vcpustate; + uint8_t v_tpr; + int vector, need_intr_window; + int extint_pending; + + state = svm_get_vmcb_state(sc, vcpu); + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + vcpustate = svm_get_vcpu(sc, vcpu); + + need_intr_window = 0; + + vlapic_tmr_update(vlapic); + + if (vcpustate->nextrip != state->rip) { + ctrl->intr_shadow = 0; + VCPU_CTR2(sc->vm, vcpu, "Guest interrupt blocking " + "cleared due to rip change: %#lx/%#lx", + vcpustate->nextrip, state->rip); + } + + /* + * Inject pending events or exceptions for this vcpu. + * + * An event might be pending because the previous #VMEXIT happened + * during event delivery (i.e. ctrl->exitintinfo). + * + * An event might also be pending because an exception was injected + * by the hypervisor (e.g. #PF during instruction emulation). + */ + svm_inj_intinfo(sc, vcpu); + + /* NMI event has priority over interrupts. */ + if (vm_nmi_pending(sc->vm, vcpu)) { + if (nmi_blocked(sc, vcpu)) { + /* + * Can't inject another NMI if the guest has not + * yet executed an "iret" after the last NMI. + */ + VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due " + "to NMI-blocking"); + } else if (ctrl->intr_shadow) { + /* + * Can't inject an NMI if the vcpu is in an intr_shadow. 
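The VMCB_EXIT_MSR case above moves 64-bit MSR values through the EDX:EAX register pair, joining them for WRMSR emulation and splitting the RDMSR result back into %rax/%rdx. Two trivial helpers show the convention (illustrative names):

/* Sketch: EDX:EAX <-> 64-bit MSR value, as done in the MSR exit path. */
static uint64_t
msr_val_join(uint32_t edx, uint32_t eax)
{
	return ((uint64_t)edx << 32 | eax);
}

static void
msr_val_split(uint64_t val, uint32_t *edx, uint32_t *eax)
{
	*eax = (uint32_t)(val & 0xffffffff);
	*edx = (uint32_t)(val >> 32);
}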
+ */ + VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due to " + "interrupt shadow"); + need_intr_window = 1; + goto done; + } else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { + /* + * If there is already an exception/interrupt pending + * then defer the NMI until after that. + */ + VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to " + "eventinj %#lx", ctrl->eventinj); + + /* + * Use self-IPI to trigger a VM-exit as soon as + * possible after the event injection is completed. + * + * This works only if the external interrupt exiting + * is at a lower priority than the event injection. + * + * Although not explicitly specified in APMv2 the + * relative priorities were verified empirically. + */ + ipi_cpu(curcpu, IPI_AST); /* XXX vmm_ipinum? */ + } else { + vm_nmi_clear(sc->vm, vcpu); + + /* Inject NMI, vector number is not used */ + svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_NMI, + IDT_NMI, 0, false); + + /* virtual NMI blocking is now in effect */ + enable_nmi_blocking(sc, vcpu); + + VCPU_CTR0(sc->vm, vcpu, "Injecting vNMI"); + } + } + + extint_pending = vm_extint_pending(sc->vm, vcpu); + if (!extint_pending) { + if (!vlapic_pending_intr(vlapic, &vector)) + goto done; + KASSERT(vector >= 16 && vector <= 255, + ("invalid vector %d from local APIC", vector)); + } else { + /* Ask the legacy pic for a vector to inject */ + vatpic_pending_intr(sc->vm, &vector); + KASSERT(vector >= 0 && vector <= 255, + ("invalid vector %d from INTR", vector)); + } + + /* + * If the guest has disabled interrupts or is in an interrupt shadow + * then we cannot inject the pending interrupt. + */ + if ((state->rflags & PSL_I) == 0) { + VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " + "rflags %#lx", vector, state->rflags); + need_intr_window = 1; + goto done; + } + + if (ctrl->intr_shadow) { + VCPU_CTR1(sc->vm, vcpu, "Cannot inject vector %d due to " + "interrupt shadow", vector); + need_intr_window = 1; + goto done; + } + + if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { + VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " + "eventinj %#lx", vector, ctrl->eventinj); + need_intr_window = 1; + goto done; + } + + svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false); + + if (!extint_pending) { + vlapic_intr_accepted(vlapic, vector); + } else { + vm_extint_clear(sc->vm, vcpu); + vatpic_intr_accepted(sc->vm, vector); + } + + /* + * Force a VM-exit as soon as the vcpu is ready to accept another + * interrupt. This is done because the PIC might have another vector + * that it wants to inject. Also, if the APIC has a pending interrupt + * that was preempted by the ExtInt then it allows us to inject the + * APIC vector as soon as possible. + */ + need_intr_window = 1; +done: + /* + * The guest can modify the TPR by writing to %CR8. In guest mode + * the processor reflects this write to V_TPR without hypervisor + * intervention. + * + * The guest can also modify the TPR by writing to it via the memory + * mapped APIC page. In this case, the write will be emulated by the + * hypervisor. For this reason V_TPR must be updated before every + * VMRUN. + */ + v_tpr = vlapic_get_cr8(vlapic); + KASSERT(v_tpr <= 15, ("invalid v_tpr %#x", v_tpr)); + if (ctrl->v_tpr != v_tpr) { + VCPU_CTR2(sc->vm, vcpu, "VMCB V_TPR changed from %#x to %#x", + ctrl->v_tpr, v_tpr); + ctrl->v_tpr = v_tpr; + svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); + } + + if (need_intr_window) { + /* + * We use V_IRQ in conjunction with the VINTR intercept to + * trap into the hypervisor as soon as a virtual interrupt + * can be delivered. 
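Before injecting a maskable interrupt, the code above checks guest RFLAGS.IF, the interrupt shadow, and whether EVENTINJ is already occupied; failing any of these falls back to requesting an interrupt window. The combined predicate, as a sketch:

/* Sketch: gating conditions for injecting a maskable interrupt. */
static bool
can_inject_intr(const struct vmcb_state *state, const struct vmcb_ctrl *ctrl)
{
	return ((state->rflags & PSL_I) != 0 &&		/* guest IF set */
	    ctrl->intr_shadow == 0 &&			/* no intr shadow */
	    (ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0); /* EVENTINJ free */
}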
+ * + * Since injected events are not subject to intercept checks + * we need to ensure that the V_IRQ is not actually going to + * be delivered on VM entry. The KASSERT below enforces this. + */ + KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 || + (state->rflags & PSL_I) == 0 || ctrl->intr_shadow, + ("Bogus intr_window_exiting: eventinj (%#lx), " + "intr_shadow (%u), rflags (%#lx)", + ctrl->eventinj, ctrl->intr_shadow, state->rflags)); + enable_intr_window_exiting(sc, vcpu); + } else { + disable_intr_window_exiting(sc, vcpu); + } +} + +static __inline void +restore_host_tss(void) +{ +#ifdef __FreeBSD__ + struct system_segment_descriptor *tss_sd; + + /* + * The TSS descriptor was in use prior to launching the guest so it + * has been marked busy. + * + * 'ltr' requires the descriptor to be marked available so change the + * type to "64-bit available TSS". + */ + tss_sd = PCPU_GET(tss); + tss_sd->sd_type = SDT_SYSTSS; + ltr(GSEL(GPROC0_SEL, SEL_KPL)); +#else + system_desc_t *tss = (system_desc_t *)&CPU->cpu_gdt[GDT_KTSS]; + + tss->ssd_type = SDT_SYSTSS; + wr_tsr(KTSS_SEL); +#endif +} + +#ifdef __FreeBSD__ +static void +check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu) +{ + struct svm_vcpu *vcpustate; + struct vmcb_ctrl *ctrl; + long eptgen; + bool alloc_asid; + + KASSERT(CPU_ISSET(thiscpu, &pmap->pm_active), ("%s: nested pmap not " + "active on cpu %u", __func__, thiscpu)); + + vcpustate = svm_get_vcpu(sc, vcpuid); + ctrl = svm_get_vmcb_ctrl(sc, vcpuid); + + /* + * The TLB entries associated with the vcpu's ASID are not valid + * if either of the following conditions is true: + * + * 1. The vcpu's ASID generation is different than the host cpu's + * ASID generation. This happens when the vcpu migrates to a new + * host cpu. It can also happen when the number of vcpus executing + * on a host cpu is greater than the number of ASIDs available. + * + * 2. The pmap generation number is different than the value cached in + * the 'vcpustate'. This happens when the host invalidates pages + * belonging to the guest. + * + * asidgen eptgen Action + * mismatch mismatch + * 0 0 (a) + * 0 1 (b1) or (b2) + * 1 0 (c) + * 1 1 (d) + * + * (a) There is no mismatch in eptgen or ASID generation and therefore + * no further action is needed. + * + * (b1) If the cpu supports FlushByAsid then the vcpu's ASID is + * retained and the TLB entries associated with this ASID + * are flushed by VMRUN. + * + * (b2) If the cpu does not support FlushByAsid then a new ASID is + * allocated. + * + * (c) A new ASID is allocated. + * + * (d) A new ASID is allocated. + */ + + alloc_asid = false; + eptgen = pmap->pm_eptgen; + ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING; + + if (vcpustate->asid.gen != asid[thiscpu].gen) { + alloc_asid = true; /* (c) and (d) */ + } else if (vcpustate->eptgen != eptgen) { + if (flush_by_asid()) + ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; /* (b1) */ + else + alloc_asid = true; /* (b2) */ + } else { + /* + * This is the common case (a). + */ + KASSERT(!alloc_asid, ("ASID allocation not necessary")); + KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING, + ("Invalid VMCB tlb_ctrl: %#x", ctrl->tlb_ctrl)); + } + + if (alloc_asid) { + if (++asid[thiscpu].num >= nasid) { + asid[thiscpu].num = 1; + if (++asid[thiscpu].gen == 0) + asid[thiscpu].gen = 1; + /* + * If this cpu does not support "flush-by-asid" + * then flush the entire TLB on a generation + * bump. Subsequent ASID allocation in this + * generation can be done without a TLB flush. 
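The decision table in the comment above boils down to a small amount of logic: an ASID-generation mismatch always forces a new ASID, an eptgen mismatch alone can be satisfied by flush-by-ASID when the CPU supports it, and the common case needs nothing. A condensed sketch (parameter names are illustrative):

/* Sketch: the (a)/(b1)/(b2)/(c)/(d) cases from the comment above. */
static void
asid_decide(bool gen_mismatch, bool eptgen_mismatch, bool has_flush_by_asid,
    bool *alloc_new_asid, uint8_t *tlb_ctrl)
{
	*alloc_new_asid = false;
	*tlb_ctrl = VMCB_TLB_FLUSH_NOTHING;

	if (gen_mismatch) {
		*alloc_new_asid = true;			/* cases (c) and (d) */
	} else if (eptgen_mismatch) {
		if (has_flush_by_asid)
			*tlb_ctrl = VMCB_TLB_FLUSH_GUEST;	/* case (b1) */
		else
			*alloc_new_asid = true;			/* case (b2) */
	}
	/* case (a): neither mismatched, nothing to do */
}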
+ */ + if (!flush_by_asid()) + ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL; + } + vcpustate->asid.gen = asid[thiscpu].gen; + vcpustate->asid.num = asid[thiscpu].num; + + ctrl->asid = vcpustate->asid.num; + svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID); + /* + * If this cpu supports "flush-by-asid" then the TLB + * was not flushed after the generation bump. The TLB + * is flushed selectively after every new ASID allocation. + */ + if (flush_by_asid()) + ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; + } + vcpustate->eptgen = eptgen; + + KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero")); + KASSERT(ctrl->asid == vcpustate->asid.num, + ("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num)); +} +#else /* __FreeBSD__ */ +static void +check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu) +{ + struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid); + struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid); + long eptgen; + uint8_t flush; + + eptgen = pmap->pm_eptgen; + flush = hma_svm_asid_update(&vcpustate->hma_asid, flush_by_asid(), + vcpustate->eptgen == eptgen); + + if (flush != VMCB_TLB_FLUSH_NOTHING) { + ctrl->asid = vcpustate->hma_asid.hsa_asid; + svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID); + } + ctrl->tlb_ctrl = flush; + vcpustate->eptgen = eptgen; +} +#endif /* __FreeBSD__ */ + +static __inline void +disable_gintr(void) +{ + + __asm __volatile("clgi"); +} + +static __inline void +enable_gintr(void) +{ + + __asm __volatile("stgi"); +} + +static __inline void +svm_dr_enter_guest(struct svm_regctx *gctx) +{ + + /* Save host control debug registers. */ + gctx->host_dr7 = rdr7(); + gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); + + /* + * Disable debugging in DR7 and DEBUGCTL to avoid triggering + * exceptions in the host based on the guest DRx values. The + * guest DR6, DR7, and DEBUGCTL are saved/restored in the + * VMCB. + */ + load_dr7(0); + wrmsr(MSR_DEBUGCTLMSR, 0); + + /* Save host debug registers. */ + gctx->host_dr0 = rdr0(); + gctx->host_dr1 = rdr1(); + gctx->host_dr2 = rdr2(); + gctx->host_dr3 = rdr3(); + gctx->host_dr6 = rdr6(); + + /* Restore guest debug registers. */ + load_dr0(gctx->sctx_dr0); + load_dr1(gctx->sctx_dr1); + load_dr2(gctx->sctx_dr2); + load_dr3(gctx->sctx_dr3); +} + +static __inline void +svm_dr_leave_guest(struct svm_regctx *gctx) +{ + + /* Save guest debug registers. */ + gctx->sctx_dr0 = rdr0(); + gctx->sctx_dr1 = rdr1(); + gctx->sctx_dr2 = rdr2(); + gctx->sctx_dr3 = rdr3(); + + /* + * Restore host debug registers. Restore DR7 and DEBUGCTL + * last. + */ + load_dr0(gctx->host_dr0); + load_dr1(gctx->host_dr1); + load_dr2(gctx->host_dr2); + load_dr3(gctx->host_dr3); + load_dr6(gctx->host_dr6); + wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl); + load_dr7(gctx->host_dr7); +} + +/* + * Start vcpu with specified RIP. 
+ */ +static int +svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, + struct vm_eventinfo *evinfo) +{ + struct svm_regctx *gctx; + struct svm_softc *svm_sc; + struct svm_vcpu *vcpustate; + struct vmcb_state *state; + struct vmcb_ctrl *ctrl; + struct vm_exit *vmexit; + struct vlapic *vlapic; + struct vm *vm; + uint64_t vmcb_pa; + int handled; + uint16_t ldt_sel; + + svm_sc = arg; + vm = svm_sc->vm; + + vcpustate = svm_get_vcpu(svm_sc, vcpu); + state = svm_get_vmcb_state(svm_sc, vcpu); + ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); + vmexit = vm_exitinfo(vm, vcpu); + vlapic = vm_lapic(vm, vcpu); + + gctx = svm_get_guest_regctx(svm_sc, vcpu); + vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa; + + if (vcpustate->lastcpu != curcpu) { + /* + * Force new ASID allocation by invalidating the generation. + */ +#ifdef __FreeBSD__ + vcpustate->asid.gen = 0; +#else + vcpustate->hma_asid.hsa_gen = 0; +#endif + + /* + * Invalidate the VMCB state cache by marking all fields dirty. + */ + svm_set_dirty(svm_sc, vcpu, 0xffffffff); + + /* + * XXX + * Setting 'vcpustate->lastcpu' here is bit premature because + * we may return from this function without actually executing + * the VMRUN instruction. This could happen if an AST or yield + * condition is pending on the first time through the loop. + * + * This works for now but any new side-effects of vcpu + * migration should take this case into account. + */ + vcpustate->lastcpu = curcpu; + vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1); + } + + svm_msr_guest_enter(svm_sc, vcpu); + +#ifndef __FreeBSD__ + VERIFY(!vcpustate->loaded && curthread->t_preempt != 0); + vcpustate->loaded = B_TRUE; +#endif + + /* Update Guest RIP */ + state->rip = rip; + + do { +#ifndef __FreeBSD__ + /* + * Interrupt injection may involve mutex contention which, on + * illumos bhyve, are blocking/non-spin. Doing so with global + * interrupts disabled is a recipe for deadlock, so it is + * performed here. + */ + svm_inj_interrupts(svm_sc, vcpu, vlapic); +#endif + + /* + * Disable global interrupts to guarantee atomicity during + * loading of guest state. This includes not only the state + * loaded by the "vmrun" instruction but also software state + * maintained by the hypervisor: suspended and rendezvous + * state, NPT generation number, vlapic interrupts etc. + */ + disable_gintr(); + + if (vcpu_suspended(evinfo)) { + enable_gintr(); + vm_exit_suspended(vm, vcpu, state->rip); + break; + } + + if (vcpu_runblocked(evinfo)) { + enable_gintr(); + vm_exit_runblock(vm, vcpu, state->rip); + break; + } + + if (vcpu_reqidle(evinfo)) { + enable_gintr(); + vm_exit_reqidle(vm, vcpu, state->rip); + break; + } + + /* We are asked to give the cpu by scheduler. */ + if (vcpu_should_yield(vm, vcpu)) { + enable_gintr(); + vm_exit_astpending(vm, vcpu, state->rip); + break; + } + + if (vcpu_debugged(vm, vcpu)) { + enable_gintr(); + vm_exit_debug(vm, vcpu, state->rip); + break; + } + + /* + * #VMEXIT resumes the host with the guest LDTR, so + * save the current LDT selector so it can be restored + * after an exit. The userspace hypervisor probably + * doesn't use a LDT, but save and restore it to be + * safe. + */ + ldt_sel = sldt(); + +#ifdef __FreeBSD__ + svm_inj_interrupts(svm_sc, vcpu, vlapic); +#endif + + /* Activate the nested pmap on 'curcpu' */ + CPU_SET_ATOMIC_ACQ(curcpu, &pmap->pm_active); + + /* + * Check the pmap generation and the ASID generation to + * ensure that the vcpu does not use stale TLB mappings. 
+ */ + check_asid(svm_sc, vcpu, pmap, curcpu); + + ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty; + vcpustate->dirty = 0; + VCPU_CTR1(vm, vcpu, "vmcb clean %#x", ctrl->vmcb_clean); + + /* Launch Virtual Machine. */ + VCPU_CTR1(vm, vcpu, "Resume execution at %#lx", state->rip); + svm_dr_enter_guest(gctx); +#ifdef __FreeBSD__ + svm_launch(vmcb_pa, gctx, &__pcpu[curcpu]); +#else + svm_launch(vmcb_pa, gctx, CPU); +#endif + svm_dr_leave_guest(gctx); + + CPU_CLR_ATOMIC(curcpu, &pmap->pm_active); + + /* + * The host GDTR and IDTR is saved by VMRUN and restored + * automatically on #VMEXIT. However, the host TSS needs + * to be restored explicitly. + */ + restore_host_tss(); + + /* Restore host LDTR. */ + lldt(ldt_sel); + + /* #VMEXIT disables interrupts so re-enable them here. */ + enable_gintr(); + + /* Update 'nextrip' */ + vcpustate->nextrip = state->rip; + + /* Handle #VMEXIT and if required return to user space. */ + handled = svm_vmexit(svm_sc, vcpu, vmexit); + } while (handled); + + svm_msr_guest_exit(svm_sc, vcpu); + +#ifndef __FreeBSD__ + VERIFY(vcpustate->loaded && curthread->t_preempt != 0); + vcpustate->loaded = B_FALSE; +#endif + + return (0); +} + +static void +svm_vmcleanup(void *arg) +{ + struct svm_softc *sc = arg; + + contigfree(sc->iopm_bitmap, SVM_IO_BITMAP_SIZE, M_SVM); + contigfree(sc->msr_bitmap, SVM_MSR_BITMAP_SIZE, M_SVM); + free(sc, M_SVM); +} + +static register_t * +swctx_regptr(struct svm_regctx *regctx, int reg) +{ + + switch (reg) { + case VM_REG_GUEST_RBX: + return (®ctx->sctx_rbx); + case VM_REG_GUEST_RCX: + return (®ctx->sctx_rcx); + case VM_REG_GUEST_RDX: + return (®ctx->sctx_rdx); + case VM_REG_GUEST_RDI: + return (®ctx->sctx_rdi); + case VM_REG_GUEST_RSI: + return (®ctx->sctx_rsi); + case VM_REG_GUEST_RBP: + return (®ctx->sctx_rbp); + case VM_REG_GUEST_R8: + return (®ctx->sctx_r8); + case VM_REG_GUEST_R9: + return (®ctx->sctx_r9); + case VM_REG_GUEST_R10: + return (®ctx->sctx_r10); + case VM_REG_GUEST_R11: + return (®ctx->sctx_r11); + case VM_REG_GUEST_R12: + return (®ctx->sctx_r12); + case VM_REG_GUEST_R13: + return (®ctx->sctx_r13); + case VM_REG_GUEST_R14: + return (®ctx->sctx_r14); + case VM_REG_GUEST_R15: + return (®ctx->sctx_r15); + case VM_REG_GUEST_DR0: + return (®ctx->sctx_dr0); + case VM_REG_GUEST_DR1: + return (®ctx->sctx_dr1); + case VM_REG_GUEST_DR2: + return (®ctx->sctx_dr2); + case VM_REG_GUEST_DR3: + return (®ctx->sctx_dr3); + default: + return (NULL); + } +} + +static int +svm_getreg(void *arg, int vcpu, int ident, uint64_t *val) +{ + struct svm_softc *svm_sc; + register_t *reg; + + svm_sc = arg; + + if (ident == VM_REG_GUEST_INTR_SHADOW) { + return (svm_get_intr_shadow(svm_sc, vcpu, val)); + } + + if (vmcb_read(svm_sc, vcpu, ident, val) == 0) { + return (0); + } + + reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident); + + if (reg != NULL) { + *val = *reg; + return (0); + } + + VCPU_CTR1(svm_sc->vm, vcpu, "svm_getreg: unknown register %#x", ident); + return (EINVAL); +} + +static int +svm_setreg(void *arg, int vcpu, int ident, uint64_t val) +{ + struct svm_softc *svm_sc; + register_t *reg; + + svm_sc = arg; + + if (ident == VM_REG_GUEST_INTR_SHADOW) { + return (svm_modify_intr_shadow(svm_sc, vcpu, val)); + } + + if (vmcb_write(svm_sc, vcpu, ident, val) == 0) { + return (0); + } + + reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident); + + if (reg != NULL) { + *reg = val; + return (0); + } + + /* + * XXX deal with CR3 and invalidate TLB entries tagged with the + * vcpu's ASID. 
This needs to be treated differently depending on + * whether 'running' is true/false. + */ + + VCPU_CTR1(svm_sc->vm, vcpu, "svm_setreg: unknown register %#x", ident); + return (EINVAL); +} + +static int +svm_setcap(void *arg, int vcpu, int type, int val) +{ + struct svm_softc *sc; + int error; + + sc = arg; + error = 0; + switch (type) { + case VM_CAP_HALT_EXIT: + svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_HLT, val); + break; + case VM_CAP_PAUSE_EXIT: + svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_PAUSE, val); + break; + case VM_CAP_UNRESTRICTED_GUEST: + /* Unrestricted guest execution cannot be disabled in SVM */ + if (val == 0) + error = EINVAL; + break; + default: + error = ENOENT; + break; + } + return (error); +} + +static int +svm_getcap(void *arg, int vcpu, int type, int *retval) +{ + struct svm_softc *sc; + int error; + + sc = arg; + error = 0; + + switch (type) { + case VM_CAP_HALT_EXIT: + *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_HLT); + break; + case VM_CAP_PAUSE_EXIT: + *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_PAUSE); + break; + case VM_CAP_UNRESTRICTED_GUEST: + *retval = 1; /* unrestricted guest is always enabled */ + break; + default: + error = ENOENT; + break; + } + return (error); +} + +static struct vlapic * +svm_vlapic_init(void *arg, int vcpuid) +{ + struct svm_softc *svm_sc; + struct vlapic *vlapic; + + svm_sc = arg; + vlapic = malloc(sizeof(struct vlapic), M_SVM_VLAPIC, M_WAITOK | M_ZERO); + vlapic->vm = svm_sc->vm; + vlapic->vcpuid = vcpuid; + vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid]; + + vlapic_init(vlapic); + + return (vlapic); +} + +static void +svm_vlapic_cleanup(void *arg, struct vlapic *vlapic) +{ + + vlapic_cleanup(vlapic); + free(vlapic, M_SVM_VLAPIC); +} + +#ifndef __FreeBSD__ +static void +svm_savectx(void *arg, int vcpu) +{ + struct svm_softc *sc = arg; + + if (sc->vcpu[vcpu].loaded) { + svm_msr_guest_exit(sc, vcpu); + } +} + +static void +svm_restorectx(void *arg, int vcpu) +{ + struct svm_softc *sc = arg; + + if (sc->vcpu[vcpu].loaded) { + svm_msr_guest_enter(sc, vcpu); + } +} +#endif /* __FreeBSD__ */ + +struct vmm_ops vmm_ops_amd = { + svm_init, + svm_cleanup, + svm_restore, + svm_vminit, + svm_vmrun, + svm_vmcleanup, + svm_getreg, + svm_setreg, + vmcb_getdesc, + vmcb_setdesc, + svm_getcap, + svm_setcap, + svm_npt_alloc, + svm_npt_free, + svm_vlapic_init, + svm_vlapic_cleanup, + +#ifndef __FreeBSD__ + svm_savectx, + svm_restorectx, +#endif +}; diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm.h b/usr/src/uts/i86pc/io/vmm/amd/svm.h new file mode 100644 index 0000000000..c78f7eb067 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/svm.h @@ -0,0 +1,74 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SVM_H_ +#define _SVM_H_ + +/* + * Guest register state that is saved outside the VMCB. + */ +struct svm_regctx { + register_t sctx_rbp; + register_t sctx_rbx; + register_t sctx_rcx; + register_t sctx_rdx; + register_t sctx_rdi; + register_t sctx_rsi; + register_t sctx_r8; + register_t sctx_r9; + register_t sctx_r10; + register_t sctx_r11; + register_t sctx_r12; + register_t sctx_r13; + register_t sctx_r14; + register_t sctx_r15; + register_t sctx_dr0; + register_t sctx_dr1; + register_t sctx_dr2; + register_t sctx_dr3; + + register_t host_dr0; + register_t host_dr1; + register_t host_dr2; + register_t host_dr3; + register_t host_dr6; + register_t host_dr7; + uint64_t host_debugctl; +}; + +#ifdef __FreeBSD__ +struct pcpu; +void svm_launch(uint64_t pa, struct svm_regctx *gctx, struct pcpu *pcpu); +#else +struct cpu; +void svm_launch(uint64_t pa, struct svm_regctx *gctx, struct cpu *pcpu); +#endif + +#endif /* _SVM_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_msr.c b/usr/src/uts/i86pc/io/vmm/amd/svm_msr.c new file mode 100644 index 0000000000..0c1ce0e4e0 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_msr.c @@ -0,0 +1,199 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014, Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/systm.h> + +#include <machine/cpufunc.h> +#include <machine/specialreg.h> +#include <machine/vmm.h> + +#include "svm.h" +#include "vmcb.h" +#include "svm_softc.h" +#include "svm_msr.h" + +#ifndef MSR_AMDK8_IPM +#define MSR_AMDK8_IPM 0xc0010055 +#endif + +enum { + IDX_MSR_LSTAR, + IDX_MSR_CSTAR, + IDX_MSR_STAR, + IDX_MSR_SF_MASK, + HOST_MSR_NUM /* must be the last enumeration */ +}; + +#ifdef __FreeBSD__ +static uint64_t host_msrs[HOST_MSR_NUM]; + +void +svm_msr_init(void) +{ + /* + * It is safe to cache the values of the following MSRs because they + * don't change based on curcpu, curproc or curthread. + */ + host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); +} +#else + +CTASSERT(HOST_MSR_NUM == SVM_HOST_MSR_NUM); + +void +svm_msr_init(void) +{ + /* + * These MSRs do vary between CPUs on illumos, so saving system-wide + * values for them serves no purpose. + */ +} +#endif /* __FreeBSD__ */ + +void +svm_msr_guest_init(struct svm_softc *sc, int vcpu) +{ + /* + * All the MSRs accessible to the guest are either saved/restored by + * hardware on every #VMEXIT/VMRUN (e.g., G_PAT) or are saved/restored + * by VMSAVE/VMLOAD (e.g., MSR_GSBASE). + * + * There are no guest MSRs that are saved/restored "by hand" so nothing + * more to do here. + */ + return; +} + +void +svm_msr_guest_enter(struct svm_softc *sc, int vcpu) +{ + /* + * Save host MSRs (if any) and restore guest MSRs (if any). + */ +#ifndef __FreeBSD__ + uint64_t *host_msrs = sc->host_msrs[vcpu]; + + /* Save host MSRs */ + host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); +#endif /* __FreeBSD__ */ +} + +void +svm_msr_guest_exit(struct svm_softc *sc, int vcpu) +{ +#ifndef __FreeBSD__ + uint64_t *host_msrs = sc->host_msrs[vcpu]; +#endif + /* + * Save guest MSRs (if any) and restore host MSRs. + */ + wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]); + wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]); + wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]); + wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]); + + /* MSR_KGSBASE will be restored on the way back to userspace */ +} + +int +svm_rdmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t *result, + bool *retu) +{ + int error = 0; + + switch (num) { + case MSR_MCG_CAP: + case MSR_MCG_STATUS: + *result = 0; + break; + case MSR_MTRRcap: + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + case MSR_SYSCFG: + case MSR_AMDK8_IPM: + case MSR_EXTFEATURES: + *result = 0; + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +int +svm_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, bool *retu) +{ + int error = 0; + + switch (num) { + case MSR_MCG_CAP: + case MSR_MCG_STATUS: + break; /* ignore writes */ + case MSR_MTRRcap: + vm_inject_gp(sc->vm, vcpu); + break; + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + case MSR_SYSCFG: + break; /* Ignore writes */ + case MSR_AMDK8_IPM: + /* + * Ignore writes to the "Interrupt Pending Message" MSR. 
+ */ + break; + case MSR_K8_UCODE_UPDATE: + /* + * Ignore writes to microcode update register. + */ + break; + case MSR_EXTFEATURES: + break; + default: + error = EINVAL; + break; + } + + return (error); +} diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_msr.h b/usr/src/uts/i86pc/io/vmm/amd/svm_msr.h new file mode 100644 index 0000000000..1dba8101ab --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_msr.h @@ -0,0 +1,46 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SVM_MSR_H_ +#define _SVM_MSR_H_ + +struct svm_softc; + +void svm_msr_init(void); +void svm_msr_guest_init(struct svm_softc *sc, int vcpu); +void svm_msr_guest_enter(struct svm_softc *sc, int vcpu); +void svm_msr_guest_exit(struct svm_softc *sc, int vcpu); + +int svm_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, + bool *retu); +int svm_rdmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t *result, + bool *retu); + +#endif /* _SVM_MSR_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h b/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h new file mode 100644 index 0000000000..b5ac1903e7 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h @@ -0,0 +1,131 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SVM_SOFTC_H_ +#define _SVM_SOFTC_H_ + +#define SVM_IO_BITMAP_SIZE (3 * PAGE_SIZE) +#define SVM_MSR_BITMAP_SIZE (2 * PAGE_SIZE) + +#ifdef __FreeBSD__ +struct asid { + uint64_t gen; /* range is [1, ~0UL] */ + uint32_t num; /* range is [1, nasid - 1] */ +}; +#else +#include <sys/hma.h> + +/* This must match HOST_MSR_NUM in svm_msr.c (where it is CTASSERTed) */ +#define SVM_HOST_MSR_NUM 4 +#endif /* __FreeBSD__ */ + +/* + * XXX separate out 'struct vmcb' from 'svm_vcpu' to avoid wasting space + * due to VMCB alignment requirements. + */ +struct svm_vcpu { + struct vmcb vmcb; /* hardware saved vcpu context */ + struct svm_regctx swctx; /* software saved vcpu context */ + uint64_t vmcb_pa; /* VMCB physical address */ + uint64_t nextrip; /* next instruction to be executed by guest */ + int lastcpu; /* host cpu that the vcpu last ran on */ + uint32_t dirty; /* state cache bits that must be cleared */ + long eptgen; /* pmap->pm_eptgen when the vcpu last ran */ +#ifdef __FreeBSD__ + struct asid asid; +#else + hma_svm_asid_t hma_asid; + boolean_t loaded; +#endif +} __aligned(PAGE_SIZE); + +/* + * SVM softc, one per virtual machine. + */ +struct svm_softc { + uint8_t apic_page[VM_MAXCPU][PAGE_SIZE]; + struct svm_vcpu vcpu[VM_MAXCPU]; + vm_offset_t nptp; /* nested page table */ + uint8_t *iopm_bitmap; /* shared by all vcpus */ + uint8_t *msr_bitmap; /* shared by all vcpus */ + struct vm *vm; +#ifndef __FreeBSD__ + uint64_t host_msrs[VM_MAXCPU][SVM_HOST_MSR_NUM]; +#endif +}; + +CTASSERT((offsetof(struct svm_softc, nptp) & PAGE_MASK) == 0); + +static __inline struct svm_vcpu * +svm_get_vcpu(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu])); +} + +static __inline struct vmcb * +svm_get_vmcb(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu].vmcb)); +} + +static __inline struct vmcb_state * +svm_get_vmcb_state(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu].vmcb.state)); +} + +static __inline struct vmcb_ctrl * +svm_get_vmcb_ctrl(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu].vmcb.ctrl)); +} + +static __inline struct svm_regctx * +svm_get_guest_regctx(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu].swctx)); +} + +static __inline void +svm_set_dirty(struct svm_softc *sc, int vcpu, uint32_t dirtybits) +{ + struct svm_vcpu *vcpustate; + + vcpustate = svm_get_vcpu(sc, vcpu); + + vcpustate->dirty |= dirtybits; +} + +#endif /* _SVM_SOFTC_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_support.s b/usr/src/uts/i86pc/io/vmm/amd/svm_support.s new file mode 100644 index 0000000000..fad994b09c --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_support.s @@ -0,0 +1,164 @@ +/*- + * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/asm_linkage.h> + +#include "svm_assym.h" + +/* Porting note: This is named 'svm_support.S' upstream. */ + +#if defined(lint) + +struct svm_regctx; +struct cpu; + +/*ARGSUSED*/ +void +svm_launch(uint64_t pa, struct svm_regctx *gctx, struct cpu *cpu) +{} + +#else /* lint */ + +#define VMLOAD .byte 0x0f, 0x01, 0xda +#define VMRUN .byte 0x0f, 0x01, 0xd8 +#define VMSAVE .byte 0x0f, 0x01, 0xdb + + +/* + * Flush scratch registers to avoid lingering guest state being used for + * Spectre v1 attacks when returning from guest entry. + */ +#define SVM_GUEST_FLUSH_SCRATCH \ + xorl %edi, %edi; \ + xorl %esi, %esi; \ + xorl %edx, %edx; \ + xorl %ecx, %ecx; \ + xorl %r8d, %r8d; \ + xorl %r9d, %r9d; \ + xorl %r10d, %r10d; \ + xorl %r11d, %r11d; + +/* Stack layout (offset from %rsp) for svm_launch */ +#define SVMSTK_R15 0x00 /* callee saved %r15 */ +#define SVMSTK_R14 0x08 /* callee saved %r14 */ +#define SVMSTK_R13 0x10 /* callee saved %r13 */ +#define SVMSTK_R12 0x18 /* callee saved %r12 */ +#define SVMSTK_RBX 0x20 /* callee saved %rbx */ +#define SVMSTK_RDX 0x28 /* save-args %rdx (struct cpu *) */ +#define SVMSTK_RSI 0x30 /* save-args %rsi (struct svm_regctx *) */ +#define SVMSTK_RDI 0x38 /* save-args %rdi (uint64_t vmcb_pa) */ +#define SVMSTK_FP 0x40 /* frame pointer %rbp */ +#define SVMSTKSIZE SVMSTK_FP + +/* + * svm_launch(uint64_t vmcb, struct svm_regctx *gctx, struct pcpu *pcpu) + * %rdi: physical address of VMCB + * %rsi: pointer to guest context + * %rdx: pointer to the pcpu data + */ +ENTRY_NP(svm_launch) + pushq %rbp + movq %rsp, %rbp + subq $SVMSTKSIZE, %rsp + movq %r15, SVMSTK_R15(%rsp) + movq %r14, SVMSTK_R14(%rsp) + movq %r13, SVMSTK_R13(%rsp) + movq %r12, SVMSTK_R12(%rsp) + movq %rbx, SVMSTK_RBX(%rsp) + movq %rdx, SVMSTK_RDX(%rsp) + movq %rsi, SVMSTK_RSI(%rsp) + movq %rdi, SVMSTK_RDI(%rsp) + + /* VMLOAD and VMRUN expect the VMCB physaddr in %rax */ + movq %rdi, %rax + + /* Restore guest state. 
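The SVMSTK_* offsets above describe the scratch area svm_launch() carves out below its frame pointer for the callee-saved registers and its three arguments. A C-level picture of that layout, purely as a reading aid (the assembly itself is authoritative):

/* Sketch: layout of the svm_launch() scratch frame (offsets from %rsp). */
struct svm_launch_frame {
	uint64_t r15;		/* 0x00  callee-saved */
	uint64_t r14;		/* 0x08  callee-saved */
	uint64_t r13;		/* 0x10  callee-saved */
	uint64_t r12;		/* 0x18  callee-saved */
	uint64_t rbx;		/* 0x20  callee-saved */
	uint64_t cpu;		/* 0x28  saved %rdx: struct cpu pointer */
	uint64_t gctx;		/* 0x30  saved %rsi: struct svm_regctx pointer */
	uint64_t vmcb_pa;	/* 0x38  saved %rdi: VMCB physical address */
};	/* sizeof == SVMSTKSIZE (0x40) */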
*/ + movq SCTX_R8(%rsi), %r8 + movq SCTX_R9(%rsi), %r9 + movq SCTX_R10(%rsi), %r10 + movq SCTX_R11(%rsi), %r11 + movq SCTX_R12(%rsi), %r12 + movq SCTX_R13(%rsi), %r13 + movq SCTX_R14(%rsi), %r14 + movq SCTX_R15(%rsi), %r15 + movq SCTX_RBP(%rsi), %rbp + movq SCTX_RBX(%rsi), %rbx + movq SCTX_RCX(%rsi), %rcx + movq SCTX_RDX(%rsi), %rdx + movq SCTX_RDI(%rsi), %rdi + movq SCTX_RSI(%rsi), %rsi /* %rsi must be restored last */ + + VMLOAD + VMRUN + VMSAVE + + /* Grab the svm_regctx pointer */ + movq SVMSTK_RSI(%rsp), %rax + + /* Save guest state. */ + movq %r8, SCTX_R8(%rax) + movq %r9, SCTX_R9(%rax) + movq %r10, SCTX_R10(%rax) + movq %r11, SCTX_R11(%rax) + movq %r12, SCTX_R12(%rax) + movq %r13, SCTX_R13(%rax) + movq %r14, SCTX_R14(%rax) + movq %r15, SCTX_R15(%rax) + movq %rbp, SCTX_RBP(%rax) + movq %rbx, SCTX_RBX(%rax) + movq %rcx, SCTX_RCX(%rax) + movq %rdx, SCTX_RDX(%rax) + movq %rdi, SCTX_RDI(%rax) + movq %rsi, SCTX_RSI(%rax) + + /* Restore callee-saved registers */ + movq SVMSTK_R15(%rsp), %r15 + movq SVMSTK_R14(%rsp), %r14 + movq SVMSTK_R13(%rsp), %r13 + movq SVMSTK_R12(%rsp), %r12 + movq SVMSTK_RBX(%rsp), %rbx + + /* Fix %gsbase to point back to the correct 'struct cpu *' */ + movq SVMSTK_RDX(%rsp), %rdx + movl %edx, %eax + shrq $32, %rdx + movl $MSR_GSBASE, %ecx + wrmsr + + SVM_GUEST_FLUSH_SCRATCH + + addq $SVMSTKSIZE, %rsp + popq %rbp + ret +SET_SIZE(svm_launch) + +#endif /* lint */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/vmcb.c b/usr/src/uts/i86pc/io/vmm/amd/vmcb.c new file mode 100644 index 0000000000..5075b69867 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/vmcb.c @@ -0,0 +1,454 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> + +#include <machine/segments.h> +#include <machine/specialreg.h> +#include <machine/vmm.h> + +#include "vmm_ktr.h" + +#include "vmcb.h" +#include "svm.h" +#include "svm_softc.h" + +/* + * The VMCB aka Virtual Machine Control Block is a 4KB aligned page + * in memory that describes the virtual machine. 
+ * + * The VMCB contains: + * - instructions or events in the guest to intercept + * - control bits that modify execution environment of the guest + * - guest processor state (e.g. general purpose registers) + */ + +/* + * Return VMCB segment area. + */ +static struct vmcb_segment * +vmcb_segptr(struct vmcb *vmcb, int type) +{ + struct vmcb_state *state; + struct vmcb_segment *seg; + + state = &vmcb->state; + + switch (type) { + case VM_REG_GUEST_CS: + seg = &state->cs; + break; + + case VM_REG_GUEST_DS: + seg = &state->ds; + break; + + case VM_REG_GUEST_ES: + seg = &state->es; + break; + + case VM_REG_GUEST_FS: + seg = &state->fs; + break; + + case VM_REG_GUEST_GS: + seg = &state->gs; + break; + + case VM_REG_GUEST_SS: + seg = &state->ss; + break; + + case VM_REG_GUEST_GDTR: + seg = &state->gdt; + break; + + case VM_REG_GUEST_IDTR: + seg = &state->idt; + break; + + case VM_REG_GUEST_LDTR: + seg = &state->ldt; + break; + + case VM_REG_GUEST_TR: + seg = &state->tr; + break; + + default: + seg = NULL; + break; + } + + return (seg); +} + +static int +vmcb_access(struct svm_softc *softc, int vcpu, int write, int ident, + uint64_t *val) +{ + struct vmcb *vmcb; + int off, bytes; + char *ptr; + + vmcb = svm_get_vmcb(softc, vcpu); + off = VMCB_ACCESS_OFFSET(ident); + bytes = VMCB_ACCESS_BYTES(ident); + + if ((off + bytes) >= sizeof (struct vmcb)) + return (EINVAL); + + ptr = (char *)vmcb; + + if (!write) + *val = 0; + + switch (bytes) { + case 8: + case 4: + case 2: + if (write) + memcpy(ptr + off, val, bytes); + else + memcpy(val, ptr + off, bytes); + break; + default: + VCPU_CTR1(softc->vm, vcpu, + "Invalid size %d for VMCB access: %d", bytes); + return (EINVAL); + } + + /* Invalidate all VMCB state cached by h/w. */ + if (write) + svm_set_dirty(softc, vcpu, 0xffffffff); + + return (0); +} + +/* + * Read from segment selector, control and general purpose register of VMCB. + */ +int +vmcb_read(struct svm_softc *sc, int vcpu, int ident, uint64_t *retval) +{ + struct vmcb *vmcb; + struct vmcb_state *state; + struct vmcb_segment *seg; + int err; + + vmcb = svm_get_vmcb(sc, vcpu); + state = &vmcb->state; + err = 0; + + if (VMCB_ACCESS_OK(ident)) + return (vmcb_access(sc, vcpu, 0, ident, retval)); + + switch (ident) { + case VM_REG_GUEST_CR0: + *retval = state->cr0; + break; + + case VM_REG_GUEST_CR2: + *retval = state->cr2; + break; + + case VM_REG_GUEST_CR3: + *retval = state->cr3; + break; + + case VM_REG_GUEST_CR4: + *retval = state->cr4; + break; + + case VM_REG_GUEST_DR6: + *retval = state->dr6; + break; + + case VM_REG_GUEST_DR7: + *retval = state->dr7; + break; + + case VM_REG_GUEST_EFER: + *retval = state->efer; + break; + + case VM_REG_GUEST_RAX: + *retval = state->rax; + break; + + case VM_REG_GUEST_RFLAGS: + *retval = state->rflags; + break; + + case VM_REG_GUEST_RIP: + *retval = state->rip; + break; + + case VM_REG_GUEST_RSP: + *retval = state->rsp; + break; + + case VM_REG_GUEST_CS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_SS: + case VM_REG_GUEST_LDTR: + case VM_REG_GUEST_TR: + seg = vmcb_segptr(vmcb, ident); + KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB", + __func__, ident)); + *retval = seg->selector; + break; + + case VM_REG_GUEST_GDTR: + case VM_REG_GUEST_IDTR: + /* GDTR and IDTR don't have segment selectors */ + err = EINVAL; + break; + default: + err = EINVAL; + break; + } + + return (err); +} + +/* + * Write to segment selector, control and general purpose register of VMCB. 
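+ * Writes to fields which the processor may cache across VMRUNs (the
+ * control and debug registers, EFER, and the CS/DS/ES/SS selectors)
+ * also mark the matching VMCB clean bits dirty via svm_set_dirty() so
+ * that hardware reloads them on the next VMRUN.  Raw accesses encoded
+ * with VMCB_ACCESS(), e.g. VMCB_ACCESS(VMCB_OFF_TSC_OFFSET, 8), bypass
+ * this mapping and simply invalidate every cached field.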
+ */ +int +vmcb_write(struct svm_softc *sc, int vcpu, int ident, uint64_t val) +{ + struct vmcb *vmcb; + struct vmcb_state *state; + struct vmcb_segment *seg; + int err, dirtyseg; + + vmcb = svm_get_vmcb(sc, vcpu); + state = &vmcb->state; + dirtyseg = 0; + err = 0; + + if (VMCB_ACCESS_OK(ident)) + return (vmcb_access(sc, vcpu, 1, ident, &val)); + + switch (ident) { + case VM_REG_GUEST_CR0: + state->cr0 = val; + svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); + break; + + case VM_REG_GUEST_CR2: + state->cr2 = val; + svm_set_dirty(sc, vcpu, VMCB_CACHE_CR2); + break; + + case VM_REG_GUEST_CR3: + state->cr3 = val; + svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); + break; + + case VM_REG_GUEST_CR4: + state->cr4 = val; + svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); + break; + + case VM_REG_GUEST_DR6: + state->dr6 = val; + svm_set_dirty(sc, vcpu, VMCB_CACHE_DR); + break; + + case VM_REG_GUEST_DR7: + state->dr7 = val; + svm_set_dirty(sc, vcpu, VMCB_CACHE_DR); + break; + + case VM_REG_GUEST_EFER: + /* EFER_SVM must always be set when the guest is executing */ + state->efer = val | EFER_SVM; + svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); + break; + + case VM_REG_GUEST_RAX: + state->rax = val; + break; + + case VM_REG_GUEST_RFLAGS: + state->rflags = val; + break; + + case VM_REG_GUEST_RIP: + state->rip = val; + break; + + case VM_REG_GUEST_RSP: + state->rsp = val; + break; + + case VM_REG_GUEST_CS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_SS: + dirtyseg = 1; /* FALLTHROUGH */ + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_LDTR: + case VM_REG_GUEST_TR: + seg = vmcb_segptr(vmcb, ident); + KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB", + __func__, ident)); + seg->selector = val; + if (dirtyseg) + svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG); + break; + + case VM_REG_GUEST_GDTR: + case VM_REG_GUEST_IDTR: + /* GDTR and IDTR don't have segment selectors */ + err = EINVAL; + break; + default: + err = EINVAL; + break; + } + + return (err); +} + +int +vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg2) +{ + struct vmcb_segment *seg; + + seg = vmcb_segptr(vmcb, ident); + if (seg != NULL) { + bcopy(seg, seg2, sizeof(struct vmcb_segment)); + return (0); + } else { + return (EINVAL); + } +} + +int +vmcb_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + struct vmcb *vmcb; + struct svm_softc *sc; + struct vmcb_segment *seg; + uint16_t attrib; + + sc = arg; + vmcb = svm_get_vmcb(sc, vcpu); + + seg = vmcb_segptr(vmcb, reg); + KASSERT(seg != NULL, ("%s: invalid segment descriptor %d", + __func__, reg)); + + seg->base = desc->base; + seg->limit = desc->limit; + if (reg != VM_REG_GUEST_GDTR && reg != VM_REG_GUEST_IDTR) { + /* + * Map seg_desc access to VMCB attribute format. + * + * SVM uses the 'P' bit in the segment attributes to indicate a + * NULL segment so clear it if the segment is marked unusable. 
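+		 * The 16-bit 'access' value keeps the attribute bits in the
+		 * usual descriptor layout (type/S/DPL/P in bits 7:0 and
+		 * AVL, L, D/B and G in bits 12 through 15), while the VMCB
+		 * packs the same bits into a 12-bit field.  For example, a
+		 * flat 64-bit code segment with access 0xa09b ends up with
+		 * attrib 0xa9b.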
+ */ + attrib = ((desc->access & 0xF000) >> 4) | (desc->access & 0xFF); + if (SEG_DESC_UNUSABLE(desc->access)) { + attrib &= ~0x80; + } + seg->attrib = attrib; + } + + VCPU_CTR4(sc->vm, vcpu, "Setting desc %d: base (%#lx), limit (%#x), " + "attrib (%#x)", reg, seg->base, seg->limit, seg->attrib); + + switch (reg) { + case VM_REG_GUEST_CS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_SS: + svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG); + break; + case VM_REG_GUEST_GDTR: + case VM_REG_GUEST_IDTR: + svm_set_dirty(sc, vcpu, VMCB_CACHE_DT); + break; + default: + break; + } + + return (0); +} + +int +vmcb_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + struct vmcb *vmcb; + struct svm_softc *sc; + struct vmcb_segment *seg; + + sc = arg; + vmcb = svm_get_vmcb(sc, vcpu); + seg = vmcb_segptr(vmcb, reg); + KASSERT(seg != NULL, ("%s: invalid segment descriptor %d", + __func__, reg)); + + desc->base = seg->base; + desc->limit = seg->limit; + desc->access = 0; + + if (reg != VM_REG_GUEST_GDTR && reg != VM_REG_GUEST_IDTR) { + /* Map seg_desc access to VMCB attribute format */ + desc->access = ((seg->attrib & 0xF00) << 4) | + (seg->attrib & 0xFF); + + /* + * VT-x uses bit 16 to indicate a segment that has been loaded + * with a NULL selector (aka unusable). The 'desc->access' + * field is interpreted in the VT-x format by the + * processor-independent code. + * + * SVM uses the 'P' bit to convey the same information so + * convert it into the VT-x format. For more details refer to + * section "Segment State in the VMCB" in APMv2. + */ + if (reg != VM_REG_GUEST_CS && reg != VM_REG_GUEST_TR) { + if ((desc->access & 0x80) == 0) + desc->access |= 0x10000; /* Unusable segment */ + } + } + + return (0); +} diff --git a/usr/src/uts/i86pc/io/vmm/amd/vmcb.h b/usr/src/uts/i86pc/io/vmm/amd/vmcb.h new file mode 100644 index 0000000000..ec7caa91f9 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/vmcb.h @@ -0,0 +1,336 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMCB_H_ +#define _VMCB_H_ + +struct svm_softc; + +#define BIT(n) (1ULL << n) + +/* + * Secure Virtual Machine: AMD64 Programmer's Manual Vol2, Chapter 15 + * Layout of VMCB: AMD64 Programmer's Manual Vol2, Appendix B + */ + +/* vmcb_ctrl->intercept[] array indices */ +#define VMCB_CR_INTCPT 0 +#define VMCB_DR_INTCPT 1 +#define VMCB_EXC_INTCPT 2 +#define VMCB_CTRL1_INTCPT 3 +#define VMCB_CTRL2_INTCPT 4 + +/* intercept[VMCB_CTRL1_INTCPT] fields */ +#define VMCB_INTCPT_INTR BIT(0) +#define VMCB_INTCPT_NMI BIT(1) +#define VMCB_INTCPT_SMI BIT(2) +#define VMCB_INTCPT_INIT BIT(3) +#define VMCB_INTCPT_VINTR BIT(4) +#define VMCB_INTCPT_CR0_WRITE BIT(5) +#define VMCB_INTCPT_IDTR_READ BIT(6) +#define VMCB_INTCPT_GDTR_READ BIT(7) +#define VMCB_INTCPT_LDTR_READ BIT(8) +#define VMCB_INTCPT_TR_READ BIT(9) +#define VMCB_INTCPT_IDTR_WRITE BIT(10) +#define VMCB_INTCPT_GDTR_WRITE BIT(11) +#define VMCB_INTCPT_LDTR_WRITE BIT(12) +#define VMCB_INTCPT_TR_WRITE BIT(13) +#define VMCB_INTCPT_RDTSC BIT(14) +#define VMCB_INTCPT_RDPMC BIT(15) +#define VMCB_INTCPT_PUSHF BIT(16) +#define VMCB_INTCPT_POPF BIT(17) +#define VMCB_INTCPT_CPUID BIT(18) +#define VMCB_INTCPT_RSM BIT(19) +#define VMCB_INTCPT_IRET BIT(20) +#define VMCB_INTCPT_INTn BIT(21) +#define VMCB_INTCPT_INVD BIT(22) +#define VMCB_INTCPT_PAUSE BIT(23) +#define VMCB_INTCPT_HLT BIT(24) +#define VMCB_INTCPT_INVPG BIT(25) +#define VMCB_INTCPT_INVPGA BIT(26) +#define VMCB_INTCPT_IO BIT(27) +#define VMCB_INTCPT_MSR BIT(28) +#define VMCB_INTCPT_TASK_SWITCH BIT(29) +#define VMCB_INTCPT_FERR_FREEZE BIT(30) +#define VMCB_INTCPT_SHUTDOWN BIT(31) + +/* intercept[VMCB_CTRL2_INTCPT] fields */ +#define VMCB_INTCPT_VMRUN BIT(0) +#define VMCB_INTCPT_VMMCALL BIT(1) +#define VMCB_INTCPT_VMLOAD BIT(2) +#define VMCB_INTCPT_VMSAVE BIT(3) +#define VMCB_INTCPT_STGI BIT(4) +#define VMCB_INTCPT_CLGI BIT(5) +#define VMCB_INTCPT_SKINIT BIT(6) +#define VMCB_INTCPT_RDTSCP BIT(7) +#define VMCB_INTCPT_ICEBP BIT(8) +#define VMCB_INTCPT_WBINVD BIT(9) +#define VMCB_INTCPT_MONITOR BIT(10) +#define VMCB_INTCPT_MWAIT BIT(11) +#define VMCB_INTCPT_MWAIT_ARMED BIT(12) +#define VMCB_INTCPT_XSETBV BIT(13) + +/* VMCB TLB control */ +#define VMCB_TLB_FLUSH_NOTHING 0 /* Flush nothing */ +#define VMCB_TLB_FLUSH_ALL 1 /* Flush entire TLB */ +#define VMCB_TLB_FLUSH_GUEST 3 /* Flush all guest entries */ +#define VMCB_TLB_FLUSH_GUEST_NONGLOBAL 7 /* Flush guest non-PG entries */ + +/* VMCB state caching */ +#define VMCB_CACHE_NONE 0 /* No caching */ +#define VMCB_CACHE_I BIT(0) /* Intercept, TSC off, Pause filter */ +#define VMCB_CACHE_IOPM BIT(1) /* I/O and MSR permission */ +#define VMCB_CACHE_ASID BIT(2) /* ASID */ +#define VMCB_CACHE_TPR BIT(3) /* V_TPR to V_INTR_VECTOR */ +#define VMCB_CACHE_NP BIT(4) /* Nested Paging */ +#define VMCB_CACHE_CR BIT(5) /* CR0, CR3, CR4 & EFER */ +#define VMCB_CACHE_DR BIT(6) /* Debug registers */ +#define VMCB_CACHE_DT BIT(7) /* GDT/IDT */ +#define VMCB_CACHE_SEG BIT(8) /* User segments, CPL */ +#define VMCB_CACHE_CR2 BIT(9) /* page fault address */ +#define VMCB_CACHE_LBR BIT(10) /* Last branch */ + +/* VMCB control event injection */ +#define VMCB_EVENTINJ_EC_VALID BIT(11) /* Error Code valid */ +#define VMCB_EVENTINJ_VALID BIT(31) /* Event valid */ + +/* Event types that can be injected */ +#define VMCB_EVENTINJ_TYPE_INTR 0 +#define VMCB_EVENTINJ_TYPE_NMI 2 +#define VMCB_EVENTINJ_TYPE_EXCEPTION 3 +#define VMCB_EVENTINJ_TYPE_INTn 4 + +/* VMCB exit code, APM vol2 Appendix C */ +#define VMCB_EXIT_MC 0x52 +#define VMCB_EXIT_INTR 0x60 
+#define VMCB_EXIT_NMI 0x61 +#define VMCB_EXIT_VINTR 0x64 +#define VMCB_EXIT_PUSHF 0x70 +#define VMCB_EXIT_POPF 0x71 +#define VMCB_EXIT_CPUID 0x72 +#define VMCB_EXIT_IRET 0x74 +#define VMCB_EXIT_PAUSE 0x77 +#define VMCB_EXIT_HLT 0x78 +#define VMCB_EXIT_IO 0x7B +#define VMCB_EXIT_MSR 0x7C +#define VMCB_EXIT_SHUTDOWN 0x7F +#define VMCB_EXIT_VMSAVE 0x83 +#define VMCB_EXIT_MONITOR 0x8A +#define VMCB_EXIT_MWAIT 0x8B +#define VMCB_EXIT_NPF 0x400 +#define VMCB_EXIT_INVALID -1 + +/* + * Nested page fault. + * Bit definitions to decode EXITINFO1. + */ +#define VMCB_NPF_INFO1_P BIT(0) /* Nested page present. */ +#define VMCB_NPF_INFO1_W BIT(1) /* Access was write. */ +#define VMCB_NPF_INFO1_U BIT(2) /* Access was user access. */ +#define VMCB_NPF_INFO1_RSV BIT(3) /* Reserved bits present. */ +#define VMCB_NPF_INFO1_ID BIT(4) /* Code read. */ + +#define VMCB_NPF_INFO1_GPA BIT(32) /* Guest physical address. */ +#define VMCB_NPF_INFO1_GPT BIT(33) /* Guest page table. */ + +/* + * EXITINTINFO, Interrupt exit info for all intrecepts. + * Section 15.7.2, Intercepts during IDT Interrupt Delivery. + */ +#define VMCB_EXITINTINFO_VECTOR(x) ((x) & 0xFF) +#define VMCB_EXITINTINFO_TYPE(x) (((x) >> 8) & 0x7) +#define VMCB_EXITINTINFO_EC_VALID(x) (((x) & BIT(11)) ? 1 : 0) +#define VMCB_EXITINTINFO_VALID(x) (((x) & BIT(31)) ? 1 : 0) +#define VMCB_EXITINTINFO_EC(x) (((x) >> 32) & 0xFFFFFFFF) + +/* Offset of various VMCB fields. */ +#define VMCB_OFF_CTRL(x) (x) +#define VMCB_OFF_STATE(x) ((x) + 0x400) + +#define VMCB_OFF_CR_INTERCEPT VMCB_OFF_CTRL(0x0) +#define VMCB_OFF_DR_INTERCEPT VMCB_OFF_CTRL(0x4) +#define VMCB_OFF_EXC_INTERCEPT VMCB_OFF_CTRL(0x8) +#define VMCB_OFF_INST1_INTERCEPT VMCB_OFF_CTRL(0xC) +#define VMCB_OFF_INST2_INTERCEPT VMCB_OFF_CTRL(0x10) +#define VMCB_OFF_IO_PERM VMCB_OFF_CTRL(0x40) +#define VMCB_OFF_MSR_PERM VMCB_OFF_CTRL(0x48) +#define VMCB_OFF_TSC_OFFSET VMCB_OFF_CTRL(0x50) +#define VMCB_OFF_ASID VMCB_OFF_CTRL(0x58) +#define VMCB_OFF_TLB_CTRL VMCB_OFF_CTRL(0x5C) +#define VMCB_OFF_VIRQ VMCB_OFF_CTRL(0x60) +#define VMCB_OFF_EXIT_REASON VMCB_OFF_CTRL(0x70) +#define VMCB_OFF_EXITINFO1 VMCB_OFF_CTRL(0x78) +#define VMCB_OFF_EXITINFO2 VMCB_OFF_CTRL(0x80) +#define VMCB_OFF_EXITINTINFO VMCB_OFF_CTRL(0x88) +#define VMCB_OFF_AVIC_BAR VMCB_OFF_CTRL(0x98) +#define VMCB_OFF_NPT_BASE VMCB_OFF_CTRL(0xB0) +#define VMCB_OFF_AVIC_PAGE VMCB_OFF_CTRL(0xE0) +#define VMCB_OFF_AVIC_LT VMCB_OFF_CTRL(0xF0) +#define VMCB_OFF_AVIC_PT VMCB_OFF_CTRL(0xF8) +#define VMCB_OFF_SYSENTER_CS VMCB_OFF_STATE(0x228) +#define VMCB_OFF_SYSENTER_ESP VMCB_OFF_STATE(0x230) +#define VMCB_OFF_SYSENTER_EIP VMCB_OFF_STATE(0x238) +#define VMCB_OFF_GUEST_PAT VMCB_OFF_STATE(0x268) + +/* + * Encode the VMCB offset and bytes that we want to read from VMCB. + */ +#define VMCB_ACCESS(o, w) (0x80000000 | (((w) & 0xF) << 16) | \ + ((o) & 0xFFF)) +#define VMCB_ACCESS_OK(v) ((v) & 0x80000000 ) +#define VMCB_ACCESS_BYTES(v) (((v) >> 16) & 0xF) +#define VMCB_ACCESS_OFFSET(v) ((v) & 0xFFF) + +#ifdef _KERNEL +/* VMCB save state area segment format */ +struct vmcb_segment { + uint16_t selector; + uint16_t attrib; + uint32_t limit; + uint64_t base; +} __attribute__ ((__packed__)); +CTASSERT(sizeof(struct vmcb_segment) == 16); + +/* Code segment descriptor attribute in 12 bit format as saved by VMCB. */ +#define VMCB_CS_ATTRIB_L BIT(9) /* Long mode. */ +#define VMCB_CS_ATTRIB_D BIT(10) /* OPerand size bit. 
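+ * In the packed 12-bit attribute format bits 3:0 hold the segment type,
+ * bit 4 is S, bits 6:5 the DPL, bit 7 P, bit 8 AVL and bit 11 the
+ * granularity bit; vmcb_setdesc()/vmcb_getdesc() convert between this
+ * layout and the 16-bit access-rights format used by struct seg_desc.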
*/ + +/* + * The VMCB is divided into two areas - the first one contains various + * control bits including the intercept vector and the second one contains + * the guest state. + */ + +/* VMCB control area - padded up to 1024 bytes */ +struct vmcb_ctrl { + uint32_t intercept[5]; /* all intercepts */ + uint8_t pad1[0x28]; /* Offsets 0x14-0x3B are reserved. */ + uint16_t pause_filthresh; /* Offset 0x3C, PAUSE filter threshold */ + uint16_t pause_filcnt; /* Offset 0x3E, PAUSE filter count */ + uint64_t iopm_base_pa; /* 0x40: IOPM_BASE_PA */ + uint64_t msrpm_base_pa; /* 0x48: MSRPM_BASE_PA */ + uint64_t tsc_offset; /* 0x50: TSC_OFFSET */ + uint32_t asid; /* 0x58: Guest ASID */ + uint8_t tlb_ctrl; /* 0x5C: TLB_CONTROL */ + uint8_t pad2[3]; /* 0x5D-0x5F: Reserved. */ + uint8_t v_tpr; /* 0x60: V_TPR, guest CR8 */ + uint8_t v_irq:1; /* Is virtual interrupt pending? */ + uint8_t :7; /* Padding */ + uint8_t v_intr_prio:4; /* 0x62: Priority for virtual interrupt. */ + uint8_t v_ign_tpr:1; + uint8_t :3; + uint8_t v_intr_masking:1; /* Guest and host sharing of RFLAGS. */ + uint8_t :7; + uint8_t v_intr_vector; /* 0x64: Vector for virtual interrupt. */ + uint8_t pad3[3]; /* 0x65-0x67 Reserved. */ + uint64_t intr_shadow:1; /* 0x68: Interrupt shadow, section15.2.1 APM2 */ + uint64_t :63; + uint64_t exitcode; /* 0x70, Exitcode */ + uint64_t exitinfo1; /* 0x78, EXITINFO1 */ + uint64_t exitinfo2; /* 0x80, EXITINFO2 */ + uint64_t exitintinfo; /* 0x88, Interrupt exit value. */ + uint64_t np_enable:1; /* 0x90, Nested paging enable. */ + uint64_t :63; + uint8_t pad4[0x10]; /* 0x98-0xA7 reserved. */ + uint64_t eventinj; /* 0xA8, Event injection. */ + uint64_t n_cr3; /* B0, Nested page table. */ + uint64_t lbr_virt_en:1; /* Enable LBR virtualization. */ + uint64_t :63; + uint32_t vmcb_clean; /* 0xC0: VMCB clean bits for caching */ + uint32_t :32; /* 0xC4: Reserved */ + uint64_t nrip; /* 0xC8: Guest next nRIP. 
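+ * For instruction intercepts on processors with the next-RIP-save
+ * feature, hardware writes the address of the following instruction
+ * here on #VMEXIT, letting the host advance the guest %rip past the
+ * intercepted instruction without decoding it.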
*/ + uint8_t inst_len; /* 0xD0: #NPF decode assist */ + uint8_t inst_bytes[15]; + uint8_t padd6[0x320]; +} __attribute__ ((__packed__)); +CTASSERT(sizeof(struct vmcb_ctrl) == 1024); + +struct vmcb_state { + struct vmcb_segment es; + struct vmcb_segment cs; + struct vmcb_segment ss; + struct vmcb_segment ds; + struct vmcb_segment fs; + struct vmcb_segment gs; + struct vmcb_segment gdt; + struct vmcb_segment ldt; + struct vmcb_segment idt; + struct vmcb_segment tr; + uint8_t pad1[0x2b]; /* Reserved: 0xA0-0xCA */ + uint8_t cpl; + uint8_t pad2[4]; + uint64_t efer; + uint8_t pad3[0x70]; /* Reserved: 0xd8-0x147 */ + uint64_t cr4; + uint64_t cr3; /* Guest CR3 */ + uint64_t cr0; + uint64_t dr7; + uint64_t dr6; + uint64_t rflags; + uint64_t rip; + uint8_t pad4[0x58]; /* Reserved: 0x180-0x1D7 */ + uint64_t rsp; + uint8_t pad5[0x18]; /* Reserved 0x1E0-0x1F7 */ + uint64_t rax; + uint64_t star; + uint64_t lstar; + uint64_t cstar; + uint64_t sfmask; + uint64_t kernelgsbase; + uint64_t sysenter_cs; + uint64_t sysenter_esp; + uint64_t sysenter_eip; + uint64_t cr2; + uint8_t pad6[0x20]; + uint64_t g_pat; + uint64_t dbgctl; + uint64_t br_from; + uint64_t br_to; + uint64_t int_from; + uint64_t int_to; + uint8_t pad7[0x968]; /* Reserved up to end of VMCB */ +} __attribute__ ((__packed__)); +CTASSERT(sizeof(struct vmcb_state) == 0xC00); + +struct vmcb { + struct vmcb_ctrl ctrl; + struct vmcb_state state; +} __attribute__ ((__packed__)); +CTASSERT(sizeof(struct vmcb) == PAGE_SIZE); +CTASSERT(offsetof(struct vmcb, state) == 0x400); + +int vmcb_read(struct svm_softc *sc, int vcpu, int ident, uint64_t *retval); +int vmcb_write(struct svm_softc *sc, int vcpu, int ident, uint64_t val); +int vmcb_setdesc(void *arg, int vcpu, int ident, struct seg_desc *desc); +int vmcb_getdesc(void *arg, int vcpu, int ident, struct seg_desc *desc); +int vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg); + +#endif /* _KERNEL */ +#endif /* _VMCB_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/intel/ept.c b/usr/src/uts/i86pc/io/vmm/intel/ept.c new file mode 100644 index 0000000000..4915537b0a --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/ept.c @@ -0,0 +1,230 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#ifndef __FreeBSD__ +#include <sys/hma.h> +#endif + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_extern.h> + +#include <machine/vmm.h> + +#include "vmx_cpufunc.h" +#include "ept.h" + +#define EPT_SUPPORTS_EXEC_ONLY(cap) ((cap) & (1UL << 0)) +#define EPT_PWL4(cap) ((cap) & (1UL << 6)) +#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14)) +#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */ +#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */ +#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20)) +#define AD_BITS_SUPPORTED(cap) ((cap) & (1UL << 21)) +#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32)) + +#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL +#define INVVPID_ALL_TYPES_SUPPORTED(cap) \ + (((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK) + +#define INVEPT_ALL_TYPES_MASK 0x6000000UL +#define INVEPT_ALL_TYPES_SUPPORTED(cap) \ + (((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK) + +#define EPT_PWLEVELS 4 /* page walk levels */ +#define EPT_ENABLE_AD_BITS (1 << 6) + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, ept, CTLFLAG_RW, NULL, NULL); + +static int ept_enable_ad_bits; + +static int ept_pmap_flags; +SYSCTL_INT(_hw_vmm_ept, OID_AUTO, pmap_flags, CTLFLAG_RD, + &ept_pmap_flags, 0, NULL); + +int +ept_init(int ipinum) +{ + int use_hw_ad_bits, use_superpages, use_exec_only; + uint64_t cap; + + cap = rdmsr(MSR_VMX_EPT_VPID_CAP); + + /* + * Verify that: + * - page walk length is 4 steps + * - extended page tables can be laid out in write-back memory + * - invvpid instruction with all possible types is supported + * - invept instruction with all possible types is supported + */ + if (!EPT_PWL4(cap) || + !EPT_MEMORY_TYPE_WB(cap) || + !INVVPID_SUPPORTED(cap) || + !INVVPID_ALL_TYPES_SUPPORTED(cap) || + !INVEPT_SUPPORTED(cap) || + !INVEPT_ALL_TYPES_SUPPORTED(cap)) + return (EINVAL); + + ept_pmap_flags = ipinum & PMAP_NESTED_IPIMASK; + + use_superpages = 1; + TUNABLE_INT_FETCH("hw.vmm.ept.use_superpages", &use_superpages); + if (use_superpages && EPT_PDE_SUPERPAGE(cap)) + ept_pmap_flags |= PMAP_PDE_SUPERPAGE; /* 2MB superpage */ + + use_hw_ad_bits = 1; + TUNABLE_INT_FETCH("hw.vmm.ept.use_hw_ad_bits", &use_hw_ad_bits); + if (use_hw_ad_bits && AD_BITS_SUPPORTED(cap)) + ept_enable_ad_bits = 1; + else + ept_pmap_flags |= 
PMAP_EMULATE_AD_BITS; + + use_exec_only = 1; + TUNABLE_INT_FETCH("hw.vmm.ept.use_exec_only", &use_exec_only); + if (use_exec_only && EPT_SUPPORTS_EXEC_ONLY(cap)) + ept_pmap_flags |= PMAP_SUPPORTS_EXEC_ONLY; + + return (0); +} + +#if 0 +static void +ept_dump(uint64_t *ptp, int nlevels) +{ + int i, t, tabs; + uint64_t *ptpnext, ptpval; + + if (--nlevels < 0) + return; + + tabs = 3 - nlevels; + for (t = 0; t < tabs; t++) + printf("\t"); + printf("PTP = %p\n", ptp); + + for (i = 0; i < 512; i++) { + ptpval = ptp[i]; + + if (ptpval == 0) + continue; + + for (t = 0; t < tabs; t++) + printf("\t"); + printf("%3d 0x%016lx\n", i, ptpval); + + if (nlevels != 0 && (ptpval & EPT_PG_SUPERPAGE) == 0) { + ptpnext = (uint64_t *) + PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK); + ept_dump(ptpnext, nlevels); + } + } +} +#endif + +#ifdef __FreeBSD__ +static void +invept_single_context(void *arg) +{ + struct invept_desc desc = *(struct invept_desc *)arg; + + invept(INVEPT_TYPE_SINGLE_CONTEXT, desc); +} + +void +ept_invalidate_mappings(u_long eptp) +{ + struct invept_desc invept_desc = { 0 }; + + invept_desc.eptp = eptp; + + smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc); +} +#else /* __FreeBSD__ */ +void +ept_invalidate_mappings(u_long eptp) +{ + hma_vmx_invept_allcpus((uintptr_t)eptp); +} +#endif /* __FreeBSD__ */ + +static int +ept_pinit(pmap_t pmap) +{ + + return (pmap_pinit_type(pmap, PT_EPT, ept_pmap_flags)); +} + +struct vmspace * +ept_vmspace_alloc(vm_offset_t min, vm_offset_t max) +{ + + return (vmspace_alloc(min, max, ept_pinit)); +} + +void +ept_vmspace_free(struct vmspace *vmspace) +{ + + vmspace_free(vmspace); +} + +uint64_t +eptp(uint64_t pml4) +{ + uint64_t eptp_val; + + eptp_val = pml4 | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK; + if (ept_enable_ad_bits) + eptp_val |= EPT_ENABLE_AD_BITS; + + return (eptp_val); +} diff --git a/usr/src/uts/i86pc/io/vmm/intel/ept.h b/usr/src/uts/i86pc/io/vmm/intel/ept.h new file mode 100644 index 0000000000..4a029e8b22 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/ept.h @@ -0,0 +1,41 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _EPT_H_ +#define _EPT_H_ + +struct vmx; + +int ept_init(int ipinum); +void ept_invalidate_mappings(u_long eptp); +struct vmspace *ept_vmspace_alloc(vm_offset_t min, vm_offset_t max); +void ept_vmspace_free(struct vmspace *vmspace); +uint64_t eptp(uint64_t pml4); +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/offsets.in b/usr/src/uts/i86pc/io/vmm/intel/offsets.in new file mode 100644 index 0000000000..d60a2d8f5f --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/offsets.in @@ -0,0 +1,62 @@ +/* + * COPYRIGHT 2014 Pluribus Networks Inc. + * + * All rights reserved. This copyright notice is Copyright Management + * Information under 17 USC 1202 and is included to protect this work and + * deter copyright infringement. Removal or alteration of this Copyright + * Management Information without the express written permission from + * Pluribus Networks Inc is prohibited, and any such unauthorized removal + * or alteration will be a violation of federal law. + */ +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/cpuvar.h> + +#include <machine/pmap.h> +#include <machine/vmm.h> + +#include "intel/vmx_cpufunc.h" +#include "intel/vmx.h" +#include "vm/vm_glue.h" + +vmxctx + guest_rdi VMXCTX_GUEST_RDI + guest_rsi VMXCTX_GUEST_RSI + guest_rdx VMXCTX_GUEST_RDX + guest_rcx VMXCTX_GUEST_RCX + guest_r8 VMXCTX_GUEST_R8 + guest_r9 VMXCTX_GUEST_R9 + guest_rax VMXCTX_GUEST_RAX + guest_rbx VMXCTX_GUEST_RBX + guest_rbp VMXCTX_GUEST_RBP + guest_r10 VMXCTX_GUEST_R10 + guest_r11 VMXCTX_GUEST_R11 + guest_r12 VMXCTX_GUEST_R12 + guest_r13 VMXCTX_GUEST_R13 + guest_r14 VMXCTX_GUEST_R14 + guest_r15 VMXCTX_GUEST_R15 + guest_cr2 VMXCTX_GUEST_CR2 + inst_fail_status VMXCTX_INST_FAIL_STATUS + pmap VMXCTX_PMAP + +vmx + eptgen VMX_EPTGEN + eptp VMX_EPTP + +pmap + pm_active PM_ACTIVE + pm_eptgen PM_EPTGEN + +cpu + cpu_id + +\#define VM_SUCCESS 0 +\#define VM_FAIL_INVALID 1 +\#define VM_FAIL_VALID 2 + +\#define VMX_GUEST_VMEXIT 0 +\#define VMX_VMRESUME_ERROR 1 +\#define VMX_VMLAUNCH_ERROR 2 +\#define VMX_INVEPT_ERROR 3 +\#define VMX_VMWRITE_ERROR 4 diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmcs.c b/usr/src/uts/i86pc/io/vmm/intel/vmcs.c new file mode 100644 index 0000000000..d19f6bc262 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmcs.c @@ -0,0 +1,560 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. + */ + +#ifdef __FreeBSD__ +#include "opt_ddb.h" +#endif + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/sysctl.h> +#include <sys/systm.h> +#include <sys/pcpu.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/segments.h> +#include <machine/vmm.h> +#include "vmm_host.h" +#include "vmx_cpufunc.h" +#include "vmcs.h" +#include "ept.h" +#include "vmx.h" + +#ifdef DDB +#include <ddb/ddb.h> +#endif + +SYSCTL_DECL(_hw_vmm_vmx); + +static int no_flush_rsb; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, no_flush_rsb, CTLFLAG_RW, + &no_flush_rsb, 0, "Do not flush RSB upon vmexit"); + +static uint64_t +vmcs_fix_regval(uint32_t encoding, uint64_t val) +{ + + switch (encoding) { + case VMCS_GUEST_CR0: + val = vmx_fix_cr0(val); + break; + case VMCS_GUEST_CR4: + val = vmx_fix_cr4(val); + break; + default: + break; + } + return (val); +} + +static uint32_t +vmcs_field_encoding(int ident) +{ + switch (ident) { + case VM_REG_GUEST_CR0: + return (VMCS_GUEST_CR0); + case VM_REG_GUEST_CR3: + return (VMCS_GUEST_CR3); + case VM_REG_GUEST_CR4: + return (VMCS_GUEST_CR4); + case VM_REG_GUEST_DR7: + return (VMCS_GUEST_DR7); + case VM_REG_GUEST_RSP: + return (VMCS_GUEST_RSP); + case VM_REG_GUEST_RIP: + return (VMCS_GUEST_RIP); + case VM_REG_GUEST_RFLAGS: + return (VMCS_GUEST_RFLAGS); + case VM_REG_GUEST_ES: + return (VMCS_GUEST_ES_SELECTOR); + case VM_REG_GUEST_CS: + return (VMCS_GUEST_CS_SELECTOR); + case VM_REG_GUEST_SS: + return (VMCS_GUEST_SS_SELECTOR); + case VM_REG_GUEST_DS: + return (VMCS_GUEST_DS_SELECTOR); + case VM_REG_GUEST_FS: + return (VMCS_GUEST_FS_SELECTOR); + case VM_REG_GUEST_GS: + return (VMCS_GUEST_GS_SELECTOR); + case VM_REG_GUEST_TR: + return (VMCS_GUEST_TR_SELECTOR); + case VM_REG_GUEST_LDTR: + return (VMCS_GUEST_LDTR_SELECTOR); + case VM_REG_GUEST_EFER: + return (VMCS_GUEST_IA32_EFER); + case VM_REG_GUEST_PDPTE0: + return (VMCS_GUEST_PDPTE0); + case VM_REG_GUEST_PDPTE1: + return (VMCS_GUEST_PDPTE1); + case VM_REG_GUEST_PDPTE2: + return (VMCS_GUEST_PDPTE2); + case VM_REG_GUEST_PDPTE3: + return (VMCS_GUEST_PDPTE3); + default: + return (-1); + } + +} + +static int +vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc) +{ + + switch (seg) { + case VM_REG_GUEST_ES: + *base = VMCS_GUEST_ES_BASE; + *lim = VMCS_GUEST_ES_LIMIT; + *acc = VMCS_GUEST_ES_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_CS: + *base = VMCS_GUEST_CS_BASE; + *lim = VMCS_GUEST_CS_LIMIT; + *acc = VMCS_GUEST_CS_ACCESS_RIGHTS; + 
break; + case VM_REG_GUEST_SS: + *base = VMCS_GUEST_SS_BASE; + *lim = VMCS_GUEST_SS_LIMIT; + *acc = VMCS_GUEST_SS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_DS: + *base = VMCS_GUEST_DS_BASE; + *lim = VMCS_GUEST_DS_LIMIT; + *acc = VMCS_GUEST_DS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_FS: + *base = VMCS_GUEST_FS_BASE; + *lim = VMCS_GUEST_FS_LIMIT; + *acc = VMCS_GUEST_FS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_GS: + *base = VMCS_GUEST_GS_BASE; + *lim = VMCS_GUEST_GS_LIMIT; + *acc = VMCS_GUEST_GS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_TR: + *base = VMCS_GUEST_TR_BASE; + *lim = VMCS_GUEST_TR_LIMIT; + *acc = VMCS_GUEST_TR_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_LDTR: + *base = VMCS_GUEST_LDTR_BASE; + *lim = VMCS_GUEST_LDTR_LIMIT; + *acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_IDTR: + *base = VMCS_GUEST_IDTR_BASE; + *lim = VMCS_GUEST_IDTR_LIMIT; + *acc = VMCS_INVALID_ENCODING; + break; + case VM_REG_GUEST_GDTR: + *base = VMCS_GUEST_GDTR_BASE; + *lim = VMCS_GUEST_GDTR_LIMIT; + *acc = VMCS_INVALID_ENCODING; + break; + default: + return (EINVAL); + } + + return (0); +} + +int +vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *retval) +{ + int error; + uint32_t encoding; + + /* + * If we need to get at vmx-specific state in the VMCS we can bypass + * the translation of 'ident' to 'encoding' by simply setting the + * sign bit. As it so happens the upper 16 bits are reserved (i.e + * set to 0) in the encodings for the VMCS so we are free to use the + * sign bit. + */ + if (ident < 0) + encoding = ident & 0x7fffffff; + else + encoding = vmcs_field_encoding(ident); + + if (encoding == (uint32_t)-1) + return (EINVAL); + + if (!running) + VMPTRLD(vmcs); + + error = vmread(encoding, retval); + + if (!running) + VMCLEAR(vmcs); + + return (error); +} + +int +vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val) +{ + int error; + uint32_t encoding; + + if (ident < 0) + encoding = ident & 0x7fffffff; + else + encoding = vmcs_field_encoding(ident); + + if (encoding == (uint32_t)-1) + return (EINVAL); + + val = vmcs_fix_regval(encoding, val); + + if (!running) + VMPTRLD(vmcs); + + error = vmwrite(encoding, val); + + if (!running) + VMCLEAR(vmcs); + + return (error); +} + +int +vmcs_setdesc(struct vmcs *vmcs, int running, int seg, struct seg_desc *desc) +{ + int error; + uint32_t base, limit, access; + + error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); + if (error != 0) + panic("vmcs_setdesc: invalid segment register %d", seg); + + if (!running) + VMPTRLD(vmcs); + if ((error = vmwrite(base, desc->base)) != 0) + goto done; + + if ((error = vmwrite(limit, desc->limit)) != 0) + goto done; + + if (access != VMCS_INVALID_ENCODING) { + if ((error = vmwrite(access, desc->access)) != 0) + goto done; + } +done: + if (!running) + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_getdesc(struct vmcs *vmcs, int running, int seg, struct seg_desc *desc) +{ + int error; + uint32_t base, limit, access; + uint64_t u64; + + error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); + if (error != 0) + panic("vmcs_getdesc: invalid segment register %d", seg); + + if (!running) + VMPTRLD(vmcs); + if ((error = vmread(base, &u64)) != 0) + goto done; + desc->base = u64; + + if ((error = vmread(limit, &u64)) != 0) + goto done; + desc->limit = u64; + + if (access != VMCS_INVALID_ENCODING) { + if ((error = vmread(access, &u64)) != 0) + goto done; + desc->access = u64; + } +done: + if (!running) + VMCLEAR(vmcs); + return (error); +} + +int 
+vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count) +{ + int error; + + VMPTRLD(vmcs); + + /* + * Guest MSRs are saved in the VM-exit MSR-store area. + * Guest MSRs are loaded from the VM-entry MSR-load area. + * Both areas point to the same location in memory. + */ + if ((error = vmwrite(VMCS_EXIT_MSR_STORE, g_area)) != 0) + goto done; + if ((error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count)) != 0) + goto done; + + if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area)) != 0) + goto done; + if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count)) != 0) + goto done; + + error = 0; +done: + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_init(struct vmcs *vmcs) +{ + int error, codesel, datasel, tsssel; + u_long cr0, cr4, efer; + uint64_t pat; +#ifdef __FreeBSD__ + uint64_t fsbase, idtrbase; +#endif + + codesel = vmm_get_host_codesel(); + datasel = vmm_get_host_datasel(); + tsssel = vmm_get_host_tsssel(); + + /* + * Make sure we have a "current" VMCS to work with. + */ + VMPTRLD(vmcs); + + /* Host state */ + + /* Initialize host IA32_PAT MSR */ + pat = vmm_get_host_pat(); + if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0) + goto done; + + /* Load the IA32_EFER MSR */ + efer = vmm_get_host_efer(); + if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0) + goto done; + + /* Load the control registers */ + + cr0 = vmm_get_host_cr0(); + if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0) + goto done; + + cr4 = vmm_get_host_cr4() | CR4_VMXE; + if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0) + goto done; + + /* Load the segment selectors */ + if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0) + goto done; + +#ifdef __FreeBSD__ + if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0) + goto done; +#else + if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, vmm_get_host_fssel())) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, vmm_get_host_gssel())) != 0) + goto done; +#endif + + if ((error = vmwrite(VMCS_HOST_TR_SELECTOR, tsssel)) != 0) + goto done; + +#ifdef __FreeBSD__ + /* + * Load the Base-Address for %fs and idtr. + * + * Note that we exclude %gs, tss and gdtr here because their base + * address is pcpu specific. + */ + fsbase = vmm_get_host_fsbase(); + if ((error = vmwrite(VMCS_HOST_FS_BASE, fsbase)) != 0) + goto done; + + idtrbase = vmm_get_host_idtrbase(); + if ((error = vmwrite(VMCS_HOST_IDTR_BASE, idtrbase)) != 0) + goto done; + +#else /* __FreeBSD__ */ + /* + * Configure host sysenter MSRs to be restored on VM exit. + * The thread-specific MSR_INTC_SEP_ESP value is loaded in vmx_run. 
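+	 * These host-state fields are reloaded by the processor on every
+	 * VM exit, so they must match what the rest of the kernel expects.
+	 * The CS and EIP values are static and can be programmed once here,
+	 * while the stack pointer differs per thread and is rewritten on
+	 * each entry to the guest.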
+ */ + if ((error = vmwrite(VMCS_HOST_IA32_SYSENTER_CS, KCS_SEL)) != 0) + goto done; + /* Natively defined as MSR_INTC_SEP_EIP */ + if ((error = vmwrite(VMCS_HOST_IA32_SYSENTER_EIP, + rdmsr(MSR_SYSENTER_EIP_MSR))) != 0) + goto done; + +#endif /* __FreeBSD__ */ + + /* instruction pointer */ + if (no_flush_rsb) { + if ((error = vmwrite(VMCS_HOST_RIP, + (u_long)vmx_exit_guest)) != 0) + goto done; + } else { + if ((error = vmwrite(VMCS_HOST_RIP, + (u_long)vmx_exit_guest_flush_rsb)) != 0) + goto done; + } + + /* link pointer */ + if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0) + goto done; +done: + VMCLEAR(vmcs); + return (error); +} + +#ifdef DDB +extern int vmxon_enabled[]; + +DB_SHOW_COMMAND(vmcs, db_show_vmcs) +{ + uint64_t cur_vmcs, val; + uint32_t exit; + + if (!vmxon_enabled[curcpu]) { + db_printf("VMX not enabled\n"); + return; + } + + if (have_addr) { + db_printf("Only current VMCS supported\n"); + return; + } + + vmptrst(&cur_vmcs); + if (cur_vmcs == VMCS_INITIAL) { + db_printf("No current VM context\n"); + return; + } + db_printf("VMCS: %jx\n", cur_vmcs); + db_printf("VPID: %lu\n", vmcs_read(VMCS_VPID)); + db_printf("Activity: "); + val = vmcs_read(VMCS_GUEST_ACTIVITY); + switch (val) { + case 0: + db_printf("Active"); + break; + case 1: + db_printf("HLT"); + break; + case 2: + db_printf("Shutdown"); + break; + case 3: + db_printf("Wait for SIPI"); + break; + default: + db_printf("Unknown: %#lx", val); + } + db_printf("\n"); + exit = vmcs_read(VMCS_EXIT_REASON); + if (exit & 0x80000000) + db_printf("Entry Failure Reason: %u\n", exit & 0xffff); + else + db_printf("Exit Reason: %u\n", exit & 0xffff); + db_printf("Qualification: %#lx\n", vmcs_exit_qualification()); + db_printf("Guest Linear Address: %#lx\n", + vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)); + switch (exit & 0x8000ffff) { + case EXIT_REASON_EXCEPTION: + case EXIT_REASON_EXT_INTR: + val = vmcs_read(VMCS_EXIT_INTR_INFO); + db_printf("Interrupt Type: "); + switch (val >> 8 & 0x7) { + case 0: + db_printf("external"); + break; + case 2: + db_printf("NMI"); + break; + case 3: + db_printf("HW exception"); + break; + case 4: + db_printf("SW exception"); + break; + default: + db_printf("?? %lu", val >> 8 & 0x7); + break; + } + db_printf(" Vector: %lu", val & 0xff); + if (val & 0x800) + db_printf(" Error Code: %lx", + vmcs_read(VMCS_EXIT_INTR_ERRCODE)); + db_printf("\n"); + break; + case EXIT_REASON_EPT_FAULT: + case EXIT_REASON_EPT_MISCONFIG: + db_printf("Guest Physical Address: %#lx\n", + vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)); + break; + } + db_printf("VM-instruction error: %#lx\n", vmcs_instruction_error()); +} +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmcs.h b/usr/src/uts/i86pc/io/vmm/intel/vmcs.h new file mode 100644 index 0000000000..edde5c6dd5 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmcs.h @@ -0,0 +1,492 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _VMCS_H_ +#define _VMCS_H_ + +#ifdef _KERNEL +#ifndef _ASM +struct vmcs { + uint32_t identifier; + uint32_t abort_code; + char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2]; +#ifndef __FreeBSD__ + /* + * Keep the physical address of the VMCS cached adjacent for the + * structure so it can be referenced in contexts which are too delicate + * for a call into the HAT. For the moment it means wasting a whole + * page on padding for the PA value to maintain alignment, but it + * allows the consumers of 'struct vmcs *' to easily access the value + * without a significant change to the interface. + */ + uint64_t vmcs_pa; + char _pa_pad[PAGE_SIZE - sizeof (vm_paddr_t)]; +#endif +}; +#ifdef __FreeBSD__ +CTASSERT(sizeof(struct vmcs) == PAGE_SIZE); +#else +CTASSERT(sizeof(struct vmcs) == (2*PAGE_SIZE)); +#endif + +/* MSR save region is composed of an array of 'struct msr_entry' */ +struct msr_entry { + uint32_t index; + uint32_t reserved; + uint64_t val; + +}; + +int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count); +int vmcs_init(struct vmcs *vmcs); +int vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *rv); +int vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val); +int vmcs_getdesc(struct vmcs *vmcs, int running, int ident, + struct seg_desc *desc); +int vmcs_setdesc(struct vmcs *vmcs, int running, int ident, + struct seg_desc *desc); + +/* + * Avoid header pollution caused by inline use of 'vtophys()' in vmx_cpufunc.h + */ +#ifdef _VMX_CPUFUNC_H_ +static __inline uint64_t +vmcs_read(uint32_t encoding) +{ + int error; + uint64_t val; + + error = vmread(encoding, &val); + KASSERT(error == 0, ("vmcs_read(%u) error %d", encoding, error)); + return (val); +} + +static __inline void +vmcs_write(uint32_t encoding, uint64_t val) +{ + int error; + + error = vmwrite(encoding, val); + KASSERT(error == 0, ("vmcs_write(%u) error %d", encoding, error)); +} + +#ifndef __FreeBSD__ +/* + * Due to header complexity combined with the need to cache the physical + * address for the VMCS, these must be defined here rather than vmx_cpufunc.h. 
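+ * Caching vmcs_pa in the structure lets vmclear()/vmptrld() operate on
+ * the physical address directly.  The upper-case VMPTRLD()/VMCLEAR()
+ * wrappers below additionally bracket the current-VMCS binding in a
+ * critical section (entered before vmptrld, exited after vmclear) so the
+ * thread is not preempted and migrated off-CPU while a VMCS is loaded.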
+ */ +static __inline int +vmclear(struct vmcs *vmcs) +{ + int error; + uint64_t addr = vmcs->vmcs_pa; + + __asm __volatile("vmclear %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + return (error); +} + +static __inline int +vmptrld(struct vmcs *vmcs) +{ + int error; + uint64_t addr = vmcs->vmcs_pa; + + __asm __volatile("vmptrld %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + return (error); +} + +static __inline void +VMCLEAR(struct vmcs *vmcs) +{ + int err; + + err = vmclear(vmcs); + if (err != 0) + panic("%s: vmclear(%p) error %d", __func__, vmcs, err); + + critical_exit(); +} + +static __inline void +VMPTRLD(struct vmcs *vmcs) +{ + int err; + + critical_enter(); + + err = vmptrld(vmcs); + if (err != 0) + panic("%s: vmptrld(%p) error %d", __func__, vmcs, err); +} +#endif /* __FreeBSD__ */ + +#endif /* _VMX_CPUFUNC_H_ */ + +#define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH) +#define vmcs_guest_rip() vmcs_read(VMCS_GUEST_RIP) +#define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR) +#define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff) +#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION) +#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3) +#define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS) +#define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS) +#define vmcs_idt_vectoring_info() vmcs_read(VMCS_IDT_VECTORING_INFO) +#define vmcs_idt_vectoring_err() vmcs_read(VMCS_IDT_VECTORING_ERROR) + +#endif /* _ASM */ +#endif /* _KERNEL */ + +#define VMCS_INITIAL 0xffffffffffffffff + +#define VMCS_IDENT(encoding) ((encoding) | 0x80000000) +/* + * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B. 
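+ * Each 32-bit encoding packs the field index into bits 9:1, the field
+ * type into bits 11:10 (0 control, 1 read-only data, 2 guest state,
+ * 3 host state) and the field width into bits 14:13 (0 16-bit, 1 64-bit,
+ * 2 32-bit, 3 natural width); bit 0 selects the high half of a 64-bit
+ * field.  VMCS_GUEST_RIP (0x681E), for instance, decodes as a
+ * natural-width guest-state field with index 15.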
+ */ +#define VMCS_INVALID_ENCODING 0xffffffff + +/* 16-bit control fields */ +#define VMCS_VPID 0x00000000 +#define VMCS_PIR_VECTOR 0x00000002 + +/* 16-bit guest-state fields */ +#define VMCS_GUEST_ES_SELECTOR 0x00000800 +#define VMCS_GUEST_CS_SELECTOR 0x00000802 +#define VMCS_GUEST_SS_SELECTOR 0x00000804 +#define VMCS_GUEST_DS_SELECTOR 0x00000806 +#define VMCS_GUEST_FS_SELECTOR 0x00000808 +#define VMCS_GUEST_GS_SELECTOR 0x0000080A +#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C +#define VMCS_GUEST_TR_SELECTOR 0x0000080E +#define VMCS_GUEST_INTR_STATUS 0x00000810 + +/* 16-bit host-state fields */ +#define VMCS_HOST_ES_SELECTOR 0x00000C00 +#define VMCS_HOST_CS_SELECTOR 0x00000C02 +#define VMCS_HOST_SS_SELECTOR 0x00000C04 +#define VMCS_HOST_DS_SELECTOR 0x00000C06 +#define VMCS_HOST_FS_SELECTOR 0x00000C08 +#define VMCS_HOST_GS_SELECTOR 0x00000C0A +#define VMCS_HOST_TR_SELECTOR 0x00000C0C + +/* 64-bit control fields */ +#define VMCS_IO_BITMAP_A 0x00002000 +#define VMCS_IO_BITMAP_B 0x00002002 +#define VMCS_MSR_BITMAP 0x00002004 +#define VMCS_EXIT_MSR_STORE 0x00002006 +#define VMCS_EXIT_MSR_LOAD 0x00002008 +#define VMCS_ENTRY_MSR_LOAD 0x0000200A +#define VMCS_EXECUTIVE_VMCS 0x0000200C +#define VMCS_TSC_OFFSET 0x00002010 +#define VMCS_VIRTUAL_APIC 0x00002012 +#define VMCS_APIC_ACCESS 0x00002014 +#define VMCS_PIR_DESC 0x00002016 +#define VMCS_EPTP 0x0000201A +#define VMCS_EOI_EXIT0 0x0000201C +#define VMCS_EOI_EXIT1 0x0000201E +#define VMCS_EOI_EXIT2 0x00002020 +#define VMCS_EOI_EXIT3 0x00002022 +#define VMCS_EOI_EXIT(vector) (VMCS_EOI_EXIT0 + ((vector) / 64) * 2) + +/* 64-bit read-only fields */ +#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 + +/* 64-bit guest-state fields */ +#define VMCS_LINK_POINTER 0x00002800 +#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802 +#define VMCS_GUEST_IA32_PAT 0x00002804 +#define VMCS_GUEST_IA32_EFER 0x00002806 +#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808 +#define VMCS_GUEST_PDPTE0 0x0000280A +#define VMCS_GUEST_PDPTE1 0x0000280C +#define VMCS_GUEST_PDPTE2 0x0000280E +#define VMCS_GUEST_PDPTE3 0x00002810 + +/* 64-bit host-state fields */ +#define VMCS_HOST_IA32_PAT 0x00002C00 +#define VMCS_HOST_IA32_EFER 0x00002C02 +#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04 + +/* 32-bit control fields */ +#define VMCS_PIN_BASED_CTLS 0x00004000 +#define VMCS_PRI_PROC_BASED_CTLS 0x00004002 +#define VMCS_EXCEPTION_BITMAP 0x00004004 +#define VMCS_PF_ERROR_MASK 0x00004006 +#define VMCS_PF_ERROR_MATCH 0x00004008 +#define VMCS_CR3_TARGET_COUNT 0x0000400A +#define VMCS_EXIT_CTLS 0x0000400C +#define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E +#define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010 +#define VMCS_ENTRY_CTLS 0x00004012 +#define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014 +#define VMCS_ENTRY_INTR_INFO 0x00004016 +#define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018 +#define VMCS_ENTRY_INST_LENGTH 0x0000401A +#define VMCS_TPR_THRESHOLD 0x0000401C +#define VMCS_SEC_PROC_BASED_CTLS 0x0000401E +#define VMCS_PLE_GAP 0x00004020 +#define VMCS_PLE_WINDOW 0x00004022 + +/* 32-bit read-only data fields */ +#define VMCS_INSTRUCTION_ERROR 0x00004400 +#define VMCS_EXIT_REASON 0x00004402 +#define VMCS_EXIT_INTR_INFO 0x00004404 +#define VMCS_EXIT_INTR_ERRCODE 0x00004406 +#define VMCS_IDT_VECTORING_INFO 0x00004408 +#define VMCS_IDT_VECTORING_ERROR 0x0000440A +#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C +#define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E + +/* 32-bit guest-state fields */ +#define VMCS_GUEST_ES_LIMIT 0x00004800 +#define VMCS_GUEST_CS_LIMIT 0x00004802 +#define VMCS_GUEST_SS_LIMIT 0x00004804 +#define 
VMCS_GUEST_DS_LIMIT 0x00004806 +#define VMCS_GUEST_FS_LIMIT 0x00004808 +#define VMCS_GUEST_GS_LIMIT 0x0000480A +#define VMCS_GUEST_LDTR_LIMIT 0x0000480C +#define VMCS_GUEST_TR_LIMIT 0x0000480E +#define VMCS_GUEST_GDTR_LIMIT 0x00004810 +#define VMCS_GUEST_IDTR_LIMIT 0x00004812 +#define VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814 +#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816 +#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818 +#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A +#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C +#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E +#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820 +#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822 +#define VMCS_GUEST_INTERRUPTIBILITY 0x00004824 +#define VMCS_GUEST_ACTIVITY 0x00004826 +#define VMCS_GUEST_SMBASE 0x00004828 +#define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A +#define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E + +/* 32-bit host state fields */ +#define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00 + +/* Natural Width control fields */ +#define VMCS_CR0_MASK 0x00006000 +#define VMCS_CR4_MASK 0x00006002 +#define VMCS_CR0_SHADOW 0x00006004 +#define VMCS_CR4_SHADOW 0x00006006 +#define VMCS_CR3_TARGET0 0x00006008 +#define VMCS_CR3_TARGET1 0x0000600A +#define VMCS_CR3_TARGET2 0x0000600C +#define VMCS_CR3_TARGET3 0x0000600E + +/* Natural Width read-only fields */ +#define VMCS_EXIT_QUALIFICATION 0x00006400 +#define VMCS_IO_RCX 0x00006402 +#define VMCS_IO_RSI 0x00006404 +#define VMCS_IO_RDI 0x00006406 +#define VMCS_IO_RIP 0x00006408 +#define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A + +/* Natural Width guest-state fields */ +#define VMCS_GUEST_CR0 0x00006800 +#define VMCS_GUEST_CR3 0x00006802 +#define VMCS_GUEST_CR4 0x00006804 +#define VMCS_GUEST_ES_BASE 0x00006806 +#define VMCS_GUEST_CS_BASE 0x00006808 +#define VMCS_GUEST_SS_BASE 0x0000680A +#define VMCS_GUEST_DS_BASE 0x0000680C +#define VMCS_GUEST_FS_BASE 0x0000680E +#define VMCS_GUEST_GS_BASE 0x00006810 +#define VMCS_GUEST_LDTR_BASE 0x00006812 +#define VMCS_GUEST_TR_BASE 0x00006814 +#define VMCS_GUEST_GDTR_BASE 0x00006816 +#define VMCS_GUEST_IDTR_BASE 0x00006818 +#define VMCS_GUEST_DR7 0x0000681A +#define VMCS_GUEST_RSP 0x0000681C +#define VMCS_GUEST_RIP 0x0000681E +#define VMCS_GUEST_RFLAGS 0x00006820 +#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822 +#define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824 +#define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826 + +/* Natural Width host-state fields */ +#define VMCS_HOST_CR0 0x00006C00 +#define VMCS_HOST_CR3 0x00006C02 +#define VMCS_HOST_CR4 0x00006C04 +#define VMCS_HOST_FS_BASE 0x00006C06 +#define VMCS_HOST_GS_BASE 0x00006C08 +#define VMCS_HOST_TR_BASE 0x00006C0A +#define VMCS_HOST_GDTR_BASE 0x00006C0C +#define VMCS_HOST_IDTR_BASE 0x00006C0E +#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10 +#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12 +#define VMCS_HOST_RSP 0x00006C14 +#define VMCS_HOST_RIP 0x00006c16 + +/* + * VM instruction error numbers + */ +#define VMRESUME_WITH_NON_LAUNCHED_VMCS 5 + +/* + * VMCS exit reasons + */ +#define EXIT_REASON_EXCEPTION 0 +#define EXIT_REASON_EXT_INTR 1 +#define EXIT_REASON_TRIPLE_FAULT 2 +#define EXIT_REASON_INIT 3 +#define EXIT_REASON_SIPI 4 +#define EXIT_REASON_IO_SMI 5 +#define EXIT_REASON_SMI 6 +#define EXIT_REASON_INTR_WINDOW 7 +#define EXIT_REASON_NMI_WINDOW 8 +#define EXIT_REASON_TASK_SWITCH 9 +#define EXIT_REASON_CPUID 10 +#define EXIT_REASON_GETSEC 11 +#define EXIT_REASON_HLT 12 +#define EXIT_REASON_INVD 13 +#define EXIT_REASON_INVLPG 14 +#define EXIT_REASON_RDPMC 15 +#define EXIT_REASON_RDTSC 16 +#define 
EXIT_REASON_RSM 17 +#define EXIT_REASON_VMCALL 18 +#define EXIT_REASON_VMCLEAR 19 +#define EXIT_REASON_VMLAUNCH 20 +#define EXIT_REASON_VMPTRLD 21 +#define EXIT_REASON_VMPTRST 22 +#define EXIT_REASON_VMREAD 23 +#define EXIT_REASON_VMRESUME 24 +#define EXIT_REASON_VMWRITE 25 +#define EXIT_REASON_VMXOFF 26 +#define EXIT_REASON_VMXON 27 +#define EXIT_REASON_CR_ACCESS 28 +#define EXIT_REASON_DR_ACCESS 29 +#define EXIT_REASON_INOUT 30 +#define EXIT_REASON_RDMSR 31 +#define EXIT_REASON_WRMSR 32 +#define EXIT_REASON_INVAL_VMCS 33 +#define EXIT_REASON_INVAL_MSR 34 +#define EXIT_REASON_MWAIT 36 +#define EXIT_REASON_MTF 37 +#define EXIT_REASON_MONITOR 39 +#define EXIT_REASON_PAUSE 40 +#define EXIT_REASON_MCE_DURING_ENTRY 41 +#define EXIT_REASON_TPR 43 +#define EXIT_REASON_APIC_ACCESS 44 +#define EXIT_REASON_VIRTUALIZED_EOI 45 +#define EXIT_REASON_GDTR_IDTR 46 +#define EXIT_REASON_LDTR_TR 47 +#define EXIT_REASON_EPT_FAULT 48 +#define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_INVEPT 50 +#define EXIT_REASON_RDTSCP 51 +#define EXIT_REASON_VMX_PREEMPT 52 +#define EXIT_REASON_INVVPID 53 +#define EXIT_REASON_WBINVD 54 +#define EXIT_REASON_XSETBV 55 +#define EXIT_REASON_APIC_WRITE 56 +#define EXIT_REASON_RDRAND 57 +#define EXIT_REASON_INVPCID 58 +#define EXIT_REASON_VMFUNC 59 +#define EXIT_REASON_ENCLS 60 +#define EXIT_REASON_RDSEED 61 +#define EXIT_REASON_PM_LOG_FULL 62 +#define EXIT_REASON_XSAVES 63 +#define EXIT_REASON_XRSTORS 64 + +/* + * NMI unblocking due to IRET. + * + * Applies to VM-exits due to hardware exception or EPT fault. + */ +#define EXIT_QUAL_NMIUDTI (1 << 12) +/* + * VMCS interrupt information fields + */ +#define VMCS_INTR_VALID (1U << 31) +#define VMCS_INTR_T_MASK 0x700 /* Interruption-info type */ +#define VMCS_INTR_T_HWINTR (0 << 8) +#define VMCS_INTR_T_NMI (2 << 8) +#define VMCS_INTR_T_HWEXCEPTION (3 << 8) +#define VMCS_INTR_T_SWINTR (4 << 8) +#define VMCS_INTR_T_PRIV_SWEXCEPTION (5 << 8) +#define VMCS_INTR_T_SWEXCEPTION (6 << 8) +#define VMCS_INTR_DEL_ERRCODE (1 << 11) + +/* + * VMCS IDT-Vectoring information fields + */ +#define VMCS_IDT_VEC_VALID (1U << 31) +#define VMCS_IDT_VEC_ERRCODE_VALID (1 << 11) + +/* + * VMCS Guest interruptibility field + */ +#define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0) +#define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1 << 1) +#define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1 << 2) +#define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1 << 3) + +/* + * Exit qualification for EXIT_REASON_INVAL_VMCS + */ +#define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3 + +/* + * Exit qualification for EPT violation + */ +#define EPT_VIOLATION_DATA_READ (1UL << 0) +#define EPT_VIOLATION_DATA_WRITE (1UL << 1) +#define EPT_VIOLATION_INST_FETCH (1UL << 2) +#define EPT_VIOLATION_GPA_READABLE (1UL << 3) +#define EPT_VIOLATION_GPA_WRITEABLE (1UL << 4) +#define EPT_VIOLATION_GPA_EXECUTABLE (1UL << 5) +#define EPT_VIOLATION_GLA_VALID (1UL << 7) +#define EPT_VIOLATION_XLAT_VALID (1UL << 8) + +/* + * Exit qualification for APIC-access VM exit + */ +#define APIC_ACCESS_OFFSET(qual) ((qual) & 0xFFF) +#define APIC_ACCESS_TYPE(qual) (((qual) >> 12) & 0xF) + +/* + * Exit qualification for APIC-write VM exit + */ +#define APIC_WRITE_OFFSET(qual) ((qual) & 0xFFF) + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.c b/usr/src/uts/i86pc/io/vmm/intel/vmx.c new file mode 100644 index 0000000000..131615d576 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.c @@ -0,0 +1,4375 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. 
+ * All rights reserved. + * Copyright (c) 2018 Joyent, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/smp.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/pcpu.h> +#include <sys/proc.h> +#include <sys/sysctl.h> + +#ifndef __FreeBSD__ +#include <sys/x86_archext.h> +#include <sys/smp_impldefs.h> +#include <sys/smt.h> +#include <sys/hma.h> +#include <sys/trap.h> +#endif + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/psl.h> +#include <machine/cpufunc.h> +#include <machine/md_var.h> +#include <machine/reg.h> +#include <machine/segments.h> +#include <machine/smp.h> +#include <machine/specialreg.h> +#include <machine/vmparam.h> + +#include <machine/vmm.h> +#include <machine/vmm_dev.h> +#include <machine/vmm_instruction_emul.h> +#include "vmm_lapic.h" +#include "vmm_host.h" +#include "vmm_ioport.h" +#include "vmm_ktr.h" +#include "vmm_stat.h" +#include "vatpic.h" +#include "vlapic.h" +#include "vlapic_priv.h" + +#include "ept.h" +#include "vmx_cpufunc.h" +#include "vmcs.h" +#include "vmx.h" +#include "vmx_msr.h" +#include "x86.h" +#include "vmx_controls.h" + +#define PINBASED_CTLS_ONE_SETTING \ + (PINBASED_EXTINT_EXITING | \ + PINBASED_NMI_EXITING | \ + PINBASED_VIRTUAL_NMI) +#define PINBASED_CTLS_ZERO_SETTING 0 + +#define PROCBASED_CTLS_WINDOW_SETTING \ + (PROCBASED_INT_WINDOW_EXITING | \ + PROCBASED_NMI_WINDOW_EXITING) + +#ifdef __FreeBSD__ +#define PROCBASED_CTLS_ONE_SETTING \ + (PROCBASED_SECONDARY_CONTROLS | \ + PROCBASED_MWAIT_EXITING | \ + PROCBASED_MONITOR_EXITING | \ + PROCBASED_IO_EXITING | \ + PROCBASED_MSR_BITMAPS | \ + PROCBASED_CTLS_WINDOW_SETTING | \ + PROCBASED_CR8_LOAD_EXITING | \ + PROCBASED_CR8_STORE_EXITING) +#else +/* We consider TSC offset a necessity for unsynched TSC handling */ +#define PROCBASED_CTLS_ONE_SETTING \ + (PROCBASED_SECONDARY_CONTROLS | \ + PROCBASED_TSC_OFFSET | \ + PROCBASED_MWAIT_EXITING | \ + PROCBASED_MONITOR_EXITING | \ + PROCBASED_IO_EXITING | \ + PROCBASED_MSR_BITMAPS | \ + PROCBASED_CTLS_WINDOW_SETTING | \ + PROCBASED_CR8_LOAD_EXITING | \ + PROCBASED_CR8_STORE_EXITING) +#endif /* __FreeBSD__ */ + +#define PROCBASED_CTLS_ZERO_SETTING \ + (PROCBASED_CR3_LOAD_EXITING | \ + PROCBASED_CR3_STORE_EXITING | \ + PROCBASED_IO_BITMAPS) + +#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT +#define PROCBASED_CTLS2_ZERO_SETTING 0 + +#define VM_EXIT_CTLS_ONE_SETTING \ + (VM_EXIT_SAVE_DEBUG_CONTROLS | \ + VM_EXIT_HOST_LMA | \ + VM_EXIT_LOAD_PAT | \ + VM_EXIT_SAVE_EFER | \ + VM_EXIT_LOAD_EFER | \ + VM_EXIT_ACKNOWLEDGE_INTERRUPT) + +#define VM_EXIT_CTLS_ZERO_SETTING 0 + +#define VM_ENTRY_CTLS_ONE_SETTING \ + (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ + VM_ENTRY_LOAD_EFER) + +#define VM_ENTRY_CTLS_ZERO_SETTING \ + (VM_ENTRY_INTO_SMM | \ + VM_ENTRY_DEACTIVATE_DUAL_MONITOR) + +#define HANDLED 1 +#define UNHANDLED 0 + +static MALLOC_DEFINE(M_VMX, "vmx", "vmx"); +static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL); + +#ifdef __FreeBSD__ +int vmxon_enabled[MAXCPU]; +static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); +#endif /*__FreeBSD__ */ + +static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; +static uint32_t exit_ctls, entry_ctls; + +static uint64_t cr0_ones_mask, cr0_zeros_mask; +SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD, + &cr0_ones_mask, 0, NULL); +SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD, + &cr0_zeros_mask, 0, NULL); + +static uint64_t cr4_ones_mask, 
cr4_zeros_mask; +SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD, + &cr4_ones_mask, 0, NULL); +SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD, + &cr4_zeros_mask, 0, NULL); + +static int vmx_initialized; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, + &vmx_initialized, 0, "Intel VMX initialized"); + +/* + * Optional capabilities + */ +SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL); + +static int cap_halt_exit; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0, + "HLT triggers a VM-exit"); + +static int cap_pause_exit; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit, + 0, "PAUSE triggers a VM-exit"); + +static int cap_unrestricted_guest; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD, + &cap_unrestricted_guest, 0, "Unrestricted guests"); + +static int cap_monitor_trap; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD, + &cap_monitor_trap, 0, "Monitor trap flag"); + +static int cap_invpcid; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid, + 0, "Guests are allowed to use INVPCID"); + +static int virtual_interrupt_delivery; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, + &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support"); + +static int posted_interrupts; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD, + &posted_interrupts, 0, "APICv posted interrupt support"); + +static int pirvec = -1; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD, + &pirvec, 0, "APICv posted interrupt vector"); + +static struct unrhdr *vpid_unr; +static u_int vpid_alloc_failed; +SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, + &vpid_alloc_failed, 0, NULL); + +static int guest_l1d_flush; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RD, + &guest_l1d_flush, 0, NULL); +static int guest_l1d_flush_sw; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush_sw, CTLFLAG_RD, + &guest_l1d_flush_sw, 0, NULL); + +static struct msr_entry msr_load_list[1] __aligned(16); + +/* + * The definitions of SDT probes for VMX. 
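+ *
+ * Each probe fires from the corresponding branch of the exit handling
+ * code further down.  As an illustrative one-liner (assuming the
+ * FreeBSD-style provider:module:function:name mapping, i.e.
+ * vmm:vmx:exit:<name>, survives the SDT port), the rdmsr probe could
+ * be watched with:
+ *
+ *	dtrace -n 'vmm:vmx:exit:rdmsr { trace(arg3); }'
+ *
+ * where arg3 should carry the MSR number being read.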
+ */ + +SDT_PROBE_DEFINE3(vmm, vmx, exit, entry, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, taskswitch, + "struct vmx *", "int", "struct vm_exit *", "struct vm_task_switch *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, craccess, + "struct vmx *", "int", "struct vm_exit *", "uint64_t"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, rdmsr, + "struct vmx *", "int", "struct vm_exit *", "uint32_t"); + +SDT_PROBE_DEFINE5(vmm, vmx, exit, wrmsr, + "struct vmx *", "int", "struct vm_exit *", "uint32_t", "uint64_t"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, halt, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, mtrap, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, pause, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, intrwindow, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, interrupt, + "struct vmx *", "int", "struct vm_exit *", "uint32_t"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, nmiwindow, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, inout, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, cpuid, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE5(vmm, vmx, exit, exception, + "struct vmx *", "int", "struct vm_exit *", "uint32_t", "int"); + +SDT_PROBE_DEFINE5(vmm, vmx, exit, nestedfault, + "struct vmx *", "int", "struct vm_exit *", "uint64_t", "uint64_t"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, mmiofault, + "struct vmx *", "int", "struct vm_exit *", "uint64_t"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, eoi, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, apicaccess, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, apicwrite, + "struct vmx *", "int", "struct vm_exit *", "struct vlapic *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, xsetbv, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, vminsn, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown, + "struct vmx *", "int", "struct vm_exit *", "uint32_t"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, return, + "struct vmx *", "int", "struct vm_exit *", "int"); + +/* + * Use the last page below 4GB as the APIC access address. This address is + * occupied by the boot firmware so it is guaranteed that it will not conflict + * with a page in system memory. 
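+ *
+ * (0xFFFFF000 is simply the base of the last 4KB page below 4GB:
+ * 0x100000000 - 0x1000.)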
+ */ +#define APIC_ACCESS_ADDRESS 0xFFFFF000 + +static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc); +static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval); +static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val); +static void vmx_inject_pir(struct vlapic *vlapic); +#ifndef __FreeBSD__ +static int vmx_apply_tsc_adjust(struct vmx *, int); +#endif /* __FreeBSD__ */ + +#ifdef KTR +static const char * +exit_reason_to_str(int reason) +{ + static char reasonbuf[32]; + + switch (reason) { + case EXIT_REASON_EXCEPTION: + return "exception"; + case EXIT_REASON_EXT_INTR: + return "extint"; + case EXIT_REASON_TRIPLE_FAULT: + return "triplefault"; + case EXIT_REASON_INIT: + return "init"; + case EXIT_REASON_SIPI: + return "sipi"; + case EXIT_REASON_IO_SMI: + return "iosmi"; + case EXIT_REASON_SMI: + return "smi"; + case EXIT_REASON_INTR_WINDOW: + return "intrwindow"; + case EXIT_REASON_NMI_WINDOW: + return "nmiwindow"; + case EXIT_REASON_TASK_SWITCH: + return "taskswitch"; + case EXIT_REASON_CPUID: + return "cpuid"; + case EXIT_REASON_GETSEC: + return "getsec"; + case EXIT_REASON_HLT: + return "hlt"; + case EXIT_REASON_INVD: + return "invd"; + case EXIT_REASON_INVLPG: + return "invlpg"; + case EXIT_REASON_RDPMC: + return "rdpmc"; + case EXIT_REASON_RDTSC: + return "rdtsc"; + case EXIT_REASON_RSM: + return "rsm"; + case EXIT_REASON_VMCALL: + return "vmcall"; + case EXIT_REASON_VMCLEAR: + return "vmclear"; + case EXIT_REASON_VMLAUNCH: + return "vmlaunch"; + case EXIT_REASON_VMPTRLD: + return "vmptrld"; + case EXIT_REASON_VMPTRST: + return "vmptrst"; + case EXIT_REASON_VMREAD: + return "vmread"; + case EXIT_REASON_VMRESUME: + return "vmresume"; + case EXIT_REASON_VMWRITE: + return "vmwrite"; + case EXIT_REASON_VMXOFF: + return "vmxoff"; + case EXIT_REASON_VMXON: + return "vmxon"; + case EXIT_REASON_CR_ACCESS: + return "craccess"; + case EXIT_REASON_DR_ACCESS: + return "draccess"; + case EXIT_REASON_INOUT: + return "inout"; + case EXIT_REASON_RDMSR: + return "rdmsr"; + case EXIT_REASON_WRMSR: + return "wrmsr"; + case EXIT_REASON_INVAL_VMCS: + return "invalvmcs"; + case EXIT_REASON_INVAL_MSR: + return "invalmsr"; + case EXIT_REASON_MWAIT: + return "mwait"; + case EXIT_REASON_MTF: + return "mtf"; + case EXIT_REASON_MONITOR: + return "monitor"; + case EXIT_REASON_PAUSE: + return "pause"; + case EXIT_REASON_MCE_DURING_ENTRY: + return "mce-during-entry"; + case EXIT_REASON_TPR: + return "tpr"; + case EXIT_REASON_APIC_ACCESS: + return "apic-access"; + case EXIT_REASON_GDTR_IDTR: + return "gdtridtr"; + case EXIT_REASON_LDTR_TR: + return "ldtrtr"; + case EXIT_REASON_EPT_FAULT: + return "eptfault"; + case EXIT_REASON_EPT_MISCONFIG: + return "eptmisconfig"; + case EXIT_REASON_INVEPT: + return "invept"; + case EXIT_REASON_RDTSCP: + return "rdtscp"; + case EXIT_REASON_VMX_PREEMPT: + return "vmxpreempt"; + case EXIT_REASON_INVVPID: + return "invvpid"; + case EXIT_REASON_WBINVD: + return "wbinvd"; + case EXIT_REASON_XSETBV: + return "xsetbv"; + case EXIT_REASON_APIC_WRITE: + return "apic-write"; + default: + snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); + return (reasonbuf); + } +} +#endif /* KTR */ + +static int +vmx_allow_x2apic_msrs(struct vmx *vmx) +{ + int i, error; + + error = 0; + + /* + * Allow readonly access to the following x2APIC MSRs from the guest. 
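+ *
+ * For reference, the x2APIC MSR number for a given APIC register is
+ * 0x800 plus the xAPIC MMIO offset shifted right by four, e.g. with a
+ * hypothetical helper macro (not used by this file):
+ *
+ *	#define	X2APIC_MSR(off)	(0x800 + ((off) >> 4))
+ *
+ *	X2APIC_MSR(0x20) == 0x802	APIC ID
+ *	X2APIC_MSR(0x80) == 0x808	TPR
+ *	X2APIC_MSR(0xb0) == 0x80b	EOI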
+ */ + error += guest_msr_ro(vmx, MSR_APIC_ID); + error += guest_msr_ro(vmx, MSR_APIC_VERSION); + error += guest_msr_ro(vmx, MSR_APIC_LDR); + error += guest_msr_ro(vmx, MSR_APIC_SVR); + + for (i = 0; i < 8; i++) + error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i); + + for (i = 0; i < 8; i++) + error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i); + + for (i = 0; i < 8; i++) + error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i); + + error += guest_msr_ro(vmx, MSR_APIC_ESR); + error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER); + error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL); + error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT); + error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0); + error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1); + error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR); + error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER); + error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER); + error += guest_msr_ro(vmx, MSR_APIC_ICR); + + /* + * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest. + * + * These registers get special treatment described in the section + * "Virtualizing MSR-Based APIC Accesses". + */ + error += guest_msr_rw(vmx, MSR_APIC_TPR); + error += guest_msr_rw(vmx, MSR_APIC_EOI); + error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI); + + return (error); +} + +u_long +vmx_fix_cr0(u_long cr0) +{ + + return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); +} + +u_long +vmx_fix_cr4(u_long cr4) +{ + + return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); +} + +static void +vpid_free(int vpid) +{ + if (vpid < 0 || vpid > 0xffff) + panic("vpid_free: invalid vpid %d", vpid); + + /* + * VPIDs [0,VM_MAXCPU] are special and are not allocated from + * the unit number allocator. + */ + + if (vpid > VM_MAXCPU) +#ifdef __FreeBSD__ + free_unr(vpid_unr, vpid); +#else + hma_vmx_vpid_free((uint16_t)vpid); +#endif +} + +static void +vpid_alloc(uint16_t *vpid, int num) +{ + int i, x; + + if (num <= 0 || num > VM_MAXCPU) + panic("invalid number of vpids requested: %d", num); + + /* + * If the "enable vpid" execution control is not enabled then the + * VPID is required to be 0 for all vcpus. + */ + if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) { + for (i = 0; i < num; i++) + vpid[i] = 0; + return; + } + + /* + * Allocate a unique VPID for each vcpu from the unit number allocator. + */ + for (i = 0; i < num; i++) { +#ifdef __FreeBSD__ + x = alloc_unr(vpid_unr); +#else + uint16_t tmp; + + tmp = hma_vmx_vpid_alloc(); + x = (tmp == 0) ? -1 : tmp; +#endif + if (x == -1) + break; + else + vpid[i] = x; + } + + if (i < num) { + atomic_add_int(&vpid_alloc_failed, 1); + + /* + * If the unit number allocator does not have enough unique + * VPIDs then we need to allocate from the [1,VM_MAXCPU] range. + * + * These VPIDs are not be unique across VMs but this does not + * affect correctness because the combined mappings are also + * tagged with the EP4TA which is unique for each VM. + * + * It is still sub-optimal because the invvpid will invalidate + * combined mappings for a particular VPID across all EP4TAs. + */ + while (i-- > 0) + vpid_free(vpid[i]); + + for (i = 0; i < num; i++) + vpid[i] = i + 1; + } +} + +#ifdef __FreeBSD__ +static void +vpid_init(void) +{ + /* + * VPID 0 is required when the "enable VPID" execution control is + * disabled. + * + * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the + * unit number allocator does not have sufficient unique VPIDs to + * satisfy the allocation. + * + * The remaining VPIDs are managed by the unit number allocator. 
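+ *
+ * Summarized, the VPID namespace is carved up as follows:
+ *
+ *	0			"enable VPID" control disabled
+ *	[1, VM_MAXCPU]		overflow range, not unique across VMs
+ *	[VM_MAXCPU + 1, 0xffff]	handed out by the unit number allocator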
+ */ + vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL); +} + +static void +vmx_disable(void *arg __unused) +{ + struct invvpid_desc invvpid_desc = { 0 }; + struct invept_desc invept_desc = { 0 }; + + if (vmxon_enabled[curcpu]) { + /* + * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. + * + * VMXON or VMXOFF are not required to invalidate any TLB + * caching structures. This prevents potential retention of + * cached information in the TLB between distinct VMX episodes. + */ + invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); + invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); + vmxoff(); + } + load_cr4(rcr4() & ~CR4_VMXE); +} + +static int +vmx_cleanup(void) +{ + + if (pirvec >= 0) + lapic_ipi_free(pirvec); + + if (vpid_unr != NULL) { + delete_unrhdr(vpid_unr); + vpid_unr = NULL; + } + + if (nmi_flush_l1d_sw == 1) + nmi_flush_l1d_sw = 0; + + smp_rendezvous(NULL, vmx_disable, NULL, NULL); + + return (0); +} + +static void +vmx_enable(void *arg __unused) +{ + int error; + uint64_t feature_control; + + feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); + if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 || + (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { + wrmsr(MSR_IA32_FEATURE_CONTROL, + feature_control | IA32_FEATURE_CONTROL_VMX_EN | + IA32_FEATURE_CONTROL_LOCK); + } + + load_cr4(rcr4() | CR4_VMXE); + + *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); + error = vmxon(vmxon_region[curcpu]); + if (error == 0) + vmxon_enabled[curcpu] = 1; +} + +static void +vmx_restore(void) +{ + + if (vmxon_enabled[curcpu]) + vmxon(vmxon_region[curcpu]); +} +#else /* __FreeBSD__ */ +static int +vmx_cleanup(void) +{ + /* This is taken care of by the hma registration */ + return (0); +} + +static void +vmx_restore(void) +{ + /* No-op on illumos */ +} +#endif /* __FreeBSD__ */ + +static int +vmx_init(int ipinum) +{ + int error, use_tpr_shadow; +#ifdef __FreeBSD__ + uint64_t basic, fixed0, fixed1, feature_control; +#else + uint64_t fixed0, fixed1; +#endif + uint32_t tmp, procbased2_vid_bits; + +#ifdef __FreeBSD__ + /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ + if (!(cpu_feature2 & CPUID2_VMX)) { + printf("vmx_init: processor does not support VMX operation\n"); + return (ENXIO); + } + + /* + * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits + * are set (bits 0 and 2 respectively). 
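+ *
+ * The relevant layout of IA32_FEATURE_CONTROL is:
+ *
+ *	bit 0	lock (write-once until the next reset)
+ *	bit 1	enable VMXON inside SMX operation
+ *	bit 2	enable VMXON outside SMX operation
+ *
+ * A locked MSR with bit 2 clear means firmware disabled VMX and it
+ * cannot be enabled from here.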
+ */ + feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); + if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 && + (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { + printf("vmx_init: VMX operation disabled by BIOS\n"); + return (ENXIO); + } + + /* + * Verify capabilities MSR_VMX_BASIC: + * - bit 54 indicates support for INS/OUTS decoding + */ + basic = rdmsr(MSR_VMX_BASIC); + if ((basic & (1UL << 54)) == 0) { + printf("vmx_init: processor does not support desired basic " + "capabilities\n"); + return (EINVAL); + } +#endif /* __FreeBSD__ */ + + /* Check support for primary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_CTLS_ONE_SETTING, + PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); + if (error) { + printf("vmx_init: processor does not support desired primary " + "processor-based controls\n"); + return (error); + } + + /* Clear the processor-based ctl bits that are set on demand */ + procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; + + /* Check support for secondary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, + PROCBASED_CTLS2_ONE_SETTING, + PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); + if (error) { + printf("vmx_init: processor does not support desired secondary " + "processor-based controls\n"); + return (error); + } + + /* Check support for VPID */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_ENABLE_VPID, 0, &tmp); + if (error == 0) + procbased_ctls2 |= PROCBASED2_ENABLE_VPID; + + /* Check support for pin-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, + PINBASED_CTLS_ONE_SETTING, + PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "pin-based controls\n"); + return (error); + } + + /* Check support for VM-exit controls */ + error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, + VM_EXIT_CTLS_ONE_SETTING, + VM_EXIT_CTLS_ZERO_SETTING, + &exit_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "exit controls\n"); + return (error); + } + + /* Check support for VM-entry controls */ + error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, + VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, + &entry_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "entry controls\n"); + return (error); + } + + /* + * Check support for optional features by testing them + * as individual bits + */ + cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_HLT_EXITING, 0, + &tmp) == 0); + + cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_PROCBASED_CTLS, + PROCBASED_MTF, 0, + &tmp) == 0); + + cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_PAUSE_EXITING, 0, + &tmp) == 0); + + cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_UNRESTRICTED_GUEST, 0, + &tmp) == 0); + + cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, + &tmp) == 0); + + /* + * Check support for virtual interrupt delivery. 
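+ *
+ * Virtual interrupt delivery is only used when all four of the
+ * secondary controls tested below can be enabled together and the
+ * "use TPR shadow" primary control is available; even then the
+ * hw.vmm.vmx.use_apic_vid tunable may veto it.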
+ */ + procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES | + PROCBASED2_VIRTUALIZE_X2APIC_MODE | + PROCBASED2_APIC_REGISTER_VIRTUALIZATION | + PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY); + + use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0, + &tmp) == 0); + + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, + procbased2_vid_bits, 0, &tmp); + if (error == 0 && use_tpr_shadow) { + virtual_interrupt_delivery = 1; + TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid", + &virtual_interrupt_delivery); + } + + if (virtual_interrupt_delivery) { + procbased_ctls |= PROCBASED_USE_TPR_SHADOW; + procbased_ctls2 |= procbased2_vid_bits; + procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE; + + /* + * No need to emulate accesses to %CR8 if virtual + * interrupt delivery is enabled. + */ + procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING; + procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING; + + /* + * Check for Posted Interrupts only if Virtual Interrupt + * Delivery is enabled. + */ + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0, + &tmp); + if (error == 0) { +#ifdef __FreeBSD__ + pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : + &IDTVEC(justreturn)); + if (pirvec < 0) { + if (bootverbose) { + printf("vmx_init: unable to allocate " + "posted interrupt vector\n"); + } + } else { + posted_interrupts = 1; + TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir", + &posted_interrupts); + } +#else + /* + * If the PSM-provided interfaces for requesting and + * using a PIR IPI vector are present, use them for + * posted interrupts. + */ + if (psm_get_pir_ipivect != NULL && + psm_send_pir_ipi != NULL) { + pirvec = psm_get_pir_ipivect(); + posted_interrupts = 1; + } +#endif + } + } + + if (posted_interrupts) + pinbased_ctls |= PINBASED_POSTED_INTERRUPT; + + /* Initialize EPT */ + error = ept_init(ipinum); + if (error) { + printf("vmx_init: ept initialization failed (%d)\n", error); + return (error); + } + +#ifdef __FreeBSD__ + guest_l1d_flush = (cpu_ia32_arch_caps & + IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) == 0; + TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush); + + /* + * L1D cache flush is enabled. Use IA32_FLUSH_CMD MSR when + * available. Otherwise fall back to the software flush + * method which loads enough data from the kernel text to + * flush existing L1D content, both on VMX entry and on NMI + * return. + */ + if (guest_l1d_flush) { + if ((cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) == 0) { + guest_l1d_flush_sw = 1; + TUNABLE_INT_FETCH("hw.vmm.l1d_flush_sw", + &guest_l1d_flush_sw); + } + if (guest_l1d_flush_sw) { + if (nmi_flush_l1d_sw <= 1) + nmi_flush_l1d_sw = 1; + } else { + msr_load_list[0].index = MSR_IA32_FLUSH_CMD; + msr_load_list[0].val = IA32_FLUSH_CMD_L1D; + } + } +#else + /* L1D flushing is taken care of by smt_acquire() and friends */ + guest_l1d_flush = 0; +#endif /* __FreeBSD__ */ + + /* + * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 + */ + fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); + fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); + cr0_ones_mask = fixed0 & fixed1; + cr0_zeros_mask = ~fixed0 & ~fixed1; + + /* + * CR0_PE and CR0_PG can be set to zero in VMX non-root operation + * if unrestricted guest execution is allowed. + */ + if (cap_unrestricted_guest) + cr0_ones_mask &= ~(CR0_PG | CR0_PE); + + /* + * Do not allow the guest to set CR0_NW or CR0_CD. 
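+ *
+ * As a worked example of the fixed-bit math above: a CR0 bit must be
+ * 1 if it is set in both MSR_VMX_CR0_FIXED0 and FIXED1, and must be 0
+ * if it is clear in both.  On typical hardware FIXED0 reads
+ * 0x80000021 (PG | NE | PE) while FIXED1 leaves the remaining bits
+ * unconstrained, so cr0_ones_mask ends up as 0x80000021 before the
+ * unrestricted-guest adjustment.  (Illustrative values only; the MSRs
+ * are authoritative.)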
+ */ + cr0_zeros_mask |= (CR0_NW | CR0_CD); + + fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); + fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); + cr4_ones_mask = fixed0 & fixed1; + cr4_zeros_mask = ~fixed0 & ~fixed1; + +#ifdef __FreeBSD__ + vpid_init(); +#endif + + vmx_msr_init(); + +#ifdef __FreeBSD__ + /* enable VMX operation */ + smp_rendezvous(NULL, vmx_enable, NULL, NULL); +#endif + + vmx_initialized = 1; + + return (0); +} + +static void +vmx_trigger_hostintr(int vector) +{ +#ifdef __FreeBSD__ + uintptr_t func; + struct gate_descriptor *gd; + + gd = &idt[vector]; + + KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: " + "invalid vector %d", vector)); + KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present", + vector)); + KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d " + "has invalid type %d", vector, gd->gd_type)); + KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d " + "has invalid dpl %d", vector, gd->gd_dpl)); + KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor " + "for vector %d has invalid selector %d", vector, gd->gd_selector)); + KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid " + "IST %d", vector, gd->gd_ist)); + + func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset); + vmx_call_isr(func); +#else + VERIFY(vector >= 32 && vector <= 255); + vmx_call_isr(vector - 32); +#endif /* __FreeBSD__ */ +} + +static int +vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) +{ + int error, mask_ident, shadow_ident; + uint64_t mask_value; + + if (which != 0 && which != 4) + panic("vmx_setup_cr_shadow: unknown cr%d", which); + + if (which == 0) { + mask_ident = VMCS_CR0_MASK; + mask_value = cr0_ones_mask | cr0_zeros_mask; + shadow_ident = VMCS_CR0_SHADOW; + } else { + mask_ident = VMCS_CR4_MASK; + mask_value = cr4_ones_mask | cr4_zeros_mask; + shadow_ident = VMCS_CR4_SHADOW; + } + + error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value); + if (error) + return (error); + + error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial); + if (error) + return (error); + + return (0); +} +#define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init)) +#define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init)) + +static void * +vmx_vminit(struct vm *vm, pmap_t pmap) +{ + uint16_t vpid[VM_MAXCPU]; + int i, error; + struct vmx *vmx; + struct vmcs *vmcs; + uint32_t exc_bitmap; + uint16_t maxcpus; + + vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); + if ((uintptr_t)vmx & PAGE_MASK) { + panic("malloc of struct vmx not aligned on %d byte boundary", + PAGE_SIZE); + } + vmx->vm = vm; + + vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); + + /* + * Clean up EPTP-tagged guest physical and combined mappings + * + * VMX transitions are not required to invalidate any guest physical + * mappings. So, it may be possible for stale guest physical mappings + * to be present in the processor TLBs. + * + * Combined mappings for this EP4TA are also invalidated for all VPIDs. + */ + ept_invalidate_mappings(vmx->eptp); + + msr_bitmap_initialize(vmx->msr_bitmap); + + /* + * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. + * The guest FSBASE and GSBASE are saved and restored during + * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are + * always restored from the vmcs host state area on vm-exit. 
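+ *
+ * (Pass-through here means clearing the corresponding read/write bits
+ * in the 4KB MSR bitmap installed via VMCS_MSR_BITMAP below; a clear
+ * bit suppresses the RDMSR/WRMSR VM exit for that MSR.)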
+ * + * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in + * how they are saved/restored so can be directly accessed by the + * guest. + * + * MSR_EFER is saved and restored in the guest VMCS area on a + * VM exit and entry respectively. It is also restored from the + * host VMCS area on a VM exit. + * + * The TSC MSR is exposed read-only. Writes are disallowed as + * that will impact the host TSC. If the guest does a write + * the "use TSC offsetting" execution control is enabled and the + * difference between the host TSC and the guest TSC is written + * into the TSC offset in the VMCS. + */ + if (guest_msr_rw(vmx, MSR_GSBASE) || + guest_msr_rw(vmx, MSR_FSBASE) || + guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || + guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || + guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || + guest_msr_rw(vmx, MSR_EFER) || + guest_msr_ro(vmx, MSR_TSC)) + panic("vmx_vminit: error setting guest msr access"); + + vpid_alloc(vpid, VM_MAXCPU); + + if (virtual_interrupt_delivery) { + error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE, + APIC_ACCESS_ADDRESS); + /* XXX this should really return an error to the caller */ + KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error)); + } + + maxcpus = vm_get_maxcpus(vm); + for (i = 0; i < maxcpus; i++) { +#ifndef __FreeBSD__ + /* + * Cache physical address lookups for various components which + * may be required inside the critical_enter() section implied + * by VMPTRLD() below. + */ + vm_paddr_t msr_bitmap_pa = vtophys(vmx->msr_bitmap); + vm_paddr_t apic_page_pa = vtophys(&vmx->apic_page[i]); + vm_paddr_t pir_desc_pa = vtophys(&vmx->pir_desc[i]); +#endif /* __FreeBSD__ */ + + vmcs = &vmx->vmcs[i]; + vmcs->identifier = vmx_revision(); +#ifndef __FreeBSD__ + vmcs->vmcs_pa = (uint64_t)vtophys(vmcs); +#endif + error = vmclear(vmcs); + if (error != 0) { + panic("vmx_vminit: vmclear error %d on vcpu %d\n", + error, i); + } + + vmx_msr_guest_init(vmx, i); + + error = vmcs_init(vmcs); + KASSERT(error == 0, ("vmcs_init error %d", error)); + + VMPTRLD(vmcs); + error = 0; +#ifdef __FreeBSD__ + /* + * The illumos vmx_enter_guest implementation avoids some of + * the %rsp-manipulation games which are present in the stock + * one from FreeBSD. 
+ */ + error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]); +#endif + error += vmwrite(VMCS_EPTP, vmx->eptp); + error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls); + error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls); + error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2); + error += vmwrite(VMCS_EXIT_CTLS, exit_ctls); + error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls); +#ifdef __FreeBSD__ + error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap)); +#else + error += vmwrite(VMCS_MSR_BITMAP, msr_bitmap_pa); +#endif + error += vmwrite(VMCS_VPID, vpid[i]); + + if (guest_l1d_flush && !guest_l1d_flush_sw) { + vmcs_write(VMCS_ENTRY_MSR_LOAD, pmap_kextract( + (vm_offset_t)&msr_load_list[0])); + vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT, + nitems(msr_load_list)); + vmcs_write(VMCS_EXIT_MSR_STORE, 0); + vmcs_write(VMCS_EXIT_MSR_STORE_COUNT, 0); + } + + /* exception bitmap */ + if (vcpu_trace_exceptions(vm, i)) + exc_bitmap = 0xffffffff; + else + exc_bitmap = 1 << IDT_MC; + error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap); + + vmx->ctx[i].guest_dr6 = DBREG_DR6_RESERVED1; + error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1); + + if (virtual_interrupt_delivery) { + error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS); +#ifdef __FreeBSD__ + error += vmwrite(VMCS_VIRTUAL_APIC, + vtophys(&vmx->apic_page[i])); +#else + error += vmwrite(VMCS_VIRTUAL_APIC, apic_page_pa); +#endif + error += vmwrite(VMCS_EOI_EXIT0, 0); + error += vmwrite(VMCS_EOI_EXIT1, 0); + error += vmwrite(VMCS_EOI_EXIT2, 0); + error += vmwrite(VMCS_EOI_EXIT3, 0); + } + if (posted_interrupts) { + error += vmwrite(VMCS_PIR_VECTOR, pirvec); +#ifdef __FreeBSD__ + error += vmwrite(VMCS_PIR_DESC, + vtophys(&vmx->pir_desc[i])); +#else + error += vmwrite(VMCS_PIR_DESC, pir_desc_pa); +#endif + } + VMCLEAR(vmcs); + KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs")); + + vmx->cap[i].set = 0; + vmx->cap[i].proc_ctls = procbased_ctls; + vmx->cap[i].proc_ctls2 = procbased_ctls2; + + vmx->state[i].nextrip = ~0; + vmx->state[i].lastcpu = NOCPU; + vmx->state[i].vpid = vpid[i]; + + /* + * Set up the CR0/4 shadows, and init the read shadow + * to the power-on register value from the Intel Sys Arch. + * CR0 - 0x60000010 + * CR4 - 0 + */ + error = vmx_setup_cr0_shadow(vmcs, 0x60000010); + if (error != 0) + panic("vmx_setup_cr0_shadow %d", error); + + error = vmx_setup_cr4_shadow(vmcs, 0); + if (error != 0) + panic("vmx_setup_cr4_shadow %d", error); + + vmx->ctx[i].pmap = pmap; + } + + return (vmx); +} + +static int +vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) +{ +#ifdef __FreeBSD__ + int handled, func; + + func = vmxctx->guest_rax; +#else + int handled; +#endif + + handled = x86_emulate_cpuid(vm, vcpu, + (uint32_t*)(&vmxctx->guest_rax), + (uint32_t*)(&vmxctx->guest_rbx), + (uint32_t*)(&vmxctx->guest_rcx), + (uint32_t*)(&vmxctx->guest_rdx)); + return (handled); +} + +static __inline void +vmx_run_trace(struct vmx *vmx, int vcpu) +{ +#ifdef KTR + VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip()); +#endif +} + +static __inline void +vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, + int handled) +{ +#ifdef KTR + VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", + handled ? 
"handled" : "unhandled", + exit_reason_to_str(exit_reason), rip); +#endif + DTRACE_PROBE3(vmm__vexit, int, vcpu, uint64_t, rip, + uint32_t, exit_reason); +} + +static __inline void +vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) +{ +#ifdef KTR + VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); +#endif +} + +static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); +static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done"); + +/* + * Invalidate guest mappings identified by its vpid from the TLB. + */ +static __inline void +vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running) +{ + struct vmxstate *vmxstate; + struct invvpid_desc invvpid_desc; + + vmxstate = &vmx->state[vcpu]; + if (vmxstate->vpid == 0) + return; + + if (!running) { + /* + * Set the 'lastcpu' to an invalid host cpu. + * + * This will invalidate TLB entries tagged with the vcpu's + * vpid the next time it runs via vmx_set_pcpu_defaults(). + */ + vmxstate->lastcpu = NOCPU; + return; + } + +#ifdef __FreeBSD__ + KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside " + "critical section", __func__, vcpu)); +#endif + + /* + * Invalidate all mappings tagged with 'vpid' + * + * We do this because this vcpu was executing on a different host + * cpu when it last ran. We do not track whether it invalidated + * mappings associated with its 'vpid' during that run. So we must + * assume that the mappings associated with 'vpid' on 'curcpu' are + * stale and invalidate them. + * + * Note that we incur this penalty only when the scheduler chooses to + * move the thread associated with this vcpu between host cpus. + * + * Note also that this will invalidate mappings tagged with 'vpid' + * for "all" EP4TAs. + */ + if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { + invvpid_desc._res1 = 0; + invvpid_desc._res2 = 0; + invvpid_desc.vpid = vmxstate->vpid; + invvpid_desc.linear_addr = 0; + invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); + vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1); + } else { + /* + * The invvpid can be skipped if an invept is going to + * be performed before entering the guest. The invept + * will invalidate combined mappings tagged with + * 'vmx->eptp' for all vpids. + */ + vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1); + } +} + +static void +vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) +{ + struct vmxstate *vmxstate; + +#ifndef __FreeBSD__ + /* + * Regardless of whether the VM appears to have migrated between CPUs, + * save the host sysenter stack pointer. As it points to the kernel + * stack of each thread, the correct value must be maintained for every + * trip into the critical section. + */ + vmcs_write(VMCS_HOST_IA32_SYSENTER_ESP, rdmsr(MSR_SYSENTER_ESP_MSR)); + + /* + * Perform any needed TSC_OFFSET adjustment based on TSC_MSR writes or + * migration between host CPUs with differing TSC values. 
+ */ + VERIFY0(vmx_apply_tsc_adjust(vmx, vcpu)); +#endif + + vmxstate = &vmx->state[vcpu]; + if (vmxstate->lastcpu == curcpu) + return; + + vmxstate->lastcpu = curcpu; + + vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); + +#ifndef __FreeBSD__ + /* Load the per-CPU IDT address */ + vmcs_write(VMCS_HOST_IDTR_BASE, vmm_get_host_idtrbase()); +#endif + vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); + vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); + vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); + vmx_invvpid(vmx, vcpu, pmap, 1); +} + +/* + * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. + */ +CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); + +static __inline void +vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) +{ + + if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) { + vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); + } +} + +static __inline void +vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) +{ + + KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, + ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls)); + vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); +} + +static __inline void +vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) +{ + + if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) { + vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); + } +} + +static __inline void +vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) +{ + + KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, + ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls)); + vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); +} + +#ifdef __FreeBSD__ +int +vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset) +{ + int error; + + if ((vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET) == 0) { + vmx->cap[vcpu].proc_ctls |= PROCBASED_TSC_OFFSET; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting"); + } + + error = vmwrite(VMCS_TSC_OFFSET, offset); + + return (error); +} +#else /* __FreeBSD__ */ +/* + * Set the TSC adjustment, taking into account the offsets measured between + * host physical CPUs. This is required even if the guest has not set a TSC + * offset since vCPUs inherit the TSC offset of whatever physical CPU it has + * migrated onto. Without this mitigation, un-synched host TSCs will convey + * the appearance of TSC time-travel to the guest as its vCPUs migrate. 
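+ *
+ * With the "use TSC offsetting" control set, the guest observes
+ *
+ *	guest_tsc = rdtsc() + VMCS_TSC_OFFSET
+ *
+ * so the per-CPU delta reported by tsc_gethrtime_tick_delta() (the
+ * same adjustment host timekeeping applies to its own TSC reads) is
+ * folded into the offset programmed below.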
+ */ +static int +vmx_apply_tsc_adjust(struct vmx *vmx, int vcpu) +{ + extern hrtime_t tsc_gethrtime_tick_delta(void); + const uint64_t target_offset = (vcpu_tsc_offset(vmx->vm, vcpu) + + (uint64_t)tsc_gethrtime_tick_delta()); + int error = 0; + + ASSERT(vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET); + + if (vmx->tsc_offset_active[vcpu] != target_offset) { + error = vmwrite(VMCS_TSC_OFFSET, target_offset); + vmx->tsc_offset_active[vcpu] = target_offset; + } + + return (error); +} +#endif /* __FreeBSD__ */ + +#define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \ + VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) +#define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ + VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) + +#ifndef __FreeBSD__ +static uint32_t +vmx_inject_nmi(struct vmx *vmx, int vcpu) +#else +static void +vmx_inject_nmi(struct vmx *vmx, int vcpu) +#endif +{ + uint32_t gi, info; + + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " + "interruptibility-state %#x", gi)); + + info = vmcs_read(VMCS_ENTRY_INTR_INFO); + KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " + "VM-entry interruption information %#x", info)); + + /* + * Inject the virtual NMI. The vector must be the NMI IDT entry + * or the VMCS entry check will fail. + */ + info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID; + vmcs_write(VMCS_ENTRY_INTR_INFO, info); + + VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI"); + + /* Clear the request */ + vm_nmi_clear(vmx->vm, vcpu); + +#ifndef __FreeBSD__ + return (info); +#endif +} + +#ifndef __FreeBSD__ +static void +vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, + uint64_t guestrip) +{ + uint64_t entryinfo, rflags; + uint32_t gi, info; + int vector; + boolean_t extint_pending = B_FALSE; + + vlapic_tmr_update(vlapic); + + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + info = vmcs_read(VMCS_ENTRY_INTR_INFO); + + if (vmx->state[vcpu].nextrip != guestrip && + (gi & HWINTR_BLOCKING) != 0) { + VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking " + "cleared due to rip change: %#lx/%#lx", + vmx->state[vcpu].nextrip, guestrip); + gi &= ~HWINTR_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); + } + + /* + * It could be that an interrupt is already pending for injection from + * the VMCS. This would be the case if the vCPU exited for conditions + * such as an AST before a vm-entry delivered the injection. + */ + if ((info & VMCS_INTR_VALID) != 0) { + goto cantinject; + } + + if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { + KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " + "intinfo is not valid: %#lx", __func__, entryinfo)); + + KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " + "pending exception: %#lx/%#x", __func__, entryinfo, info)); + + info = entryinfo; + vector = info & 0xff; + if (vector == IDT_BP || vector == IDT_OF) { + /* + * VT-x requires #BP and #OF to be injected as software + * exceptions. + */ + info &= ~VMCS_INTR_T_MASK; + info |= VMCS_INTR_T_SWEXCEPTION; + } + + if (info & VMCS_INTR_DEL_ERRCODE) + vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); + + vmcs_write(VMCS_ENTRY_INTR_INFO, info); + } + + if (vm_nmi_pending(vmx->vm, vcpu)) { + int need_nmi_exiting = 1; + + /* + * If there are no conditions blocking NMI injection then + * inject it directly here otherwise enable "NMI window + * exiting" to inject it as soon as we can. 
+ * + * We also check for STI_BLOCKING because some implementations + * don't allow NMI injection in this case. If we are running + * on a processor that doesn't have this restriction it will + * immediately exit and the NMI will be injected in the + * "NMI window exiting" handler. + */ + if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { + if ((info & VMCS_INTR_VALID) == 0) { + info = vmx_inject_nmi(vmx, vcpu); + need_nmi_exiting = 0; + } else { + VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI " + "due to VM-entry intr info %#x", info); + } + } else { + VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to " + "Guest Interruptibility-state %#x", gi); + } + + if (need_nmi_exiting) { + vmx_set_nmi_window_exiting(vmx, vcpu); + return; + } + } + + /* Check the AT-PIC and APIC for interrupts. */ + if (vm_extint_pending(vmx->vm, vcpu)) { + /* Ask the legacy pic for a vector to inject */ + vatpic_pending_intr(vmx->vm, &vector); + extint_pending = B_TRUE; + + /* + * From the Intel SDM, Volume 3, Section "Maskable + * Hardware Interrupts": + * - maskable interrupt vectors [0,255] can be delivered + * through the INTR pin. + */ + KASSERT(vector >= 0 && vector <= 255, + ("invalid vector %d from INTR", vector)); + } else if (!virtual_interrupt_delivery) { + /* Ask the local apic for a vector to inject */ + if (!vlapic_pending_intr(vlapic, &vector)) + return; + + /* + * From the Intel SDM, Volume 3, Section "Maskable + * Hardware Interrupts": + * - maskable interrupt vectors [16,255] can be delivered + * through the local APIC. + */ + KASSERT(vector >= 16 && vector <= 255, + ("invalid vector %d from local APIC", vector)); + } else { + /* No futher injection needed */ + return; + } + + /* + * Verify that the guest is interruptable and the above logic has not + * already queued an event for injection. + */ + if ((gi & HWINTR_BLOCKING) != 0) { + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "Guest Interruptibility-state %#x", vector, gi); + goto cantinject; + } + if ((info & VMCS_INTR_VALID) != 0) { + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "VM-entry intr info %#x", vector, info); + goto cantinject; + } + rflags = vmcs_read(VMCS_GUEST_RFLAGS); + if ((rflags & PSL_I) == 0) { + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "rflags %#lx", vector, rflags); + goto cantinject; + } + + /* Inject the interrupt */ + info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; + info |= vector; + vmcs_write(VMCS_ENTRY_INTR_INFO, info); + + if (extint_pending) { + vm_extint_clear(vmx->vm, vcpu); + vatpic_intr_accepted(vmx->vm, vector); + + /* + * After we accepted the current ExtINT the PIC may + * have posted another one. If that is the case, set + * the Interrupt Window Exiting execution control so + * we can inject that one too. + * + * Also, interrupt window exiting allows us to inject any + * pending APIC vector that was preempted by the ExtINT + * as soon as possible. This applies both for the software + * emulated vlapic and the hardware assisted virtual APIC. + */ + vmx_set_int_window_exiting(vmx, vcpu); + } else { + /* Update the Local APIC ISR */ + vlapic_intr_accepted(vlapic, vector); + } + + VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); + return; + +cantinject: + /* + * Set the Interrupt Window Exiting execution control so we can inject + * the interrupt as soon as blocking condition goes away. 
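+ *
+ * (The conditions that route here are: an event already queued in the
+ * VM-entry interruption-information field, STI/MOV-SS blocking in the
+ * guest interruptibility-state, or guest RFLAGS.IF being clear.)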
+ */ + vmx_set_int_window_exiting(vmx, vcpu); +} +#else +static void +vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, + uint64_t guestrip) +{ + int vector, need_nmi_exiting, extint_pending; + uint64_t rflags, entryinfo; + uint32_t gi, info; + + vlapic_tmr_update(vlapic); + + if (vmx->state[vcpu].nextrip != guestrip) { + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + if (gi & HWINTR_BLOCKING) { + VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking " + "cleared due to rip change: %#lx/%#lx", + vmx->state[vcpu].nextrip, guestrip); + gi &= ~HWINTR_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); + } + } + + if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { + KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " + "intinfo is not valid: %#lx", __func__, entryinfo)); + + info = vmcs_read(VMCS_ENTRY_INTR_INFO); + KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " + "pending exception: %#lx/%#x", __func__, entryinfo, info)); + + info = entryinfo; + vector = info & 0xff; + if (vector == IDT_BP || vector == IDT_OF) { + /* + * VT-x requires #BP and #OF to be injected as software + * exceptions. + */ + info &= ~VMCS_INTR_T_MASK; + info |= VMCS_INTR_T_SWEXCEPTION; + } + + if (info & VMCS_INTR_DEL_ERRCODE) + vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); + + vmcs_write(VMCS_ENTRY_INTR_INFO, info); + } + + if (vm_nmi_pending(vmx->vm, vcpu)) { + /* + * If there are no conditions blocking NMI injection then + * inject it directly here otherwise enable "NMI window + * exiting" to inject it as soon as we can. + * + * We also check for STI_BLOCKING because some implementations + * don't allow NMI injection in this case. If we are running + * on a processor that doesn't have this restriction it will + * immediately exit and the NMI will be injected in the + * "NMI window exiting" handler. + */ + need_nmi_exiting = 1; + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { + info = vmcs_read(VMCS_ENTRY_INTR_INFO); + if ((info & VMCS_INTR_VALID) == 0) { + vmx_inject_nmi(vmx, vcpu); + need_nmi_exiting = 0; + } else { + VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI " + "due to VM-entry intr info %#x", info); + } + } else { + VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to " + "Guest Interruptibility-state %#x", gi); + } + + if (need_nmi_exiting) + vmx_set_nmi_window_exiting(vmx, vcpu); + } + + extint_pending = vm_extint_pending(vmx->vm, vcpu); + + if (!extint_pending && virtual_interrupt_delivery) { + vmx_inject_pir(vlapic); + return; + } + + /* + * If interrupt-window exiting is already in effect then don't bother + * checking for pending interrupts. This is just an optimization and + * not needed for correctness. + */ + if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) { + VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to " + "pending int_window_exiting"); + return; + } + + if (!extint_pending) { + /* Ask the local apic for a vector to inject */ + if (!vlapic_pending_intr(vlapic, &vector)) + return; + + /* + * From the Intel SDM, Volume 3, Section "Maskable + * Hardware Interrupts": + * - maskable interrupt vectors [16,255] can be delivered + * through the local APIC. 
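+ *
+ * (The local APIC treats vectors 0 through 15 as illegal, which is
+ * why the lower bound here is 16 rather than the 0 allowed in the
+ * ExtINT case.)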
+ */ + KASSERT(vector >= 16 && vector <= 255, + ("invalid vector %d from local APIC", vector)); + } else { + /* Ask the legacy pic for a vector to inject */ + vatpic_pending_intr(vmx->vm, &vector); + + /* + * From the Intel SDM, Volume 3, Section "Maskable + * Hardware Interrupts": + * - maskable interrupt vectors [0,255] can be delivered + * through the INTR pin. + */ + KASSERT(vector >= 0 && vector <= 255, + ("invalid vector %d from INTR", vector)); + } + + /* Check RFLAGS.IF and the interruptibility state of the guest */ + rflags = vmcs_read(VMCS_GUEST_RFLAGS); + if ((rflags & PSL_I) == 0) { + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "rflags %#lx", vector, rflags); + goto cantinject; + } + + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + if (gi & HWINTR_BLOCKING) { + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "Guest Interruptibility-state %#x", vector, gi); + goto cantinject; + } + + info = vmcs_read(VMCS_ENTRY_INTR_INFO); + if (info & VMCS_INTR_VALID) { + /* + * This is expected and could happen for multiple reasons: + * - A vectoring VM-entry was aborted due to astpending + * - A VM-exit happened during event injection. + * - An exception was injected above. + * - An NMI was injected above or after "NMI window exiting" + */ + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "VM-entry intr info %#x", vector, info); + goto cantinject; + } + + /* Inject the interrupt */ + info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; + info |= vector; + vmcs_write(VMCS_ENTRY_INTR_INFO, info); + + if (!extint_pending) { + /* Update the Local APIC ISR */ + vlapic_intr_accepted(vlapic, vector); + } else { + vm_extint_clear(vmx->vm, vcpu); + vatpic_intr_accepted(vmx->vm, vector); + + /* + * After we accepted the current ExtINT the PIC may + * have posted another one. If that is the case, set + * the Interrupt Window Exiting execution control so + * we can inject that one too. + * + * Also, interrupt window exiting allows us to inject any + * pending APIC vector that was preempted by the ExtINT + * as soon as possible. This applies both for the software + * emulated vlapic and the hardware assisted virtual APIC. + */ + vmx_set_int_window_exiting(vmx, vcpu); + } + + VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); + + return; + +cantinject: + /* + * Set the Interrupt Window Exiting execution control so we can inject + * the interrupt as soon as blocking condition goes away. + */ + vmx_set_int_window_exiting(vmx, vcpu); +} +#endif /* __FreeBSD__ */ + +/* + * If the Virtual NMIs execution control is '1' then the logical processor + * tracks virtual-NMI blocking in the Guest Interruptibility-state field of + * the VMCS. An IRET instruction in VMX non-root operation will remove any + * virtual-NMI blocking. + * + * This unblocking occurs even if the IRET causes a fault. In this case the + * hypervisor needs to restore virtual-NMI blocking before resuming the guest. 
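+ *
+ * The exit handlers recognize that situation via bit 12 of the exit
+ * qualification (EXIT_QUAL_NMIUDTI, "NMI unblocking due to IRET") and
+ * then do, roughly:
+ *
+ *	if ((qual & EXIT_QUAL_NMIUDTI) != 0)
+ *		vmx_restore_nmi_blocking(vmx, vcpuid);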
+ */ +static void +vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid) +{ + uint32_t gi; + + VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking"); + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); +} + +static void +vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid) +{ + uint32_t gi; + + VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking"); + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); +} + +static void +vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid) +{ + uint32_t gi; + + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING, + ("NMI blocking is not in effect %#x", gi)); +} + +static int +vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) +{ + struct vmxctx *vmxctx; + uint64_t xcrval; + const struct xsave_limits *limits; + + vmxctx = &vmx->ctx[vcpu]; + limits = vmm_get_xsave_limits(); + + /* + * Note that the processor raises a GP# fault on its own if + * xsetbv is executed for CPL != 0, so we do not have to + * emulate that fault here. + */ + + /* Only xcr0 is supported. */ + if (vmxctx->guest_rcx != 0) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* We only handle xcr0 if both the host and guest have XSAVE enabled. */ + if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) { + vm_inject_ud(vmx->vm, vcpu); + return (HANDLED); + } + + xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff); + if ((xcrval & ~limits->xcr0_allowed) != 0) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + if (!(xcrval & XFEATURE_ENABLED_X87)) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* AVX (YMM_Hi128) requires SSE. */ + if (xcrval & XFEATURE_ENABLED_AVX && + (xcrval & XFEATURE_AVX) != XFEATURE_AVX) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* + * AVX512 requires base AVX (YMM_Hi128) as well as OpMask, + * ZMM_Hi256, and Hi16_ZMM. + */ + if (xcrval & XFEATURE_AVX512 && + (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) != + (XFEATURE_AVX512 | XFEATURE_AVX)) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* + * Intel MPX requires both bound register state flags to be + * set. + */ + if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) != + ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* + * This runs "inside" vmrun() with the guest's FPU state, so + * modifying xcr0 directly modifies the guest's xcr0, not the + * host's. 
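The xcr0 checks above can be restated as one small, self-contained predicate over the candidate value. This is only a sketch: the bit positions follow the Intel SDM's XCR0 layout and are not the XFEATURE_* composites used in this file, and the host-side XSAVE and %rcx checks are left out.

#include <stdbool.h>
#include <stdint.h>

#define	XCR0_X87	(1ULL << 0)
#define	XCR0_SSE	(1ULL << 1)
#define	XCR0_AVX	(1ULL << 2)
#define	XCR0_BNDREGS	(1ULL << 3)
#define	XCR0_BNDCSR	(1ULL << 4)
#define	XCR0_OPMASK	(1ULL << 5)
#define	XCR0_ZMM_HI256	(1ULL << 6)
#define	XCR0_HI16_ZMM	(1ULL << 7)

static bool
xcr0_value_ok(uint64_t v, uint64_t allowed)
{
	const uint64_t avx512 = XCR0_OPMASK | XCR0_ZMM_HI256 | XCR0_HI16_ZMM;

	if ((v & ~allowed) != 0)
		return (false);
	if ((v & XCR0_X87) == 0)
		return (false);		/* x87 state is mandatory */
	if ((v & XCR0_AVX) != 0 && (v & XCR0_SSE) == 0)
		return (false);		/* AVX requires SSE */
	if ((v & avx512) != 0 &&
	    ((v & avx512) != avx512 || (v & XCR0_AVX) == 0))
		return (false);		/* AVX512 needs all three bits plus AVX */
	if (((v & XCR0_BNDREGS) != 0) != ((v & XCR0_BNDCSR) != 0))
		return (false);		/* MPX bits travel together */
	return (true);
}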
+ */ + load_xcr(0, xcrval); + return (HANDLED); +} + +static uint64_t +vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident) +{ + const struct vmxctx *vmxctx; + + vmxctx = &vmx->ctx[vcpu]; + + switch (ident) { + case 0: + return (vmxctx->guest_rax); + case 1: + return (vmxctx->guest_rcx); + case 2: + return (vmxctx->guest_rdx); + case 3: + return (vmxctx->guest_rbx); + case 4: + return (vmcs_read(VMCS_GUEST_RSP)); + case 5: + return (vmxctx->guest_rbp); + case 6: + return (vmxctx->guest_rsi); + case 7: + return (vmxctx->guest_rdi); + case 8: + return (vmxctx->guest_r8); + case 9: + return (vmxctx->guest_r9); + case 10: + return (vmxctx->guest_r10); + case 11: + return (vmxctx->guest_r11); + case 12: + return (vmxctx->guest_r12); + case 13: + return (vmxctx->guest_r13); + case 14: + return (vmxctx->guest_r14); + case 15: + return (vmxctx->guest_r15); + default: + panic("invalid vmx register %d", ident); + } +} + +static void +vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval) +{ + struct vmxctx *vmxctx; + + vmxctx = &vmx->ctx[vcpu]; + + switch (ident) { + case 0: + vmxctx->guest_rax = regval; + break; + case 1: + vmxctx->guest_rcx = regval; + break; + case 2: + vmxctx->guest_rdx = regval; + break; + case 3: + vmxctx->guest_rbx = regval; + break; + case 4: + vmcs_write(VMCS_GUEST_RSP, regval); + break; + case 5: + vmxctx->guest_rbp = regval; + break; + case 6: + vmxctx->guest_rsi = regval; + break; + case 7: + vmxctx->guest_rdi = regval; + break; + case 8: + vmxctx->guest_r8 = regval; + break; + case 9: + vmxctx->guest_r9 = regval; + break; + case 10: + vmxctx->guest_r10 = regval; + break; + case 11: + vmxctx->guest_r11 = regval; + break; + case 12: + vmxctx->guest_r12 = regval; + break; + case 13: + vmxctx->guest_r13 = regval; + break; + case 14: + vmxctx->guest_r14 = regval; + break; + case 15: + vmxctx->guest_r15 = regval; + break; + default: + panic("invalid vmx register %d", ident); + } +} + +static int +vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual) +{ + uint64_t crval, regval; + + /* We only handle mov to %cr0 at this time */ + if ((exitqual & 0xf0) != 0x00) + return (UNHANDLED); + + regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); + + vmcs_write(VMCS_CR0_SHADOW, regval); + + crval = regval | cr0_ones_mask; + crval &= ~cr0_zeros_mask; + vmcs_write(VMCS_GUEST_CR0, crval); + + if (regval & CR0_PG) { + uint64_t efer, entry_ctls; + + /* + * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and + * the "IA-32e mode guest" bit in VM-entry control must be + * equal. + */ + efer = vmcs_read(VMCS_GUEST_IA32_EFER); + if (efer & EFER_LME) { + efer |= EFER_LMA; + vmcs_write(VMCS_GUEST_IA32_EFER, efer); + entry_ctls = vmcs_read(VMCS_ENTRY_CTLS); + entry_ctls |= VM_ENTRY_GUEST_LMA; + vmcs_write(VMCS_ENTRY_CTLS, entry_ctls); + } + } + + return (HANDLED); +} + +static int +vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual) +{ + uint64_t crval, regval; + + /* We only handle mov to %cr4 at this time */ + if ((exitqual & 0xf0) != 0x00) + return (UNHANDLED); + + regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); + + vmcs_write(VMCS_CR4_SHADOW, regval); + + crval = regval | cr4_ones_mask; + crval &= ~cr4_zeros_mask; + vmcs_write(VMCS_GUEST_CR4, crval); + + return (HANDLED); +} + +static int +vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual) +{ + struct vlapic *vlapic; + uint64_t cr8; + int regnum; + + /* We only handle mov %cr8 to/from a register at this time. 
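The %cr0/%cr4 handlers above and the %cr8 handler below all decode the same exit-qualification format. Roughly, per the Intel SDM's "Exit Qualification for Control-Register Accesses" (illustrative names, not this file's tests):

#define	CRQUAL_CRNUM(q)		((int)((q) & 0xf))	/* which %crN */
#define	CRQUAL_ACCESS_TYPE(q)	((int)(((q) >> 4) & 0x3))
#define	CRQUAL_MOV_TO_CR	0
#define	CRQUAL_MOV_FROM_CR	1
#define	CRQUAL_CLTS		2
#define	CRQUAL_LMSW		3
#define	CRQUAL_GPR(q)		((int)(((q) >> 8) & 0xf))	/* MOV operand */

Under that reading, the (exitqual & 0xf0) != 0 test above simply rejects anything other than a plain MOV to the register, while the %cr8 path accepts MOV in both directions and uses bit 4 to tell them apart.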
*/ + if ((exitqual & 0xe0) != 0x00) { + return (UNHANDLED); + } + + vlapic = vm_lapic(vmx->vm, vcpu); + regnum = (exitqual >> 8) & 0xf; + if (exitqual & 0x10) { + cr8 = vlapic_get_cr8(vlapic); + vmx_set_guest_reg(vmx, vcpu, regnum, cr8); + } else { + cr8 = vmx_get_guest_reg(vmx, vcpu, regnum); + vlapic_set_cr8(vlapic, cr8); + } + + return (HANDLED); +} + +/* + * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL + */ +static int +vmx_cpl(void) +{ + uint32_t ssar; + + ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS); + return ((ssar >> 5) & 0x3); +} + +static enum vm_cpu_mode +vmx_cpu_mode(void) +{ + uint32_t csar; + + if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) { + csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); + if (csar & 0x2000) + return (CPU_MODE_64BIT); /* CS.L = 1 */ + else + return (CPU_MODE_COMPATIBILITY); + } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) { + return (CPU_MODE_PROTECTED); + } else { + return (CPU_MODE_REAL); + } +} + +static enum vm_paging_mode +vmx_paging_mode(void) +{ + + if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG)) + return (PAGING_MODE_FLAT); + if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE)) + return (PAGING_MODE_32); + if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) + return (PAGING_MODE_64); + else + return (PAGING_MODE_PAE); +} + +static uint64_t +inout_str_index(struct vmx *vmx, int vcpuid, int in) +{ + uint64_t val; + int error; + enum vm_reg_name reg; + + reg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; + error = vmx_getreg(vmx, vcpuid, reg, &val); + KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error)); + return (val); +} + +static uint64_t +inout_str_count(struct vmx *vmx, int vcpuid, int rep) +{ + uint64_t val; + int error; + + if (rep) { + error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val); + KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error)); + } else { + val = 1; + } + return (val); +} + +static int +inout_str_addrsize(uint32_t inst_info) +{ + uint32_t size; + + size = (inst_info >> 7) & 0x7; + switch (size) { + case 0: + return (2); /* 16 bit */ + case 1: + return (4); /* 32 bit */ + case 2: + return (8); /* 64 bit */ + default: + panic("%s: invalid size encoding %d", __func__, size); + } +} + +static void +inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in, + struct vm_inout_str *vis) +{ + int error, s; + + if (in) { + vis->seg_name = VM_REG_GUEST_ES; + } else { + s = (inst_info >> 15) & 0x7; + vis->seg_name = vm_segment_name(s); + } + + error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc); + KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error)); +} + +static void +vmx_paging_info(struct vm_guest_paging *paging) +{ + paging->cr3 = vmcs_guest_cr3(); + paging->cpl = vmx_cpl(); + paging->cpu_mode = vmx_cpu_mode(); + paging->paging_mode = vmx_paging_mode(); +} + +static void +vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) +{ + struct vm_guest_paging *paging; + uint32_t csar; + + paging = &vmexit->u.inst_emul.paging; + + vmexit->exitcode = VM_EXITCODE_INST_EMUL; + vmexit->inst_length = 0; + vmexit->u.inst_emul.gpa = gpa; + vmexit->u.inst_emul.gla = gla; + vmx_paging_info(paging); + switch (paging->cpu_mode) { + case CPU_MODE_REAL: + vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); + vmexit->u.inst_emul.cs_d = 0; + break; + case CPU_MODE_PROTECTED: + case CPU_MODE_COMPATIBILITY: + vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); + csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); + vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar); 
+ break; + default: + vmexit->u.inst_emul.cs_base = 0; + vmexit->u.inst_emul.cs_d = 0; + break; + } + vie_init(&vmexit->u.inst_emul.vie, NULL, 0); +} + +static int +ept_fault_type(uint64_t ept_qual) +{ + int fault_type; + + if (ept_qual & EPT_VIOLATION_DATA_WRITE) + fault_type = VM_PROT_WRITE; + else if (ept_qual & EPT_VIOLATION_INST_FETCH) + fault_type = VM_PROT_EXECUTE; + else + fault_type= VM_PROT_READ; + + return (fault_type); +} + +static boolean_t +ept_emulation_fault(uint64_t ept_qual) +{ + int read, write; + + /* EPT fault on an instruction fetch doesn't make sense here */ + if (ept_qual & EPT_VIOLATION_INST_FETCH) + return (FALSE); + + /* EPT fault must be a read fault or a write fault */ + read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; + write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; + if ((read | write) == 0) + return (FALSE); + + /* + * The EPT violation must have been caused by accessing a + * guest-physical address that is a translation of a guest-linear + * address. + */ + if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || + (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { + return (FALSE); + } + + return (TRUE); +} + +static __inline int +apic_access_virtualization(struct vmx *vmx, int vcpuid) +{ + uint32_t proc_ctls2; + + proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; + return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0); +} + +static __inline int +x2apic_virtualization(struct vmx *vmx, int vcpuid) +{ + uint32_t proc_ctls2; + + proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; + return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0); +} + +static int +vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic, + uint64_t qual) +{ + int error, handled, offset; + uint32_t *apic_regs, vector; + bool retu; + + handled = HANDLED; + offset = APIC_WRITE_OFFSET(qual); + + if (!apic_access_virtualization(vmx, vcpuid)) { + /* + * In general there should not be any APIC write VM-exits + * unless APIC-access virtualization is enabled. + * + * However self-IPI virtualization can legitimately trigger + * an APIC-write VM-exit so treat it specially. + */ + if (x2apic_virtualization(vmx, vcpuid) && + offset == APIC_OFFSET_SELF_IPI) { + apic_regs = (uint32_t *)(vlapic->apic_page); + vector = apic_regs[APIC_OFFSET_SELF_IPI / 4]; + vlapic_self_ipi_handler(vlapic, vector); + return (HANDLED); + } else + return (UNHANDLED); + } + + switch (offset) { + case APIC_OFFSET_ID: + vlapic_id_write_handler(vlapic); + break; + case APIC_OFFSET_LDR: + vlapic_ldr_write_handler(vlapic); + break; + case APIC_OFFSET_DFR: + vlapic_dfr_write_handler(vlapic); + break; + case APIC_OFFSET_SVR: + vlapic_svr_write_handler(vlapic); + break; + case APIC_OFFSET_ESR: + vlapic_esr_write_handler(vlapic); + break; + case APIC_OFFSET_ICR_LOW: + retu = false; + error = vlapic_icrlo_write_handler(vlapic, &retu); + if (error != 0 || retu) + handled = UNHANDLED; + break; + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_LVT ... 
APIC_OFFSET_ERROR_LVT: + vlapic_lvt_write_handler(vlapic, offset); + break; + case APIC_OFFSET_TIMER_ICR: + vlapic_icrtmr_write_handler(vlapic); + break; + case APIC_OFFSET_TIMER_DCR: + vlapic_dcr_write_handler(vlapic); + break; + default: + handled = UNHANDLED; + break; + } + return (handled); +} + +static bool +apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa) +{ + + if (apic_access_virtualization(vmx, vcpuid) && + (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE)) + return (true); + else + return (false); +} + +static int +vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) +{ + uint64_t qual; + int access_type, offset, allowed; + + if (!apic_access_virtualization(vmx, vcpuid)) + return (UNHANDLED); + + qual = vmexit->u.vmx.exit_qualification; + access_type = APIC_ACCESS_TYPE(qual); + offset = APIC_ACCESS_OFFSET(qual); + + allowed = 0; + if (access_type == 0) { + /* + * Read data access to the following registers is expected. + */ + switch (offset) { + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_CCR: + allowed = 1; + break; + default: + break; + } + } else if (access_type == 1) { + /* + * Write data access to the following registers is expected. + */ + switch (offset) { + case APIC_OFFSET_VER: + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_CCR: + allowed = 1; + break; + default: + break; + } + } + + if (allowed) { + vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset, + VIE_INVALID_GLA); + } + + /* + * Regardless of whether the APIC-access is allowed this handler + * always returns UNHANDLED: + * - if the access is allowed then it is handled by emulating the + * instruction that caused the VM-exit (outside the critical section) + * - if the access is not allowed then it will be converted to an + * exitcode of VM_EXITCODE_VMX and will be dealt with in userland. 
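For reference, the exit qualification consumed by the two APIC handlers above packs the page offset together with an access type. A rough decode per the Intel SDM (illustrative names, not the APIC_ACCESS_* macros used here):

#define	APICQUAL_OFFSET(q)	((int)((q) & 0xfff))	/* offset into APIC page */
#define	APICQUAL_TYPE(q)	((int)(((q) >> 12) & 0xf))
#define	APICQUAL_TYPE_READ	0	/* linear access, data read */
#define	APICQUAL_TYPE_WRITE	1	/* linear access, data write */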
+ */ + return (UNHANDLED); +} + +static enum task_switch_reason +vmx_task_switch_reason(uint64_t qual) +{ + int reason; + + reason = (qual >> 30) & 0x3; + switch (reason) { + case 0: + return (TSR_CALL); + case 1: + return (TSR_IRET); + case 2: + return (TSR_JMP); + case 3: + return (TSR_IDT_GATE); + default: + panic("%s: invalid reason %d", __func__, reason); + } +} + +static int +emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) +{ + int error; + + if (lapic_msr(num)) + error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu); + else + error = vmx_wrmsr(vmx, vcpuid, num, val, retu); + + return (error); +} + +static int +emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu) +{ + struct vmxctx *vmxctx; + uint64_t result; + uint32_t eax, edx; + int error; + + if (lapic_msr(num)) + error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu); + else + error = vmx_rdmsr(vmx, vcpuid, num, &result, retu); + + if (error == 0) { + eax = result; + vmxctx = &vmx->ctx[vcpuid]; + error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax); + KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error)); + + edx = result >> 32; + error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx); + KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error)); + } + + return (error); +} + +#ifndef __FreeBSD__ +#define __predict_false(x) (x) +#endif + +static int +vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) +{ + int error, errcode, errcode_valid, handled, in; + struct vmxctx *vmxctx; + struct vlapic *vlapic; + struct vm_inout_str *vis; + struct vm_task_switch *ts; + uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; + uint32_t intr_type, intr_vec, reason; + uint64_t exitintinfo, qual, gpa; + bool retu; + + CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); + CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); + + handled = UNHANDLED; + vmxctx = &vmx->ctx[vcpu]; + + qual = vmexit->u.vmx.exit_qualification; + reason = vmexit->u.vmx.exit_reason; + vmexit->exitcode = VM_EXITCODE_BOGUS; + + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); + SDT_PROBE3(vmm, vmx, exit, entry, vmx, vcpu, vmexit); + + /* + * VM-entry failures during or after loading guest state. + * + * These VM-exits are uncommon but must be handled specially + * as most VM-exit fields are not populated as usual. + */ + if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) { + VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry"); +#ifdef __FreeBSD__ + __asm __volatile("int $18"); +#else + vmm_call_trap(T_MCE); +#endif + return (1); + } + + /* + * VM exits that can be triggered during event delivery need to + * be handled specially by re-injecting the event if the IDT + * vectoring information field's valid bit is set. + * + * See "Information for VM Exits During Event Delivery" in Intel SDM + * for details. 
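The "exit intinfo" handed to vm_exit_intinfo() just below is the 32-bit IDT-vectoring-information word with the error code stacked in the upper half. A tiny restatement of that packing, as a sketch:

#include <stdint.h>

static uint64_t
pack_exitintinfo(uint32_t vectoring_info, uint32_t errcode)
{
	/* low 32 bits: interruption info, high 32 bits: error code */
	return ((uint64_t)errcode << 32 | vectoring_info);
}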
+ */ + idtvec_info = vmcs_idt_vectoring_info(); + if (idtvec_info & VMCS_IDT_VEC_VALID) { + idtvec_info &= ~(1 << 12); /* clear undefined bit */ + exitintinfo = idtvec_info; + if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { + idtvec_err = vmcs_idt_vectoring_err(); + exitintinfo |= (uint64_t)idtvec_err << 32; + } + error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo); + KASSERT(error == 0, ("%s: vm_set_intinfo error %d", + __func__, error)); + + /* + * If 'virtual NMIs' are being used and the VM-exit + * happened while injecting an NMI during the previous + * VM-entry, then clear "blocking by NMI" in the + * Guest Interruptibility-State so the NMI can be + * reinjected on the subsequent VM-entry. + * + * However, if the NMI was being delivered through a task + * gate, then the new task must start execution with NMIs + * blocked so don't clear NMI blocking in this case. + */ + intr_type = idtvec_info & VMCS_INTR_T_MASK; + if (intr_type == VMCS_INTR_T_NMI) { + if (reason != EXIT_REASON_TASK_SWITCH) + vmx_clear_nmi_blocking(vmx, vcpu); + else + vmx_assert_nmi_blocking(vmx, vcpu); + } + + /* + * Update VM-entry instruction length if the event being + * delivered was a software interrupt or software exception. + */ + if (intr_type == VMCS_INTR_T_SWINTR || + intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION || + intr_type == VMCS_INTR_T_SWEXCEPTION) { + vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); + } + } + + switch (reason) { + case EXIT_REASON_TASK_SWITCH: + ts = &vmexit->u.task_switch; + ts->tsssel = qual & 0xffff; + ts->reason = vmx_task_switch_reason(qual); + ts->ext = 0; + ts->errcode_valid = 0; + vmx_paging_info(&ts->paging); + /* + * If the task switch was due to a CALL, JMP, IRET, software + * interrupt (INT n) or software exception (INT3, INTO), + * then the saved %rip references the instruction that caused + * the task switch. The instruction length field in the VMCS + * is valid in this case. + * + * In all other cases (e.g., NMI, hardware exception) the + * saved %rip is one that would have been saved in the old TSS + * had the task switch completed normally so the instruction + * length field is not needed in this case and is explicitly + * set to 0. + */ + if (ts->reason == TSR_IDT_GATE) { + KASSERT(idtvec_info & VMCS_IDT_VEC_VALID, + ("invalid idtvec_info %#x for IDT task switch", + idtvec_info)); + intr_type = idtvec_info & VMCS_INTR_T_MASK; + if (intr_type != VMCS_INTR_T_SWINTR && + intr_type != VMCS_INTR_T_SWEXCEPTION && + intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) { + /* Task switch triggered by external event */ + ts->ext = 1; + vmexit->inst_length = 0; + if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { + ts->errcode_valid = 1; + ts->errcode = vmcs_idt_vectoring_err(); + } + } + } + vmexit->exitcode = VM_EXITCODE_TASK_SWITCH; + SDT_PROBE4(vmm, vmx, exit, taskswitch, vmx, vcpu, vmexit, ts); + VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, " + "%s errcode 0x%016lx", ts->reason, ts->tsssel, + ts->ext ? 
"external" : "internal", + ((uint64_t)ts->errcode << 32) | ts->errcode_valid); + break; + case EXIT_REASON_CR_ACCESS: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); + SDT_PROBE4(vmm, vmx, exit, craccess, vmx, vcpu, vmexit, qual); + switch (qual & 0xf) { + case 0: + handled = vmx_emulate_cr0_access(vmx, vcpu, qual); + break; + case 4: + handled = vmx_emulate_cr4_access(vmx, vcpu, qual); + break; + case 8: + handled = vmx_emulate_cr8_access(vmx, vcpu, qual); + break; + } + break; + case EXIT_REASON_RDMSR: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1); + retu = false; + ecx = vmxctx->guest_rcx; + VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx); + SDT_PROBE4(vmm, vmx, exit, rdmsr, vmx, vcpu, vmexit, ecx); + error = emulate_rdmsr(vmx, vcpu, ecx, &retu); + if (error) { + vmexit->exitcode = VM_EXITCODE_RDMSR; + vmexit->u.msr.code = ecx; + } else if (!retu) { + handled = HANDLED; + } else { + /* Return to userspace with a valid exitcode */ + KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, + ("emulate_rdmsr retu with bogus exitcode")); + } + break; + case EXIT_REASON_WRMSR: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1); + retu = false; + eax = vmxctx->guest_rax; + ecx = vmxctx->guest_rcx; + edx = vmxctx->guest_rdx; + VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx", + ecx, (uint64_t)edx << 32 | eax); + SDT_PROBE5(vmm, vmx, exit, wrmsr, vmx, vmexit, vcpu, ecx, + (uint64_t)edx << 32 | eax); + error = emulate_wrmsr(vmx, vcpu, ecx, + (uint64_t)edx << 32 | eax, &retu); + if (error) { + vmexit->exitcode = VM_EXITCODE_WRMSR; + vmexit->u.msr.code = ecx; + vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; + } else if (!retu) { + handled = HANDLED; + } else { + /* Return to userspace with a valid exitcode */ + KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, + ("emulate_wrmsr retu with bogus exitcode")); + } + break; + case EXIT_REASON_HLT: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); + SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit); + vmexit->exitcode = VM_EXITCODE_HLT; + vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); + if (virtual_interrupt_delivery) + vmexit->u.hlt.intr_status = + vmcs_read(VMCS_GUEST_INTR_STATUS); + else + vmexit->u.hlt.intr_status = 0; + break; + case EXIT_REASON_MTF: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); + SDT_PROBE3(vmm, vmx, exit, mtrap, vmx, vcpu, vmexit); + vmexit->exitcode = VM_EXITCODE_MTRAP; + vmexit->inst_length = 0; + break; + case EXIT_REASON_PAUSE: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1); + SDT_PROBE3(vmm, vmx, exit, pause, vmx, vcpu, vmexit); + vmexit->exitcode = VM_EXITCODE_PAUSE; + break; + case EXIT_REASON_INTR_WINDOW: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); + SDT_PROBE3(vmm, vmx, exit, intrwindow, vmx, vcpu, vmexit); + vmx_clear_int_window_exiting(vmx, vcpu); + return (1); + case EXIT_REASON_EXT_INTR: + /* + * External interrupts serve only to cause VM exits and allow + * the host interrupt handler to run. + * + * If this external interrupt triggers a virtual interrupt + * to a VM, then that state will be recorded by the + * host interrupt handler in the VM's softc. We will inject + * this virtual interrupt during the subsequent VM enter. + */ + intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); + SDT_PROBE4(vmm, vmx, exit, interrupt, + vmx, vcpu, vmexit, intr_info); + + /* + * XXX: Ignore this exit if VMCS_INTR_VALID is not set. + * This appears to be a bug in VMware Fusion? 
+ */ + if (!(intr_info & VMCS_INTR_VALID)) + return (1); + KASSERT((intr_info & VMCS_INTR_VALID) != 0 && + (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, + ("VM exit interruption info invalid: %#x", intr_info)); + vmx_trigger_hostintr(intr_info & 0xff); + + /* + * This is special. We want to treat this as an 'handled' + * VM-exit but not increment the instruction pointer. + */ + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); + return (1); + case EXIT_REASON_NMI_WINDOW: + SDT_PROBE3(vmm, vmx, exit, nmiwindow, vmx, vcpu, vmexit); + /* Exit to allow the pending virtual NMI to be injected */ + if (vm_nmi_pending(vmx->vm, vcpu)) + vmx_inject_nmi(vmx, vcpu); + vmx_clear_nmi_window_exiting(vmx, vcpu); + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1); + return (1); + case EXIT_REASON_INOUT: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1); + vmexit->exitcode = VM_EXITCODE_INOUT; + vmexit->u.inout.bytes = (qual & 0x7) + 1; + vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0; + vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; + vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0; + vmexit->u.inout.port = (uint16_t)(qual >> 16); + vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); + if (vmexit->u.inout.string) { + inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO); + vmexit->exitcode = VM_EXITCODE_INOUT_STR; + vis = &vmexit->u.inout_str; + vmx_paging_info(&vis->paging); + vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS); + vis->cr0 = vmcs_read(VMCS_GUEST_CR0); + vis->index = inout_str_index(vmx, vcpu, in); + vis->count = inout_str_count(vmx, vcpu, vis->inout.rep); + vis->addrsize = inout_str_addrsize(inst_info); + inout_str_seginfo(vmx, vcpu, inst_info, in, vis); + } + SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpu, vmexit); + break; + case EXIT_REASON_CPUID: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); + SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpu, vmexit); + handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); + break; + case EXIT_REASON_EXCEPTION: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1); + intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); + KASSERT((intr_info & VMCS_INTR_VALID) != 0, + ("VM exit interruption info invalid: %#x", intr_info)); + + intr_vec = intr_info & 0xff; + intr_type = intr_info & VMCS_INTR_T_MASK; + + /* + * If Virtual NMIs control is 1 and the VM-exit is due to a + * fault encountered during the execution of IRET then we must + * restore the state of "virtual-NMI blocking" before resuming + * the guest. + * + * See "Resuming Guest Software after Handling an Exception". + * See "Information for VM Exits Due to Vectored Events". + */ + if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && + (intr_vec != IDT_DF) && + (intr_info & EXIT_QUAL_NMIUDTI) != 0) + vmx_restore_nmi_blocking(vmx, vcpu); + + /* + * The NMI has already been handled in vmx_exit_handle_nmi(). + */ + if (intr_type == VMCS_INTR_T_NMI) + return (1); + + /* + * Call the machine check handler by hand. Also don't reflect + * the machine check back into the guest. + */ + if (intr_vec == IDT_MC) { + VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler"); +#ifdef __FreeBSD__ + __asm __volatile("int $18"); +#else + vmm_call_trap(T_MCE); +#endif + return (1); + } + + if (intr_vec == IDT_PF) { + error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual); + KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d", + __func__, error)); + } + + /* + * Software exceptions exhibit trap-like behavior. This in + * turn requires populating the VM-entry instruction length + * so that the %rip in the trap frame is past the INT3/INTO + * instruction. 
+ */ + if (intr_type == VMCS_INTR_T_SWEXCEPTION) + vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); + + /* Reflect all other exceptions back into the guest */ + errcode_valid = errcode = 0; + if (intr_info & VMCS_INTR_DEL_ERRCODE) { + errcode_valid = 1; + errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE); + } + VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into " + "the guest", intr_vec, errcode); + SDT_PROBE5(vmm, vmx, exit, exception, + vmx, vcpu, vmexit, intr_vec, errcode); + error = vm_inject_exception(vmx->vm, vcpu, intr_vec, + errcode_valid, errcode, 0); + KASSERT(error == 0, ("%s: vm_inject_exception error %d", + __func__, error)); + return (1); + + case EXIT_REASON_EPT_FAULT: + /* + * If 'gpa' lies within the address space allocated to + * memory then this must be a nested page fault otherwise + * this must be an instruction that accesses MMIO space. + */ + gpa = vmcs_gpa(); + if (vm_mem_allocated(vmx->vm, vcpu, gpa) || + apic_access_fault(vmx, vcpu, gpa)) { + vmexit->exitcode = VM_EXITCODE_PAGING; + vmexit->inst_length = 0; + vmexit->u.paging.gpa = gpa; + vmexit->u.paging.fault_type = ept_fault_type(qual); + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1); + SDT_PROBE5(vmm, vmx, exit, nestedfault, + vmx, vcpu, vmexit, gpa, qual); + } else if (ept_emulation_fault(qual)) { + vmexit_inst_emul(vmexit, gpa, vmcs_gla()); + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1); + SDT_PROBE4(vmm, vmx, exit, mmiofault, + vmx, vcpu, vmexit, gpa); + } + /* + * If Virtual NMIs control is 1 and the VM-exit is due to an + * EPT fault during the execution of IRET then we must restore + * the state of "virtual-NMI blocking" before resuming. + * + * See description of "NMI unblocking due to IRET" in + * "Exit Qualification for EPT Violations". + */ + if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && + (qual & EXIT_QUAL_NMIUDTI) != 0) + vmx_restore_nmi_blocking(vmx, vcpu); + break; + case EXIT_REASON_VIRTUALIZED_EOI: + vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI; + vmexit->u.ioapic_eoi.vector = qual & 0xFF; + SDT_PROBE3(vmm, vmx, exit, eoi, vmx, vcpu, vmexit); + vmexit->inst_length = 0; /* trap-like */ + break; + case EXIT_REASON_APIC_ACCESS: + SDT_PROBE3(vmm, vmx, exit, apicaccess, vmx, vcpu, vmexit); + handled = vmx_handle_apic_access(vmx, vcpu, vmexit); + break; + case EXIT_REASON_APIC_WRITE: + /* + * APIC-write VM exit is trap-like so the %rip is already + * pointing to the next instruction. 
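The EPT-fault case above keys off individual bits of the exit qualification. The bits it cares about are roughly these, per the Intel SDM's "Exit Qualification for EPT Violations" (illustrative names):

#define	EPTQUAL_READ		(1ULL << 0)	/* data read */
#define	EPTQUAL_WRITE		(1ULL << 1)	/* data write */
#define	EPTQUAL_FETCH		(1ULL << 2)	/* instruction fetch */
#define	EPTQUAL_GLA_VALID	(1ULL << 7)	/* guest-linear address valid */
#define	EPTQUAL_XLAT_VALID	(1ULL << 8)	/* GPA is the GLA's translation */
#define	EPTQUAL_NMI_UNBLOCKING	(1ULL << 12)	/* NMI unblocking due to IRET */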
+ */ + vmexit->inst_length = 0; + vlapic = vm_lapic(vmx->vm, vcpu); + SDT_PROBE4(vmm, vmx, exit, apicwrite, + vmx, vcpu, vmexit, vlapic); + handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual); + break; + case EXIT_REASON_XSETBV: + SDT_PROBE3(vmm, vmx, exit, xsetbv, vmx, vcpu, vmexit); + handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit); + break; + case EXIT_REASON_MONITOR: + SDT_PROBE3(vmm, vmx, exit, monitor, vmx, vcpu, vmexit); + vmexit->exitcode = VM_EXITCODE_MONITOR; + break; + case EXIT_REASON_MWAIT: + SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpu, vmexit); + vmexit->exitcode = VM_EXITCODE_MWAIT; + break; + case EXIT_REASON_VMCALL: + case EXIT_REASON_VMCLEAR: + case EXIT_REASON_VMLAUNCH: + case EXIT_REASON_VMPTRLD: + case EXIT_REASON_VMPTRST: + case EXIT_REASON_VMREAD: + case EXIT_REASON_VMRESUME: + case EXIT_REASON_VMWRITE: + case EXIT_REASON_VMXOFF: + case EXIT_REASON_VMXON: + SDT_PROBE3(vmm, vmx, exit, vminsn, vmx, vcpu, vmexit); + vmexit->exitcode = VM_EXITCODE_VMINSN; + break; + default: + SDT_PROBE4(vmm, vmx, exit, unknown, + vmx, vcpu, vmexit, reason); + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); + break; + } + + if (handled) { + /* + * It is possible that control is returned to userland + * even though we were able to handle the VM exit in the + * kernel. + * + * In such a case we want to make sure that the userland + * restarts guest execution at the instruction *after* + * the one we just processed. Therefore we update the + * guest rip in the VMCS and in 'vmexit'. + */ + vmexit->rip += vmexit->inst_length; + vmexit->inst_length = 0; + vmcs_write(VMCS_GUEST_RIP, vmexit->rip); + } else { + if (vmexit->exitcode == VM_EXITCODE_BOGUS) { + /* + * If this VM exit was not claimed by anybody then + * treat it as a generic VMX exit. + */ + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.status = VM_SUCCESS; + vmexit->u.vmx.inst_type = 0; + vmexit->u.vmx.inst_error = 0; + } else { + /* + * The exitcode and collateral have been populated. + * The VM exit will be processed further in userland. + */ + } + } + + SDT_PROBE4(vmm, vmx, exit, return, + vmx, vcpu, vmexit, handled); + return (handled); +} + +static void +vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit) +{ + + KASSERT(vmxctx->inst_fail_status != VM_SUCCESS, + ("vmx_exit_inst_error: invalid inst_fail_status %d", + vmxctx->inst_fail_status)); + + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.status = vmxctx->inst_fail_status; + vmexit->u.vmx.inst_error = vmcs_instruction_error(); + vmexit->u.vmx.exit_reason = ~0; + vmexit->u.vmx.exit_qualification = ~0; + + switch (rc) { + case VMX_VMRESUME_ERROR: + case VMX_VMLAUNCH_ERROR: + case VMX_INVEPT_ERROR: +#ifndef __FreeBSD__ + case VMX_VMWRITE_ERROR: +#endif + vmexit->u.vmx.inst_type = rc; + break; + default: + panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc); + } +} + +/* + * If the NMI-exiting VM execution control is set to '1' then an NMI in + * non-root operation causes a VM-exit. NMI blocking is in effect so it is + * sufficient to simply vector to the NMI handler via a software interrupt. + * However, this must be done before maskable interrupts are enabled + * otherwise the "iret" issued by an interrupt handler will incorrectly + * clear NMI blocking. 
+ */ +static __inline void +vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) +{ + uint32_t intr_info; + + KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); + + if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION) + return; + + intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); + KASSERT((intr_info & VMCS_INTR_VALID) != 0, + ("VM exit interruption info invalid: %#x", intr_info)); + + if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) { + KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due " + "to NMI has invalid vector: %#x", intr_info)); + VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler"); +#ifdef __FreeBSD__ + __asm __volatile("int $2"); +#else + vmm_call_trap(T_NMIFLT); +#endif + } +} + +static __inline void +vmx_dr_enter_guest(struct vmxctx *vmxctx) +{ + register_t rflags; + + /* Save host control debug registers. */ + vmxctx->host_dr7 = rdr7(); + vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); + + /* + * Disable debugging in DR7 and DEBUGCTL to avoid triggering + * exceptions in the host based on the guest DRx values. The + * guest DR7 and DEBUGCTL are saved/restored in the VMCS. + */ + load_dr7(0); + wrmsr(MSR_DEBUGCTLMSR, 0); + + /* + * Disable single stepping the kernel to avoid corrupting the + * guest DR6. A debugger might still be able to corrupt the + * guest DR6 by setting a breakpoint after this point and then + * single stepping. + */ + rflags = read_rflags(); + vmxctx->host_tf = rflags & PSL_T; + write_rflags(rflags & ~PSL_T); + + /* Save host debug registers. */ + vmxctx->host_dr0 = rdr0(); + vmxctx->host_dr1 = rdr1(); + vmxctx->host_dr2 = rdr2(); + vmxctx->host_dr3 = rdr3(); + vmxctx->host_dr6 = rdr6(); + + /* Restore guest debug registers. */ + load_dr0(vmxctx->guest_dr0); + load_dr1(vmxctx->guest_dr1); + load_dr2(vmxctx->guest_dr2); + load_dr3(vmxctx->guest_dr3); + load_dr6(vmxctx->guest_dr6); +} + +static __inline void +vmx_dr_leave_guest(struct vmxctx *vmxctx) +{ + + /* Save guest debug registers. */ + vmxctx->guest_dr0 = rdr0(); + vmxctx->guest_dr1 = rdr1(); + vmxctx->guest_dr2 = rdr2(); + vmxctx->guest_dr3 = rdr3(); + vmxctx->guest_dr6 = rdr6(); + + /* + * Restore host debug registers. Restore DR7, DEBUGCTL, and + * PSL_T last. + */ + load_dr0(vmxctx->host_dr0); + load_dr1(vmxctx->host_dr1); + load_dr2(vmxctx->host_dr2); + load_dr3(vmxctx->host_dr3); + load_dr6(vmxctx->host_dr6); + wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl); + load_dr7(vmxctx->host_dr7); + write_rflags(read_rflags() | vmxctx->host_tf); +} + +static int +vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap, + struct vm_eventinfo *evinfo) +{ + int rc, handled, launched; + struct vmx *vmx; + struct vm *vm; + struct vmxctx *vmxctx; + struct vmcs *vmcs; + struct vm_exit *vmexit; + struct vlapic *vlapic; + uint32_t exit_reason; +#ifdef __FreeBSD__ + struct region_descriptor gdtr, idtr; + uint16_t ldt_sel; +#endif + + vmx = arg; + vm = vmx->vm; + vmcs = &vmx->vmcs[vcpu]; + vmxctx = &vmx->ctx[vcpu]; + vlapic = vm_lapic(vm, vcpu); + vmexit = vm_exitinfo(vm, vcpu); + launched = 0; + + KASSERT(vmxctx->pmap == pmap, + ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); + + vmx_msr_guest_enter(vmx, vcpu); + + VMPTRLD(vmcs); + +#ifndef __FreeBSD__ + VERIFY(vmx->vmcs_state[vcpu] == VS_NONE && curthread->t_preempt != 0); + vmx->vmcs_state[vcpu] = VS_LOADED; +#endif + + /* + * XXX + * We do this every time because we may setup the virtual machine + * from a different process than the one that actually runs it. 
+ * + * If the life of a virtual machine was spent entirely in the context + * of a single process we could do this once in vmx_vminit(). + */ + vmcs_write(VMCS_HOST_CR3, rcr3()); + + vmcs_write(VMCS_GUEST_RIP, rip); + vmx_set_pcpu_defaults(vmx, vcpu, pmap); + do { + KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch " + "%#lx/%#lx", __func__, vmcs_guest_rip(), rip)); + + handled = UNHANDLED; + /* + * Interrupts are disabled from this point on until the + * guest starts executing. This is done for the following + * reasons: + * + * If an AST is asserted on this thread after the check below, + * then the IPI_AST notification will not be lost, because it + * will cause a VM exit due to external interrupt as soon as + * the guest state is loaded. + * + * A posted interrupt after 'vmx_inject_interrupts()' will + * not be "lost" because it will be held pending in the host + * APIC because interrupts are disabled. The pending interrupt + * will be recognized as soon as the guest state is loaded. + * + * The same reasoning applies to the IPI generated by + * pmap_invalidate_ept(). + */ +#ifdef __FreeBSD__ + disable_intr(); + vmx_inject_interrupts(vmx, vcpu, vlapic, rip); +#else + /* + * The bulk of guest interrupt injection is done without + * interrupts disabled on the host CPU. This is necessary + * since contended mutexes might force the thread to sleep. + */ + vmx_inject_interrupts(vmx, vcpu, vlapic, rip); + disable_intr(); + if (virtual_interrupt_delivery) { + vmx_inject_pir(vlapic); + } +#endif /* __FreeBSD__ */ + + /* + * Check for vcpu suspension after injecting events because + * vmx_inject_interrupts() can suspend the vcpu due to a + * triple fault. + */ + if (vcpu_suspended(evinfo)) { + enable_intr(); + vm_exit_suspended(vmx->vm, vcpu, rip); + break; + } + + if (vcpu_runblocked(evinfo)) { + enable_intr(); + vm_exit_runblock(vmx->vm, vcpu, rip); + break; + } + + if (vcpu_reqidle(evinfo)) { + enable_intr(); + vm_exit_reqidle(vmx->vm, vcpu, rip); + break; + } + + if (vcpu_should_yield(vm, vcpu)) { + enable_intr(); + vm_exit_astpending(vmx->vm, vcpu, rip); + vmx_astpending_trace(vmx, vcpu, rip); + handled = HANDLED; + break; + } + + if (vcpu_debugged(vm, vcpu)) { + enable_intr(); + vm_exit_debug(vmx->vm, vcpu, rip); + break; + } + +#ifndef __FreeBSD__ + if ((rc = smt_acquire()) != 1) { + enable_intr(); + vmexit->rip = rip; + vmexit->inst_length = 0; + if (rc == -1) { + vmexit->exitcode = VM_EXITCODE_HT; + } else { + vmexit->exitcode = VM_EXITCODE_BOGUS; + handled = HANDLED; + } + break; + } + + /* + * If this thread has gone off-cpu due to mutex operations + * during vmx_run, the VMCS will have been unloaded, forcing a + * re-VMLAUNCH as opposed to VMRESUME. + */ + launched = (vmx->vmcs_state[vcpu] & VS_LAUNCHED) != 0; + /* + * Restoration of the GDT limit is taken care of by + * vmx_savectx(). Since the maximum practical index for the + * IDT is 255, restoring its limits from the post-VMX-exit + * default of 0xffff is not a concern. + * + * Only 64-bit hypervisor callers are allowed, which forgoes + * the need to restore any LDT descriptor. Toss an error to + * anyone attempting to break that rule. + */ + if (curproc->p_model != DATAMODEL_LP64) { + smt_release(); + enable_intr(); + bzero(vmexit, sizeof (*vmexit)); + vmexit->rip = rip; + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.status = VM_FAIL_INVALID; + handled = UNHANDLED; + break; + } +#else + /* + * VM exits restore the base address but not the + * limits of GDTR and IDTR. 
The VMCS only stores the + * base address, so VM exits set the limits to 0xffff. + * Save and restore the full GDTR and IDTR to restore + * the limits. + * + * The VMCS does not save the LDTR at all, and VM + * exits clear LDTR as if a NULL selector were loaded. + * The userspace hypervisor probably doesn't use a + * LDT, but save and restore it to be safe. + */ + sgdt(&gdtr); + sidt(&idtr); + ldt_sel = sldt(); +#endif + + vmx_run_trace(vmx, vcpu); + vmx_dr_enter_guest(vmxctx); + rc = vmx_enter_guest(vmxctx, vmx, launched); + vmx_dr_leave_guest(vmxctx); + +#ifndef __FreeBSD__ + vmx->vmcs_state[vcpu] |= VS_LAUNCHED; + smt_release(); +#else + bare_lgdt(&gdtr); + lidt(&idtr); + lldt(ldt_sel); +#endif + + /* Collect some information for VM exit processing */ + vmexit->rip = rip = vmcs_guest_rip(); + vmexit->inst_length = vmexit_instruction_length(); + vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); + vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); + + /* Update 'nextrip' */ + vmx->state[vcpu].nextrip = rip; + + if (rc == VMX_GUEST_VMEXIT) { + vmx_exit_handle_nmi(vmx, vcpu, vmexit); + enable_intr(); + handled = vmx_exit_process(vmx, vcpu, vmexit); + } else { + enable_intr(); + vmx_exit_inst_error(vmxctx, rc, vmexit); + } +#ifdef __FreeBSD__ + launched = 1; +#endif + vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); + rip = vmexit->rip; + } while (handled); + + /* + * If a VM exit has been handled then the exitcode must be BOGUS + * If a VM exit is not handled then the exitcode must not be BOGUS + */ + if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || + (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { + panic("Mismatch between handled (%d) and exitcode (%d)", + handled, vmexit->exitcode); + } + + if (!handled) + vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1); + + VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d", + vmexit->exitcode); + + VMCLEAR(vmcs); + vmx_msr_guest_exit(vmx, vcpu); + +#ifndef __FreeBSD__ + VERIFY(vmx->vmcs_state != VS_NONE && curthread->t_preempt != 0); + vmx->vmcs_state[vcpu] = VS_NONE; +#endif + + return (0); +} + +static void +vmx_vmcleanup(void *arg) +{ + int i; + struct vmx *vmx = arg; + uint16_t maxcpus; + + if (apic_access_virtualization(vmx, 0)) + vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); + + maxcpus = vm_get_maxcpus(vmx->vm); + for (i = 0; i < maxcpus; i++) + vpid_free(vmx->state[i].vpid); + + free(vmx, M_VMX); + + return; +} + +static register_t * +vmxctx_regptr(struct vmxctx *vmxctx, int reg) +{ + + switch (reg) { + case VM_REG_GUEST_RAX: + return (&vmxctx->guest_rax); + case VM_REG_GUEST_RBX: + return (&vmxctx->guest_rbx); + case VM_REG_GUEST_RCX: + return (&vmxctx->guest_rcx); + case VM_REG_GUEST_RDX: + return (&vmxctx->guest_rdx); + case VM_REG_GUEST_RSI: + return (&vmxctx->guest_rsi); + case VM_REG_GUEST_RDI: + return (&vmxctx->guest_rdi); + case VM_REG_GUEST_RBP: + return (&vmxctx->guest_rbp); + case VM_REG_GUEST_R8: + return (&vmxctx->guest_r8); + case VM_REG_GUEST_R9: + return (&vmxctx->guest_r9); + case VM_REG_GUEST_R10: + return (&vmxctx->guest_r10); + case VM_REG_GUEST_R11: + return (&vmxctx->guest_r11); + case VM_REG_GUEST_R12: + return (&vmxctx->guest_r12); + case VM_REG_GUEST_R13: + return (&vmxctx->guest_r13); + case VM_REG_GUEST_R14: + return (&vmxctx->guest_r14); + case VM_REG_GUEST_R15: + return (&vmxctx->guest_r15); + case VM_REG_GUEST_CR2: + return (&vmxctx->guest_cr2); + case VM_REG_GUEST_DR0: + return (&vmxctx->guest_dr0); + case VM_REG_GUEST_DR1: + return (&vmxctx->guest_dr1); + 
case VM_REG_GUEST_DR2: + return (&vmxctx->guest_dr2); + case VM_REG_GUEST_DR3: + return (&vmxctx->guest_dr3); + case VM_REG_GUEST_DR6: + return (&vmxctx->guest_dr6); + default: + break; + } + return (NULL); +} + +static int +vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) +{ + register_t *regp; + + if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { + *retval = *regp; + return (0); + } else + return (EINVAL); +} + +static int +vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) +{ + register_t *regp; + + if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { + *regp = val; + return (0); + } else + return (EINVAL); +} + +static int +vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval) +{ + uint64_t gi; + int error; + + error = vmcs_getreg(&vmx->vmcs[vcpu], running, + VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi); + *retval = (gi & HWINTR_BLOCKING) ? 1 : 0; + return (error); +} + +static int +vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val) +{ + struct vmcs *vmcs; + uint64_t gi; + int error, ident; + + /* + * Forcing the vcpu into an interrupt shadow is not supported. + */ + if (val) { + error = EINVAL; + goto done; + } + + vmcs = &vmx->vmcs[vcpu]; + ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY); + error = vmcs_getreg(vmcs, running, ident, &gi); + if (error == 0) { + gi &= ~HWINTR_BLOCKING; + error = vmcs_setreg(vmcs, running, ident, gi); + } +done: + VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val, + error ? "failed" : "succeeded"); + return (error); +} + +static int +vmx_shadow_reg(int reg) +{ + int shreg; + + shreg = -1; + + switch (reg) { + case VM_REG_GUEST_CR0: + shreg = VMCS_CR0_SHADOW; + break; + case VM_REG_GUEST_CR4: + shreg = VMCS_CR4_SHADOW; + break; + default: + break; + } + + return (shreg); +} + +static int +vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) +{ + int running, hostcpu; + struct vmx *vmx = arg; + + running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); + + if (reg == VM_REG_GUEST_INTR_SHADOW) + return (vmx_get_intr_shadow(vmx, vcpu, running, retval)); + + if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) + return (0); + + return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval)); +} + +static int +vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) +{ + int error, hostcpu, running, shadow; + uint64_t ctls; + pmap_t pmap; + struct vmx *vmx = arg; + + running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); + + if (reg == VM_REG_GUEST_INTR_SHADOW) + return (vmx_modify_intr_shadow(vmx, vcpu, running, val)); + + if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) + return (0); + + error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val); + + if (error == 0) { + /* + * If the "load EFER" VM-entry control is 1 then the + * value of EFER.LMA must be identical to "IA-32e mode guest" + * bit in the VM-entry control. 
+ */ + if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && + (reg == VM_REG_GUEST_EFER)) { + vmcs_getreg(&vmx->vmcs[vcpu], running, + VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); + if (val & EFER_LMA) + ctls |= VM_ENTRY_GUEST_LMA; + else + ctls &= ~VM_ENTRY_GUEST_LMA; + vmcs_setreg(&vmx->vmcs[vcpu], running, + VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); + } + + shadow = vmx_shadow_reg(reg); + if (shadow > 0) { + /* + * Store the unmodified value in the shadow + */ + error = vmcs_setreg(&vmx->vmcs[vcpu], running, + VMCS_IDENT(shadow), val); + } + + if (reg == VM_REG_GUEST_CR3) { + /* + * Invalidate the guest vcpu's TLB mappings to emulate + * the behavior of updating %cr3. + * + * XXX the processor retains global mappings when %cr3 + * is updated but vmx_invvpid() does not. + */ + pmap = vmx->ctx[vcpu].pmap; + vmx_invvpid(vmx, vcpu, pmap, running); + } + } + + return (error); +} + +static int +vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + int hostcpu, running; + struct vmx *vmx = arg; + + running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu); + + return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc)); +} + +static int +vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + int hostcpu, running; + struct vmx *vmx = arg; + + running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu); + + return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc)); +} + +static int +vmx_getcap(void *arg, int vcpu, int type, int *retval) +{ + struct vmx *vmx = arg; + int vcap; + int ret; + + ret = ENOENT; + + vcap = vmx->cap[vcpu].set; + + switch (type) { + case VM_CAP_HALT_EXIT: + if (cap_halt_exit) + ret = 0; + break; + case VM_CAP_PAUSE_EXIT: + if (cap_pause_exit) + ret = 0; + break; + case VM_CAP_MTRAP_EXIT: + if (cap_monitor_trap) + ret = 0; + break; + case VM_CAP_UNRESTRICTED_GUEST: + if (cap_unrestricted_guest) + ret = 0; + break; + case VM_CAP_ENABLE_INVPCID: + if (cap_invpcid) + ret = 0; + break; + default: + break; + } + + if (ret == 0) + *retval = (vcap & (1 << type)) ? 
1 : 0; + + return (ret); +} + +static int +vmx_setcap(void *arg, int vcpu, int type, int val) +{ + struct vmx *vmx = arg; + struct vmcs *vmcs = &vmx->vmcs[vcpu]; + uint32_t baseval; + uint32_t *pptr; + int error; + int flag; + int reg; + int retval; + + retval = ENOENT; + pptr = NULL; + + switch (type) { + case VM_CAP_HALT_EXIT: + if (cap_halt_exit) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_HLT_EXITING; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_MTRAP_EXIT: + if (cap_monitor_trap) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_MTF; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_PAUSE_EXIT: + if (cap_pause_exit) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_PAUSE_EXITING; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_UNRESTRICTED_GUEST: + if (cap_unrestricted_guest) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls2; + baseval = *pptr; + flag = PROCBASED2_UNRESTRICTED_GUEST; + reg = VMCS_SEC_PROC_BASED_CTLS; + } + break; + case VM_CAP_ENABLE_INVPCID: + if (cap_invpcid) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls2; + baseval = *pptr; + flag = PROCBASED2_ENABLE_INVPCID; + reg = VMCS_SEC_PROC_BASED_CTLS; + } + break; + default: + break; + } + + if (retval == 0) { + if (val) { + baseval |= flag; + } else { + baseval &= ~flag; + } + VMPTRLD(vmcs); + error = vmwrite(reg, baseval); + VMCLEAR(vmcs); + + if (error) { + retval = error; + } else { + /* + * Update optional stored flags, and record + * setting + */ + if (pptr != NULL) { + *pptr = baseval; + } + + if (val) { + vmx->cap[vcpu].set |= (1 << type); + } else { + vmx->cap[vcpu].set &= ~(1 << type); + } + } + } + + return (retval); +} + +struct vlapic_vtx { + struct vlapic vlapic; + struct pir_desc *pir_desc; + struct vmx *vmx; + u_int pending_prio; +}; + +#define VPR_PRIO_BIT(vpr) (1 << ((vpr) >> 4)) + +#define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ +do { \ + VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ + level ? "level" : "edge", vector); \ + VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \ + VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \ + VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \ + VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \ + VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\ +} while (0) + +/* + * vlapic->ops handlers that utilize the APICv hardware assist described in + * Chapter 29 of the Intel SDM. + */ +static int +vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) +{ + struct vlapic_vtx *vlapic_vtx; + struct pir_desc *pir_desc; + uint64_t mask; + int idx, notify = 0; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + pir_desc = vlapic_vtx->pir_desc; + + /* + * Keep track of interrupt requests in the PIR descriptor. This is + * because the virtual APIC page pointed to by the VMCS cannot be + * modified if the vcpu is running. + */ + idx = vector / 64; + mask = 1UL << (vector % 64); + atomic_set_long(&pir_desc->pir[idx], mask); + + /* + * A notification is required whenever the 'pending' bit makes a + * transition from 0->1. + * + * Even if the 'pending' bit is already asserted, notification about + * the incoming interrupt may still be necessary. 
For example, if a + * vCPU is HLTed with a high PPR, a low priority interrupt would cause + * the 0->1 'pending' transition with a notification, but the vCPU + * would ignore the interrupt for the time being. The same vCPU would + * need to then be notified if a high-priority interrupt arrived which + * satisfied the PPR. + * + * The priorities of interrupts injected while 'pending' is asserted + * are tracked in a custom bitfield 'pending_prio'. Should the + * to-be-injected interrupt exceed the priorities already present, the + * notification is sent. The priorities recorded in 'pending_prio' are + * cleared whenever the 'pending' bit makes another 0->1 transition. + */ + if (atomic_cmpset_long(&pir_desc->pending, 0, 1) != 0) { + notify = 1; + vlapic_vtx->pending_prio = 0; + } else { + const u_int old_prio = vlapic_vtx->pending_prio; + const u_int prio_bit = VPR_PRIO_BIT(vector & APIC_TPR_INT); + + if ((old_prio & prio_bit) == 0 && prio_bit > old_prio) { + atomic_set_int(&vlapic_vtx->pending_prio, prio_bit); + notify = 1; + } + } + + VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, + level, "vmx_set_intr_ready"); + return (notify); +} + +static int +vmx_pending_intr(struct vlapic *vlapic, int *vecptr) +{ + struct vlapic_vtx *vlapic_vtx; + struct pir_desc *pir_desc; + struct LAPIC *lapic; + uint64_t pending, pirval; + uint32_t ppr, vpr; + int i; + + /* + * This function is only expected to be called from the 'HLT' exit + * handler which does not care about the vector that is pending. + */ + KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL")); + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + pir_desc = vlapic_vtx->pir_desc; + + pending = atomic_load_acq_long(&pir_desc->pending); + if (!pending) { + /* + * While a virtual interrupt may have already been + * processed the actual delivery maybe pending the + * interruptibility of the guest. Recognize a pending + * interrupt by reevaluating virtual interrupts + * following Section 29.2.1 in the Intel SDM Volume 3. + */ + struct vm_exit *vmexit; + uint8_t rvi, ppr; + + vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); + rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT; + lapic = vlapic->apic_page; + ppr = lapic->ppr & APIC_TPR_INT; + if (rvi > ppr) { + return (1); + } + + return (0); + } + + /* + * If there is an interrupt pending then it will be recognized only + * if its priority is greater than the processor priority. + * + * Special case: if the processor priority is zero then any pending + * interrupt will be recognized. + */ + lapic = vlapic->apic_page; + ppr = lapic->ppr & APIC_TPR_INT; + if (ppr == 0) + return (1); + + VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", + lapic->ppr); + + vpr = 0; + for (i = 3; i >= 0; i--) { + pirval = pir_desc->pir[i]; + if (pirval != 0) { + vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT; + break; + } + } + + /* + * If the highest-priority pending interrupt falls short of the + * processor priority of this vCPU, ensure that 'pending_prio' does not + * have any stale bits which would preclude a higher-priority interrupt + * from incurring a notification later. 
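To make the bookkeeping above concrete: a vector indexes the 256-bit PIR as a (word, bit) pair, and its priority class is the vector's upper nibble. A small sketch, assuming the usual four-by-64-bit PIR layout used here:

#include <stdint.h>

static void
pir_locate(int vector, int *idx, uint64_t *mask)
{
	*idx = vector / 64;		/* which 64-bit word of pir[] */
	*mask = 1ULL << (vector % 64);	/* bit within that word */
}

static unsigned int
prio_class_bit(int vector)
{
	/* priority class = vector[7:4]; one bit per class, as in VPR_PRIO_BIT */
	return (1u << ((vector & 0xf0) >> 4));
}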
+ */ + if (vpr <= ppr) { + const u_int prio_bit = VPR_PRIO_BIT(vpr); + const u_int old = vlapic_vtx->pending_prio; + + if (old > prio_bit && (old & prio_bit) == 0) { + vlapic_vtx->pending_prio = prio_bit; + } + return (0); + } + return (1); +} + +static void +vmx_intr_accepted(struct vlapic *vlapic, int vector) +{ + + panic("vmx_intr_accepted: not expected to be called"); +} + +static void +vmx_set_tmr(struct vlapic *vlapic, const uint32_t *masks) +{ + vmcs_write(VMCS_EOI_EXIT0, ((uint64_t)masks[1] << 32) | masks[0]); + vmcs_write(VMCS_EOI_EXIT1, ((uint64_t)masks[3] << 32) | masks[2]); + vmcs_write(VMCS_EOI_EXIT2, ((uint64_t)masks[5] << 32) | masks[4]); + vmcs_write(VMCS_EOI_EXIT3, ((uint64_t)masks[7] << 32) | masks[6]); +} + +static void +vmx_enable_x2apic_mode(struct vlapic *vlapic) +{ + struct vmx *vmx; + struct vmcs *vmcs; + uint32_t proc_ctls2; + int vcpuid, error; + + vcpuid = vlapic->vcpuid; + vmx = ((struct vlapic_vtx *)vlapic)->vmx; + vmcs = &vmx->vmcs[vcpuid]; + + proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; + KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0, + ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2)); + + proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES; + proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE; + vmx->cap[vcpuid].proc_ctls2 = proc_ctls2; + + VMPTRLD(vmcs); + vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2); + VMCLEAR(vmcs); + + if (vlapic->vcpuid == 0) { + /* + * The nested page table mappings are shared by all vcpus + * so unmap the APIC access page just once. + */ + error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); + KASSERT(error == 0, ("%s: vm_unmap_mmio error %d", + __func__, error)); + + /* + * The MSR bitmap is shared by all vcpus so modify it only + * once in the context of vcpu 0. + */ + error = vmx_allow_x2apic_msrs(vmx); + KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d", + __func__, error)); + } +} + +static void +vmx_post_intr(struct vlapic *vlapic, int hostcpu) +{ +#ifdef __FreeBSD__ + ipi_cpu(hostcpu, pirvec); +#else + psm_send_pir_ipi(hostcpu); +#endif +} + +/* + * Transfer the pending interrupts in the PIR descriptor to the IRR + * in the virtual APIC page. + */ +static void +vmx_inject_pir(struct vlapic *vlapic) +{ + struct vlapic_vtx *vlapic_vtx; + struct pir_desc *pir_desc; + struct LAPIC *lapic; + uint64_t val, pirval; + int rvi, pirbase = -1; + uint16_t intr_status_old, intr_status_new; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + pir_desc = vlapic_vtx->pir_desc; + if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) { + VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " + "no posted interrupt pending"); + return; + } + + pirval = 0; + pirbase = -1; + lapic = vlapic->apic_page; + + val = atomic_readandclear_long(&pir_desc->pir[0]); + if (val != 0) { + lapic->irr0 |= val; + lapic->irr1 |= val >> 32; + pirbase = 0; + pirval = val; + } + + val = atomic_readandclear_long(&pir_desc->pir[1]); + if (val != 0) { + lapic->irr2 |= val; + lapic->irr3 |= val >> 32; + pirbase = 64; + pirval = val; + } + + val = atomic_readandclear_long(&pir_desc->pir[2]); + if (val != 0) { + lapic->irr4 |= val; + lapic->irr5 |= val >> 32; + pirbase = 128; + pirval = val; + } + + val = atomic_readandclear_long(&pir_desc->pir[3]); + if (val != 0) { + lapic->irr6 |= val; + lapic->irr7 |= val >> 32; + pirbase = 192; + pirval = val; + } + + VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir"); + + /* + * Update RVI so the processor can evaluate pending virtual + * interrupts on VM-entry. 
+ * + * It is possible for pirval to be 0 here, even though the + * pending bit has been set. The scenario is: + * CPU-Y is sending a posted interrupt to CPU-X, which + * is running a guest and processing posted interrupts in h/w. + * CPU-X will eventually exit and the state seen in s/w is + * the pending bit set, but no PIR bits set. + * + * CPU-X CPU-Y + * (vm running) (host running) + * rx posted interrupt + * CLEAR pending bit + * SET PIR bit + * READ/CLEAR PIR bits + * SET pending bit + * (vm exit) + * pending bit set, PIR 0 + */ + if (pirval != 0) { + rvi = pirbase + flsl(pirval) - 1; + intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS); + intr_status_new = (intr_status_old & 0xFF00) | rvi; + if (intr_status_new > intr_status_old) { + vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new); + VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " + "guest_intr_status changed from 0x%04x to 0x%04x", + intr_status_old, intr_status_new); + } + } +} + +static struct vlapic * +vmx_vlapic_init(void *arg, int vcpuid) +{ + struct vmx *vmx; + struct vlapic *vlapic; + struct vlapic_vtx *vlapic_vtx; + + vmx = arg; + + vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO); + vlapic->vm = vmx->vm; + vlapic->vcpuid = vcpuid; + vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid]; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid]; + vlapic_vtx->vmx = vmx; + + if (virtual_interrupt_delivery) { + vlapic->ops.set_intr_ready = vmx_set_intr_ready; + vlapic->ops.pending_intr = vmx_pending_intr; + vlapic->ops.intr_accepted = vmx_intr_accepted; + vlapic->ops.set_tmr = vmx_set_tmr; + vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode; + } + + if (posted_interrupts) + vlapic->ops.post_intr = vmx_post_intr; + + vlapic_init(vlapic); + + return (vlapic); +} + +static void +vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic) +{ + + vlapic_cleanup(vlapic); + free(vlapic, M_VLAPIC); +} + +#ifndef __FreeBSD__ +static void +vmx_savectx(void *arg, int vcpu) +{ + struct vmx *vmx = arg; + struct vmcs *vmcs = &vmx->vmcs[vcpu]; + + if ((vmx->vmcs_state[vcpu] & VS_LOADED) != 0) { + VERIFY3U(vmclear(vmcs), ==, 0); + vmx_msr_guest_exit(vmx, vcpu); + /* + * Having VMCLEARed the VMCS, it can no longer be re-entered + * with VMRESUME, but must be VMLAUNCHed again. + */ + vmx->vmcs_state[vcpu] &= ~VS_LAUNCHED; + } + + reset_gdtr_limit(); +} + +static void +vmx_restorectx(void *arg, int vcpu) +{ + struct vmx *vmx = arg; + struct vmcs *vmcs = &vmx->vmcs[vcpu]; + + ASSERT0(vmx->vmcs_state[vcpu] & VS_LAUNCHED); + + if ((vmx->vmcs_state[vcpu] & VS_LOADED) != 0) { + vmx_msr_guest_enter(vmx, vcpu); + VERIFY3U(vmptrld(vmcs), ==, 0); + } +} +#endif /* __FreeBSD__ */ + +struct vmm_ops vmm_ops_intel = { + vmx_init, + vmx_cleanup, + vmx_restore, + vmx_vminit, + vmx_run, + vmx_vmcleanup, + vmx_getreg, + vmx_setreg, + vmx_getdesc, + vmx_setdesc, + vmx_getcap, + vmx_setcap, + ept_vmspace_alloc, + ept_vmspace_free, + vmx_vlapic_init, + vmx_vlapic_cleanup, + +#ifndef __FreeBSD__ + vmx_savectx, + vmx_restorectx, +#endif +}; + +#ifndef __FreeBSD__ +/* Side-effect free HW validation derived from checks in vmx_init. 
*/ +int +vmx_x86_supported(const char **msg) +{ + int error; + uint32_t tmp; + + ASSERT(msg != NULL); + + /* Check support for primary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_CTLS_ONE_SETTING, + PROCBASED_CTLS_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired primary " + "processor-based controls"; + return (error); + } + + /* Check support for secondary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, PROCBASED_CTLS2_ONE_SETTING, + PROCBASED_CTLS2_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired secondary " + "processor-based controls"; + return (error); + } + + /* Check support for pin-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_CTLS_ONE_SETTING, + PINBASED_CTLS_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired pin-based controls"; + return (error); + } + + /* Check support for VM-exit controls */ + error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, + VM_EXIT_CTLS_ONE_SETTING, VM_EXIT_CTLS_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired exit controls"; + return (error); + } + + /* Check support for VM-entry controls */ + error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, + VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired entry controls"; + return (error); + } + + /* Unrestricted guest is nominally optional, but not for us. */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_UNRESTRICTED_GUEST, 0, &tmp); + if (error) { + *msg = "processor does not support desired unrestricted guest " + "controls"; + return (error); + } + + return (0); +} +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.h b/usr/src/uts/i86pc/io/vmm/intel/vmx.h new file mode 100644 index 0000000000..2d16799bdd --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.h @@ -0,0 +1,176 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _VMX_H_ +#define _VMX_H_ + +#include "vmcs.h" + +struct pmap; + +struct vmxctx { + register_t guest_rdi; /* Guest state */ + register_t guest_rsi; + register_t guest_rdx; + register_t guest_rcx; + register_t guest_r8; + register_t guest_r9; + register_t guest_rax; + register_t guest_rbx; + register_t guest_rbp; + register_t guest_r10; + register_t guest_r11; + register_t guest_r12; + register_t guest_r13; + register_t guest_r14; + register_t guest_r15; + register_t guest_cr2; + register_t guest_dr0; + register_t guest_dr1; + register_t guest_dr2; + register_t guest_dr3; + register_t guest_dr6; + +#ifdef __FreeBSD__ + register_t host_r15; /* Host state */ + register_t host_r14; + register_t host_r13; + register_t host_r12; + register_t host_rbp; + register_t host_rsp; + register_t host_rbx; +#endif /* __FreeBSD__ */ + + register_t host_dr0; + register_t host_dr1; + register_t host_dr2; + register_t host_dr3; + register_t host_dr6; + register_t host_dr7; + uint64_t host_debugctl; + int host_tf; + + int inst_fail_status; + + /* + * The pmap needs to be deactivated in vmx_enter_guest() + * so keep a copy of the 'pmap' in each vmxctx. + */ + struct pmap *pmap; +}; + +struct vmxcap { + int set; + uint32_t proc_ctls; + uint32_t proc_ctls2; +}; + +struct vmxstate { + uint64_t nextrip; /* next instruction to be executed by guest */ + int lastcpu; /* host cpu that this 'vcpu' last ran on */ + uint16_t vpid; +}; + +struct apic_page { + uint32_t reg[PAGE_SIZE / 4]; +}; +CTASSERT(sizeof(struct apic_page) == PAGE_SIZE); + +/* Posted Interrupt Descriptor (described in section 29.6 of the Intel SDM) */ +struct pir_desc { + uint64_t pir[4]; + uint64_t pending; + uint64_t unused[3]; +} __aligned(64); +CTASSERT(sizeof(struct pir_desc) == 64); + +/* Index into the 'guest_msrs[]' array */ +enum { + IDX_MSR_LSTAR, + IDX_MSR_CSTAR, + IDX_MSR_STAR, + IDX_MSR_SF_MASK, + IDX_MSR_KGSBASE, + IDX_MSR_PAT, + GUEST_MSR_NUM /* must be the last enumeration */ +}; + +#ifndef __FreeBSD__ +typedef enum { + VS_NONE = 0x0, + VS_LAUNCHED = 0x1, + VS_LOADED = 0x2 +} vmcs_state_t; +#endif /* __FreeBSD__ */ + +/* virtual machine softc */ +struct vmx { + struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */ + struct apic_page apic_page[VM_MAXCPU]; /* one apic page per vcpu */ + char msr_bitmap[PAGE_SIZE]; + struct pir_desc pir_desc[VM_MAXCPU]; + uint64_t guest_msrs[VM_MAXCPU][GUEST_MSR_NUM]; +#ifndef __FreeBSD__ + uint64_t host_msrs[VM_MAXCPU][GUEST_MSR_NUM]; + uint64_t tsc_offset_active[VM_MAXCPU]; + vmcs_state_t vmcs_state[VM_MAXCPU]; +#endif + struct vmxctx ctx[VM_MAXCPU]; + struct vmxcap cap[VM_MAXCPU]; + struct vmxstate state[VM_MAXCPU]; + uint64_t eptp; + struct vm *vm; + long eptgen[MAXCPU]; /* cached pmap->pm_eptgen */ +}; +CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, pir_desc[0]) & 63) == 0); + +#define 
VMX_GUEST_VMEXIT 0 +#define VMX_VMRESUME_ERROR 1 +#define VMX_VMLAUNCH_ERROR 2 +#define VMX_INVEPT_ERROR 3 +#define VMX_VMWRITE_ERROR 4 +int vmx_enter_guest(struct vmxctx *ctx, struct vmx *vmx, int launched); +void vmx_call_isr(uintptr_t entry); + +u_long vmx_fix_cr0(u_long cr0); +u_long vmx_fix_cr4(u_long cr4); + +int vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset); + +extern char vmx_exit_guest[]; +extern char vmx_exit_guest_flush_rsb[]; + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h b/usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h new file mode 100644 index 0000000000..5408d129ad --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h @@ -0,0 +1,98 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMX_CONTROLS_H_ +#define _VMX_CONTROLS_H_ + +/* Pin-Based VM-Execution Controls */ +#define PINBASED_EXTINT_EXITING (1 << 0) +#define PINBASED_NMI_EXITING (1 << 3) +#define PINBASED_VIRTUAL_NMI (1 << 5) +#define PINBASED_PREMPTION_TIMER (1 << 6) +#define PINBASED_POSTED_INTERRUPT (1 << 7) + +/* Primary Processor-Based VM-Execution Controls */ +#define PROCBASED_INT_WINDOW_EXITING (1 << 2) +#define PROCBASED_TSC_OFFSET (1 << 3) +#define PROCBASED_HLT_EXITING (1 << 7) +#define PROCBASED_INVLPG_EXITING (1 << 9) +#define PROCBASED_MWAIT_EXITING (1 << 10) +#define PROCBASED_RDPMC_EXITING (1 << 11) +#define PROCBASED_RDTSC_EXITING (1 << 12) +#define PROCBASED_CR3_LOAD_EXITING (1 << 15) +#define PROCBASED_CR3_STORE_EXITING (1 << 16) +#define PROCBASED_CR8_LOAD_EXITING (1 << 19) +#define PROCBASED_CR8_STORE_EXITING (1 << 20) +#define PROCBASED_USE_TPR_SHADOW (1 << 21) +#define PROCBASED_NMI_WINDOW_EXITING (1 << 22) +#define PROCBASED_MOV_DR_EXITING (1 << 23) +#define PROCBASED_IO_EXITING (1 << 24) +#define PROCBASED_IO_BITMAPS (1 << 25) +#define PROCBASED_MTF (1 << 27) +#define PROCBASED_MSR_BITMAPS (1 << 28) +#define PROCBASED_MONITOR_EXITING (1 << 29) +#define PROCBASED_PAUSE_EXITING (1 << 30) +#define PROCBASED_SECONDARY_CONTROLS (1U << 31) + +/* Secondary Processor-Based VM-Execution Controls */ +#define PROCBASED2_VIRTUALIZE_APIC_ACCESSES (1 << 0) +#define PROCBASED2_ENABLE_EPT (1 << 1) +#define PROCBASED2_DESC_TABLE_EXITING (1 << 2) +#define PROCBASED2_ENABLE_RDTSCP (1 << 3) +#define PROCBASED2_VIRTUALIZE_X2APIC_MODE (1 << 4) +#define PROCBASED2_ENABLE_VPID (1 << 5) +#define PROCBASED2_WBINVD_EXITING (1 << 6) +#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7) +#define PROCBASED2_APIC_REGISTER_VIRTUALIZATION (1 << 8) +#define PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY (1 << 9) +#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10) +#define PROCBASED2_ENABLE_INVPCID (1 << 12) + +/* VM Exit Controls */ +#define VM_EXIT_SAVE_DEBUG_CONTROLS (1 << 2) +#define VM_EXIT_HOST_LMA (1 << 9) +#define VM_EXIT_LOAD_PERF_GLOBAL_CTRL (1 << 12) +#define VM_EXIT_ACKNOWLEDGE_INTERRUPT (1 << 15) +#define VM_EXIT_SAVE_PAT (1 << 18) +#define VM_EXIT_LOAD_PAT (1 << 19) +#define VM_EXIT_SAVE_EFER (1 << 20) +#define VM_EXIT_LOAD_EFER (1 << 21) +#define VM_EXIT_SAVE_PREEMPTION_TIMER (1 << 22) + +/* VM Entry Controls */ +#define VM_ENTRY_LOAD_DEBUG_CONTROLS (1 << 2) +#define VM_ENTRY_GUEST_LMA (1 << 9) +#define VM_ENTRY_INTO_SMM (1 << 10) +#define VM_ENTRY_DEACTIVATE_DUAL_MONITOR (1 << 11) +#define VM_ENTRY_LOAD_PERF_GLOBAL_CTRL (1 << 13) +#define VM_ENTRY_LOAD_PAT (1 << 14) +#define VM_ENTRY_LOAD_EFER (1 << 15) + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h b/usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h new file mode 100644 index 0000000000..f0c5ba7691 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h @@ -0,0 +1,244 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _VMX_CPUFUNC_H_ +#define _VMX_CPUFUNC_H_ + +struct vmcs; + +/* + * Section 5.2 "Conventions" from Intel Architecture Manual 2B. + * + * error + * VMsucceed 0 + * VMFailInvalid 1 + * VMFailValid 2 see also VMCS VM-Instruction Error Field + */ +#define VM_SUCCESS 0 +#define VM_FAIL_INVALID 1 +#define VM_FAIL_VALID 2 +#define VMX_SET_ERROR_CODE \ + " jnc 1f;" \ + " mov $1, %[error];" /* CF: error = 1 */ \ + " jmp 3f;" \ + "1: jnz 2f;" \ + " mov $2, %[error];" /* ZF: error = 2 */ \ + " jmp 3f;" \ + "2: mov $0, %[error];" \ + "3:" + +/* returns 0 on success and non-zero on failure */ +static __inline int +vmxon(char *region) +{ + int error; + uint64_t addr; + +#ifdef __FreeBSD__ + addr = vtophys(region); +#else + /* This is pre-translated in illumos */ + addr = (uint64_t)region; +#endif + __asm __volatile("vmxon %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + + return (error); +} + +#ifdef __FreeBSD__ +/* returns 0 on success and non-zero on failure */ +static __inline int +vmclear(struct vmcs *vmcs) +{ + int error; + uint64_t addr; + + addr = vtophys(vmcs); + __asm __volatile("vmclear %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + return (error); +} +#endif /* __FreeBSD__ */ + +static __inline void +vmxoff(void) +{ + + __asm __volatile("vmxoff"); +} + +static __inline void +vmptrst(uint64_t *addr) +{ + + __asm __volatile("vmptrst %[addr]" :: [addr]"m" (*addr) : "memory"); +} + +#ifdef __FreeBSD__ +static __inline int +vmptrld(struct vmcs *vmcs) +{ + int error; + uint64_t addr; + + addr = vtophys(vmcs); + __asm __volatile("vmptrld %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + return (error); +} +#endif /* __FreeBSD__ */ + +static __inline int +vmwrite(uint64_t reg, uint64_t val) +{ + int error; + + __asm __volatile("vmwrite %[val], %[reg];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [val] "r" (val), [reg] "r" (reg) + : "memory"); + + return (error); +} + +static __inline int +vmread(uint64_t r, uint64_t *addr) +{ + int error; + + __asm __volatile("vmread %[r], %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [r] "r" 
(r), [addr] "m" (*addr) + : "memory"); + + return (error); +} + +#ifdef __FreeBSD__ +static __inline void +VMCLEAR(struct vmcs *vmcs) +{ + int err; + + err = vmclear(vmcs); + if (err != 0) + panic("%s: vmclear(%p) error %d", __func__, vmcs, err); + + critical_exit(); +} + +static __inline void +VMPTRLD(struct vmcs *vmcs) +{ + int err; + + critical_enter(); + + err = vmptrld(vmcs); + if (err != 0) + panic("%s: vmptrld(%p) error %d", __func__, vmcs, err); +} +#endif /* __FreeBSD__ */ + +#define INVVPID_TYPE_ADDRESS 0UL +#define INVVPID_TYPE_SINGLE_CONTEXT 1UL +#define INVVPID_TYPE_ALL_CONTEXTS 2UL + +struct invvpid_desc { + uint16_t vpid; + uint16_t _res1; + uint32_t _res2; + uint64_t linear_addr; +}; +CTASSERT(sizeof(struct invvpid_desc) == 16); + +static __inline void +invvpid(uint64_t type, struct invvpid_desc desc) +{ + int error; + + __asm __volatile("invvpid %[desc], %[type];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [desc] "m" (desc), [type] "r" (type) + : "memory"); + + if (error) + panic("invvpid error %d", error); +} + +#define INVEPT_TYPE_SINGLE_CONTEXT 1UL +#define INVEPT_TYPE_ALL_CONTEXTS 2UL +struct invept_desc { + uint64_t eptp; + uint64_t _res; +}; +CTASSERT(sizeof(struct invept_desc) == 16); + +static __inline void +invept(uint64_t type, struct invept_desc desc) +{ + int error; + + __asm __volatile("invept %[desc], %[type];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [desc] "m" (desc), [type] "r" (type) + : "memory"); + + if (error) + panic("invept error %d", error); +} +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c new file mode 100644 index 0000000000..4a1a2cd358 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c @@ -0,0 +1,516 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * Copyright 2017 Joyent, Inc. 
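
The vmx_msr.c code that follows manages the per-VM MSR permission bitmap. The indexing used by msr_bitmap_change_access() below (read bits in the first 2KB of the page, write bits in the second 2KB, with the 0xC0000000 range offset by 1KB) can be checked with this small standalone model:

#include <stdio.h>

static int
msr_to_read_byte(unsigned int msr)
{
        if (msr <= 0x00001FFF)
                return (msr / 8);
        if (msr >= 0xC0000000 && msr <= 0xC0001FFF)
                return (1024 + (msr - 0xC0000000) / 8);
        return (-1);
}

int
main(void)
{
        unsigned int msrs[2] = { 0x10 /* TSC */, 0xC0000082 /* LSTAR */ };

        for (int i = 0; i < 2; i++) {
                int byte = msr_to_read_byte(msrs[i]);
                int bit = msrs[i] & 0x7;

                printf("MSR %#x: read byte %d bit %d, write byte %d\n",
                    msrs[i], byte, bit, byte + 2048);
        }
        return (0);
}
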
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> + +#include <machine/clock.h> +#include <machine/cpufunc.h> +#include <machine/md_var.h> +#include <machine/pcb.h> +#include <machine/specialreg.h> +#include <machine/vmm.h> + +#include "vmx.h" +#include "vmx_msr.h" + +static boolean_t +vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos) +{ + + if (msr_val & (1UL << (bitpos + 32))) + return (TRUE); + else + return (FALSE); +} + +static boolean_t +vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos) +{ + + if ((msr_val & (1UL << bitpos)) == 0) + return (TRUE); + else + return (FALSE); +} + +uint32_t +vmx_revision(void) +{ + + return (rdmsr(MSR_VMX_BASIC) & 0xffffffff); +} + +/* + * Generate a bitmask to be used for the VMCS execution control fields. + * + * The caller specifies what bits should be set to one in 'ones_mask' + * and what bits should be set to zero in 'zeros_mask'. The don't-care + * bits are set to the default value. The default values are obtained + * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining + * VMX Capabilities". + * + * Returns zero on success and non-zero on error. + */ +int +vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, + uint32_t zeros_mask, uint32_t *retval) +{ + int i; + uint64_t val, trueval; + boolean_t true_ctls_avail, one_allowed, zero_allowed; + + /* We cannot ask the same bit to be set to both '1' and '0' */ + if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask)) + return (EINVAL); + + if (rdmsr(MSR_VMX_BASIC) & (1UL << 55)) + true_ctls_avail = TRUE; + else + true_ctls_avail = FALSE; + + val = rdmsr(ctl_reg); + if (true_ctls_avail) + trueval = rdmsr(true_ctl_reg); /* step c */ + else + trueval = val; /* step a */ + + for (i = 0; i < 32; i++) { + one_allowed = vmx_ctl_allows_one_setting(trueval, i); + zero_allowed = vmx_ctl_allows_zero_setting(trueval, i); + + KASSERT(one_allowed || zero_allowed, + ("invalid zero/one setting for bit %d of ctl 0x%0x, " + "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg)); + + if (zero_allowed && !one_allowed) { /* b(i),c(i) */ + if (ones_mask & (1 << i)) + return (EINVAL); + *retval &= ~(1 << i); + } else if (one_allowed && !zero_allowed) { /* b(i),c(i) */ + if (zeros_mask & (1 << i)) + return (EINVAL); + *retval |= 1 << i; + } else { + if (zeros_mask & (1 << i)) /* b(ii),c(ii) */ + *retval &= ~(1 << i); + else if (ones_mask & (1 << i)) /* b(ii), c(ii) */ + *retval |= 1 << i; + else if (!true_ctls_avail) + *retval &= ~(1 << i); /* b(iii) */ + else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/ + *retval &= ~(1 << i); + else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */ + *retval |= 1 << i; + else { + panic("vmx_set_ctlreg: unable to determine " + "correct value of ctl bit %d for msr " + "0x%0x and true msr 0x%0x", i, ctl_reg, + true_ctl_reg); + } + } + } + + return (0); +} + +void +msr_bitmap_initialize(char *bitmap) +{ + + memset(bitmap, 0xff, PAGE_SIZE); +} + +int +msr_bitmap_change_access(char *bitmap, u_int msr, int access) +{ + int byte, bit; + + if (msr <= 0x00001FFF) + byte = msr / 8; + else if (msr >= 0xC0000000 && msr <= 0xC0001FFF) + byte = 1024 + (msr - 0xC0000000) / 8; + else + return (EINVAL); + + bit = msr & 0x7; + + if (access & MSR_BITMAP_ACCESS_READ) + bitmap[byte] &= ~(1 << bit); + else + bitmap[byte] |= 1 << bit; + + byte += 2048; + if (access & MSR_BITMAP_ACCESS_WRITE) + bitmap[byte] &= ~(1 << bit); + else + bitmap[byte] |= 1 << bit; + + return (0); +} + +static 
uint64_t misc_enable; +static uint64_t platform_info; +static uint64_t turbo_ratio_limit; +#ifdef __FreeBSD__ +static uint64_t host_msrs[GUEST_MSR_NUM]; +#endif /* __FreeBSD__ */ + +static bool +nehalem_cpu(void) +{ + u_int family, model; + + /* + * The family:model numbers belonging to the Nehalem microarchitecture + * are documented in Section 35.5, Intel SDM dated Feb 2014. + */ + family = CPUID_TO_FAMILY(cpu_id); + model = CPUID_TO_MODEL(cpu_id); + if (family == 0x6) { + switch (model) { + case 0x1A: + case 0x1E: + case 0x1F: + case 0x2E: + return (true); + default: + break; + } + } + return (false); +} + +static bool +westmere_cpu(void) +{ + u_int family, model; + + /* + * The family:model numbers belonging to the Westmere microarchitecture + * are documented in Section 35.6, Intel SDM dated Feb 2014. + */ + family = CPUID_TO_FAMILY(cpu_id); + model = CPUID_TO_MODEL(cpu_id); + if (family == 0x6) { + switch (model) { + case 0x25: + case 0x2C: + return (true); + default: + break; + } + } + return (false); +} + +static bool +pat_valid(uint64_t val) +{ + int i, pa; + + /* + * From Intel SDM: Table "Memory Types That Can Be Encoded With PAT" + * + * Extract PA0 through PA7 and validate that each one encodes a + * valid memory type. + */ + for (i = 0; i < 8; i++) { + pa = (val >> (i * 8)) & 0xff; + if (pa == 2 || pa == 3 || pa >= 8) + return (false); + } + return (true); +} + +void +vmx_msr_init(void) +{ + uint64_t bus_freq, ratio; + int i; + +#ifdef __FreeBSD__ + /* XXXJOY: Do we want to do this caching? */ + /* + * It is safe to cache the values of the following MSRs because + * they don't change based on curcpu, curproc or curthread. + */ + host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); +#endif /* __FreeBSD__ */ + + /* + * Initialize emulated MSRs + */ + misc_enable = rdmsr(MSR_IA32_MISC_ENABLE); + /* + * Set mandatory bits + * 11: branch trace disabled + * 12: PEBS unavailable + * Clear unsupported features + * 16: SpeedStep enable + * 18: enable MONITOR FSM + */ + misc_enable |= (1 << 12) | (1 << 11); + misc_enable &= ~((1 << 18) | (1 << 16)); + + if (nehalem_cpu() || westmere_cpu()) + bus_freq = 133330000; /* 133Mhz */ + else + bus_freq = 100000000; /* 100Mhz */ + + /* + * XXXtime + * The ratio should really be based on the virtual TSC frequency as + * opposed to the host TSC. + */ + ratio = (tsc_freq / bus_freq) & 0xff; + + /* + * The register definition is based on the micro-architecture + * but the following bits are always the same: + * [15:8] Maximum Non-Turbo Ratio + * [28] Programmable Ratio Limit for Turbo Mode + * [29] Programmable TDC-TDP Limit for Turbo Mode + * [47:40] Maximum Efficiency Ratio + * + * The other bits can be safely set to 0 on all + * micro-architectures up to Haswell. + */ + platform_info = (ratio << 8) | (ratio << 40); + + /* + * The number of valid bits in the MSR_TURBO_RATIO_LIMITx register is + * dependent on the maximum cores per package supported by the micro- + * architecture. For e.g., Westmere supports 6 cores per package and + * uses the low 48 bits. Sandybridge support 8 cores per package and + * uses up all 64 bits. + * + * However, the unused bits are reserved so we pretend that all bits + * in this MSR are valid. 
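
For reference, the register packing described here can be exercised in isolation; the TSC frequency below is made up, and the loop mirrors the one just after this comment:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint64_t bus_freq = 100000000;          /* 100 MHz bus */
        uint64_t tsc_freq = 2400000000;         /* pretend 2.4 GHz TSC */
        uint64_t ratio = (tsc_freq / bus_freq) & 0xff;  /* 24 */
        uint64_t platform_info = (ratio << 8) | (ratio << 40);
        uint64_t turbo = 0;

        for (int i = 0; i < 8; i++)
                turbo = (turbo << 8) | ratio;

        printf("ratio %llu\n", (unsigned long long)ratio);
        printf("PLATFORM_INFO     %#018llx\n",
            (unsigned long long)platform_info);
        printf("TURBO_RATIO_LIMIT %#018llx\n", (unsigned long long)turbo);
        return (0);
}
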
+ */ + for (i = 0; i < 8; i++) + turbo_ratio_limit = (turbo_ratio_limit << 8) | ratio; +} + +void +vmx_msr_guest_init(struct vmx *vmx, int vcpuid) +{ + uint64_t *guest_msrs; + + guest_msrs = vmx->guest_msrs[vcpuid]; + + /* + * The permissions bitmap is shared between all vcpus so initialize it + * once when initializing the vBSP. + */ + if (vcpuid == 0) { + guest_msr_rw(vmx, MSR_LSTAR); + guest_msr_rw(vmx, MSR_CSTAR); + guest_msr_rw(vmx, MSR_STAR); + guest_msr_rw(vmx, MSR_SF_MASK); + guest_msr_rw(vmx, MSR_KGSBASE); + } + + /* + * Initialize guest IA32_PAT MSR with default value after reset. + */ + guest_msrs[IDX_MSR_PAT] = PAT_VALUE(0, PAT_WRITE_BACK) | + PAT_VALUE(1, PAT_WRITE_THROUGH) | + PAT_VALUE(2, PAT_UNCACHED) | + PAT_VALUE(3, PAT_UNCACHEABLE) | + PAT_VALUE(4, PAT_WRITE_BACK) | + PAT_VALUE(5, PAT_WRITE_THROUGH) | + PAT_VALUE(6, PAT_UNCACHED) | + PAT_VALUE(7, PAT_UNCACHEABLE); + + return; +} + +void +vmx_msr_guest_enter(struct vmx *vmx, int vcpuid) +{ + uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; + +#ifndef __FreeBSD__ + uint64_t *host_msrs = vmx->host_msrs[vcpuid]; + + /* Save host MSRs */ + host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); +#endif /* __FreeBSD__ */ + + /* Save host MSRs (in particular, KGSBASE) and restore guest MSRs */ +#ifdef __FreeBSD__ + update_pcb_bases(curpcb); +#endif + wrmsr(MSR_LSTAR, guest_msrs[IDX_MSR_LSTAR]); + wrmsr(MSR_CSTAR, guest_msrs[IDX_MSR_CSTAR]); + wrmsr(MSR_STAR, guest_msrs[IDX_MSR_STAR]); + wrmsr(MSR_SF_MASK, guest_msrs[IDX_MSR_SF_MASK]); + wrmsr(MSR_KGSBASE, guest_msrs[IDX_MSR_KGSBASE]); +} + +void +vmx_msr_guest_exit(struct vmx *vmx, int vcpuid) +{ + uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; +#ifndef __FreeBSD__ + uint64_t *host_msrs = vmx->host_msrs[vcpuid]; +#endif + + /* Save guest MSRs */ + guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + guest_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + guest_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + guest_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); + guest_msrs[IDX_MSR_KGSBASE] = rdmsr(MSR_KGSBASE); + + /* Restore host MSRs */ + wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]); + wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]); + wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]); + wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]); + + /* MSR_KGSBASE will be restored on the way back to userspace */ +} + +int +vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu) +{ + const uint64_t *guest_msrs; + int error; + + guest_msrs = vmx->guest_msrs[vcpuid]; + error = 0; + + switch (num) { + case MSR_MCG_CAP: + case MSR_MCG_STATUS: + *val = 0; + break; + case MSR_MTRRcap: + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... 
MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + *val = 0; + break; + case MSR_IA32_MISC_ENABLE: + *val = misc_enable; + break; + case MSR_PLATFORM_INFO: + *val = platform_info; + break; + case MSR_TURBO_RATIO_LIMIT: + case MSR_TURBO_RATIO_LIMIT1: + *val = turbo_ratio_limit; + break; + case MSR_PAT: + *val = guest_msrs[IDX_MSR_PAT]; + break; + default: + error = EINVAL; + break; + } + return (error); +} + +int +vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) +{ + uint64_t *guest_msrs; + uint64_t changed; + int error; + + guest_msrs = vmx->guest_msrs[vcpuid]; + error = 0; + + switch (num) { + case MSR_MCG_CAP: + case MSR_MCG_STATUS: + break; /* ignore writes */ + case MSR_MTRRcap: + vm_inject_gp(vmx->vm, vcpuid); + break; + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + break; /* Ignore writes */ + case MSR_IA32_MISC_ENABLE: + changed = val ^ misc_enable; + /* + * If the host has disabled the NX feature then the guest + * also cannot use it. However, a Linux guest will try to + * enable the NX feature by writing to the MISC_ENABLE MSR. + * + * This can be safely ignored because the memory management + * code looks at CPUID.80000001H:EDX.NX to check if the + * functionality is actually enabled. + */ + changed &= ~(1UL << 34); + + /* + * Punt to userspace if any other bits are being modified. + */ + if (changed) + error = EINVAL; + + break; + case MSR_PAT: + if (pat_valid(val)) + guest_msrs[IDX_MSR_PAT] = val; + else + vm_inject_gp(vmx->vm, vcpuid); + break; +#ifdef __FreeBSD__ + case MSR_TSC: + error = vmx_set_tsc_offset(vmx, vcpuid, val - rdtsc()); + break; +#endif /* __FreeBSD__ */ + default: + error = EINVAL; + break; + } + + return (error); +} diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h new file mode 100644 index 0000000000..ac2adb0dd1 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h @@ -0,0 +1,72 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMX_MSR_H_ +#define _VMX_MSR_H_ + +struct vmx; + +void vmx_msr_init(void); +void vmx_msr_guest_init(struct vmx *vmx, int vcpuid); +void vmx_msr_guest_enter(struct vmx *vmx, int vcpuid); +void vmx_msr_guest_exit(struct vmx *vmx, int vcpuid); +int vmx_rdmsr(struct vmx *, int vcpuid, u_int num, uint64_t *val, bool *retu); +int vmx_wrmsr(struct vmx *, int vcpuid, u_int num, uint64_t val, bool *retu); + +uint32_t vmx_revision(void); + +int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, + uint32_t zeros_mask, uint32_t *retval); + +/* + * According to Section 21.10.4 "Software Access to Related Structures", + * changes to data structures pointed to by the VMCS must be made only when + * there is no logical processor with a current VMCS that points to the + * data structure. + * + * This pretty much limits us to configuring the MSR bitmap before VMCS + * initialization for SMP VMs. Unless of course we do it the hard way - which + * would involve some form of synchronization between the vcpus to vmclear + * all VMCSs' that point to the bitmap. + */ +#define MSR_BITMAP_ACCESS_NONE 0x0 +#define MSR_BITMAP_ACCESS_READ 0x1 +#define MSR_BITMAP_ACCESS_WRITE 0x2 +#define MSR_BITMAP_ACCESS_RW (MSR_BITMAP_ACCESS_READ|MSR_BITMAP_ACCESS_WRITE) +void msr_bitmap_initialize(char *bitmap); +int msr_bitmap_change_access(char *bitmap, u_int msr, int access); + +#define guest_msr_rw(vmx, msr) \ + msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW) + +#define guest_msr_ro(vmx, msr) \ + msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_READ) + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s b/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s new file mode 100644 index 0000000000..0130f88dd6 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s @@ -0,0 +1,384 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/asm_linkage.h> +#include <sys/segments.h> + +/* Porting note: This is named 'vmx_support.S' upstream. */ + + + +#if defined(lint) + +struct vmxctx; +struct vmx; + +/*ARGSUSED*/ +void +vmx_launch(struct vmxctx *ctx) +{} + +void +vmx_exit_guest() +{} + +/*ARGSUSED*/ +int +vmx_enter_guest(struct vmxctx *ctx, struct vmx *vmx, int launched) +{ + return (0); +} + +#else /* lint */ + +#include "vmx_assym.h" +#include "vmcs.h" + +/* + * Assumes that %rdi holds a pointer to the 'vmxctx'. + * + * On "return" all registers are updated to reflect guest state. The two + * exceptions are %rip and %rsp. These registers are atomically switched + * by hardware from the guest area of the vmcs. + * + * We modify %rsp to point to the 'vmxctx' so we can use it to restore + * host context in case of an error with 'vmlaunch' or 'vmresume'. + */ +#define VMX_GUEST_RESTORE \ + movq VMXCTX_GUEST_CR2(%rdi),%rsi; \ + movq %rsi,%cr2; \ + movq VMXCTX_GUEST_RSI(%rdi),%rsi; \ + movq VMXCTX_GUEST_RDX(%rdi),%rdx; \ + movq VMXCTX_GUEST_RCX(%rdi),%rcx; \ + movq VMXCTX_GUEST_R8(%rdi),%r8; \ + movq VMXCTX_GUEST_R9(%rdi),%r9; \ + movq VMXCTX_GUEST_RAX(%rdi),%rax; \ + movq VMXCTX_GUEST_RBX(%rdi),%rbx; \ + movq VMXCTX_GUEST_RBP(%rdi),%rbp; \ + movq VMXCTX_GUEST_R10(%rdi),%r10; \ + movq VMXCTX_GUEST_R11(%rdi),%r11; \ + movq VMXCTX_GUEST_R12(%rdi),%r12; \ + movq VMXCTX_GUEST_R13(%rdi),%r13; \ + movq VMXCTX_GUEST_R14(%rdi),%r14; \ + movq VMXCTX_GUEST_R15(%rdi),%r15; \ + movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */ + +#define VMX_GUEST_SAVE \ + movq %rdi, VMXSTK_TMPRDI(%rsp); \ + movq VMXSTK_RDI(%rsp), %rdi; \ + movq %rbp, VMXCTX_GUEST_RBP(%rdi); \ + leaq VMXSTK_FP(%rsp), %rbp; \ + movq %rsi, VMXCTX_GUEST_RSI(%rdi); \ + movq %rdx, VMXCTX_GUEST_RDX(%rdi); \ + movq %rcx, VMXCTX_GUEST_RCX(%rdi); \ + movq %r8, VMXCTX_GUEST_R8(%rdi); \ + movq %r9, VMXCTX_GUEST_R9(%rdi); \ + movq %rax, VMXCTX_GUEST_RAX(%rdi); \ + movq %rbx, VMXCTX_GUEST_RBX(%rdi); \ + movq %r10, VMXCTX_GUEST_R10(%rdi); \ + movq %r11, VMXCTX_GUEST_R11(%rdi); \ + movq %r12, VMXCTX_GUEST_R12(%rdi); \ + movq %r13, VMXCTX_GUEST_R13(%rdi); \ + movq %r14, VMXCTX_GUEST_R14(%rdi); \ + movq %r15, VMXCTX_GUEST_R15(%rdi); \ + movq %cr2, %rbx; \ + movq %rbx, VMXCTX_GUEST_CR2(%rdi); \ + movq VMXSTK_TMPRDI(%rsp), %rdx; \ + movq %rdx, VMXCTX_GUEST_RDI(%rdi); + + +/* + * Flush scratch registers to avoid lingering guest state being used for + * Spectre v1 attacks when returning from guest entry. 
+ */ +#define VMX_GUEST_FLUSH_SCRATCH \ + xorl %edi, %edi; \ + xorl %esi, %esi; \ + xorl %edx, %edx; \ + xorl %ecx, %ecx; \ + xorl %r8d, %r8d; \ + xorl %r9d, %r9d; \ + xorl %r10d, %r10d; \ + xorl %r11d, %r11d; + + +/* Stack layout (offset from %rsp) for vmx_enter_guest */ +#define VMXSTK_TMPRDI 0x00 /* temp store %rdi on vmexit */ +#define VMXSTK_R15 0x08 /* callee saved %r15 */ +#define VMXSTK_R14 0x10 /* callee saved %r14 */ +#define VMXSTK_R13 0x18 /* callee saved %r13 */ +#define VMXSTK_R12 0x20 /* callee saved %r12 */ +#define VMXSTK_RBX 0x28 /* callee saved %rbx */ +#define VMXSTK_RDX 0x30 /* save-args %rdx (int launched) */ +#define VMXSTK_RSI 0x38 /* save-args %rsi (struct vmx *vmx) */ +#define VMXSTK_RDI 0x40 /* save-args %rdi (struct vmxctx *ctx) */ +#define VMXSTK_FP 0x48 /* frame pointer %rbp */ +#define VMXSTKSIZE VMXSTK_FP + +/* + * vmx_enter_guest(struct vmxctx *vmxctx, int launched) + * Interrupts must be disabled on entry. + */ +ENTRY_NP(vmx_enter_guest) + pushq %rbp + movq %rsp, %rbp + subq $VMXSTKSIZE, %rsp + movq %r15, VMXSTK_R15(%rsp) + movq %r14, VMXSTK_R14(%rsp) + movq %r13, VMXSTK_R13(%rsp) + movq %r12, VMXSTK_R12(%rsp) + movq %rbx, VMXSTK_RBX(%rsp) + movq %rdx, VMXSTK_RDX(%rsp) + movq %rsi, VMXSTK_RSI(%rsp) + movq %rdi, VMXSTK_RDI(%rsp) + + movq %rdi, %r12 /* vmxctx */ + movq %rsi, %r13 /* vmx */ + movl %edx, %r14d /* launch state */ + movq VMXCTX_PMAP(%rdi), %rbx + + /* Activate guest pmap on this cpu. */ + leaq PM_ACTIVE(%rbx), %rdi + movl %gs:CPU_ID, %esi + call cpuset_atomic_add + movq %r12, %rdi + + /* + * If 'vmx->eptgen[curcpu]' is not identical to 'pmap->pm_eptgen' + * then we must invalidate all mappings associated with this EPTP. + */ + movq PM_EPTGEN(%rbx), %r10 + movl %gs:CPU_ID, %eax + cmpq %r10, VMX_EPTGEN(%r13, %rax, 8) + je guest_restore + + /* Refresh 'vmx->eptgen[curcpu]' */ + movq %r10, VMX_EPTGEN(%r13, %rax, 8) + + /* Setup the invept descriptor on the host stack */ + pushq $0x0 + pushq VMX_EPTP(%r13) + movl $0x1, %eax /* Single context invalidate */ + invept (%rsp), %rax + leaq 0x10(%rsp), %rsp + jbe invept_error /* Check invept instruction error */ + +guest_restore: + /* Write the current %rsp into the VMCS to be restored on vmexit */ + movl $VMCS_HOST_RSP, %eax + vmwrite %rsp, %rax + jbe vmwrite_error + + /* Check if vmresume is adequate or a full vmlaunch is required */ + cmpl $0, %r14d + je do_launch + + VMX_GUEST_RESTORE + vmresume + /* + * In the common case, 'vmresume' returns back to the host through + * 'vmx_exit_guest'. If there is an error we return VMX_VMRESUME_ERROR + * to the caller. + */ + leaq VMXSTK_FP(%rsp), %rbp + movq VMXSTK_RDI(%rsp), %rdi + movl $VMX_VMRESUME_ERROR, %eax + jmp decode_inst_error + +do_launch: + VMX_GUEST_RESTORE + vmlaunch + /* + * In the common case, 'vmlaunch' returns back to the host through + * 'vmx_exit_guest'. If there is an error we return VMX_VMLAUNCH_ERROR + * to the caller. 
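
The eptgen comparison made above, before guest_restore, is a generation-count scheme: the pmap bumps pm_eptgen whenever nested mappings change, and each host CPU remembers the generation it last flushed. A user-space model of that handshake, with the single-context INVEPT reduced to a printf:

#include <stdint.h>
#include <stdio.h>

#define NCPU    4

static uint64_t pm_eptgen = 1;          /* bumped when EPT mappings change */
static uint64_t cpu_eptgen[NCPU];       /* generation last flushed per CPU */

static void
enter_guest(int cpu)
{
        if (cpu_eptgen[cpu] != pm_eptgen) {
                /* stands in for the single-context INVEPT on this CPU */
                printf("cpu%d: invept, gen %llu -> %llu\n", cpu,
                    (unsigned long long)cpu_eptgen[cpu],
                    (unsigned long long)pm_eptgen);
                cpu_eptgen[cpu] = pm_eptgen;
        } else {
                printf("cpu%d: EPT TLB already current\n", cpu);
        }
        /* VMLAUNCH/VMRESUME would follow */
}

int
main(void)
{
        enter_guest(0);         /* first entry on cpu0: flush */
        enter_guest(0);         /* nothing changed: no flush */
        pm_eptgen++;            /* nested mappings were modified */
        enter_guest(0);         /* flush again */
        enter_guest(1);         /* cpu1 tracks its own generation */
        return (0);
}
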
+ */ + leaq VMXSTK_FP(%rsp), %rbp + movq VMXSTK_RDI(%rsp), %rdi + movl $VMX_VMLAUNCH_ERROR, %eax + jmp decode_inst_error + +vmwrite_error: + movl $VMX_VMWRITE_ERROR, %eax + jmp decode_inst_error +invept_error: + movl $VMX_INVEPT_ERROR, %eax + jmp decode_inst_error +decode_inst_error: + movl $VM_FAIL_VALID, %r11d + jz inst_error + movl $VM_FAIL_INVALID, %r11d +inst_error: + movl %r11d, VMXCTX_INST_FAIL_STATUS(%rdi) + + movq VMXCTX_PMAP(%rdi), %rdi + leaq PM_ACTIVE(%rdi), %rdi + movl %gs:CPU_ID, %esi + movq %rax, %r12 + call cpuset_atomic_del + movq %r12, %rax + + movq VMXSTK_RBX(%rsp), %rbx + movq VMXSTK_R12(%rsp), %r12 + movq VMXSTK_R13(%rsp), %r13 + movq VMXSTK_R14(%rsp), %r14 + movq VMXSTK_R15(%rsp), %r15 + + VMX_GUEST_FLUSH_SCRATCH + + addq $VMXSTKSIZE, %rsp + popq %rbp + ret + +/* + * Non-error VM-exit from the guest. Make this a label so it can + * be used by C code when setting up the VMCS. + * The VMCS-restored %rsp points to the struct vmxctx + */ +.align ASM_ENTRY_ALIGN; +ALTENTRY(vmx_exit_guest) + /* Save guest state that is not automatically saved in the vmcs. */ + VMX_GUEST_SAVE + + /* Deactivate guest pmap on this cpu. */ + movq VMXCTX_PMAP(%rdi), %rdi + leaq PM_ACTIVE(%rdi), %rdi + movl %gs:CPU_ID, %esi + call cpuset_atomic_del + + /* + * This will return to the caller of 'vmx_enter_guest()' with a return + * value of VMX_GUEST_VMEXIT. + */ + movl $VMX_GUEST_VMEXIT, %eax + movq VMXSTK_RBX(%rsp), %rbx + movq VMXSTK_R12(%rsp), %r12 + movq VMXSTK_R13(%rsp), %r13 + movq VMXSTK_R14(%rsp), %r14 + movq VMXSTK_R15(%rsp), %r15 + + VMX_GUEST_FLUSH_SCRATCH + + addq $VMXSTKSIZE, %rsp + popq %rbp + ret +SET_SIZE(vmx_enter_guest) + + + +.align ASM_ENTRY_ALIGN; +ALTENTRY(vmx_exit_guest_flush_rsb) + /* Save guest state that is not automatically saved in the vmcs. */ + VMX_GUEST_SAVE + + /* Deactivate guest pmap on this cpu. */ + movq VMXCTX_PMAP(%rdi), %rdi + leaq PM_ACTIVE(%rdi), %rdi + movl %gs:CPU_ID, %esi + call cpuset_atomic_del + + VMX_GUEST_FLUSH_SCRATCH + + /* + * To prevent malicious branch target predictions from affecting the + * host, overwrite all entries in the RSB upon exiting a guest. + */ + movl $16, %ecx /* 16 iterations, two calls per loop */ + movq %rsp, %rax +loop: + call 2f /* create an RSB entry. */ +1: + pause + call 1b /* capture rogue speculation. */ +2: + call 2f /* create an RSB entry. */ +1: + pause + call 1b /* capture rogue speculation. */ +2: + subl $1, %ecx + jnz loop + movq %rax, %rsp + + /* + * This will return to the caller of 'vmx_enter_guest()' with a return + * value of VMX_GUEST_VMEXIT. + */ + movl $VMX_GUEST_VMEXIT, %eax + movq VMXSTK_RBX(%rsp), %rbx + movq VMXSTK_R12(%rsp), %r12 + movq VMXSTK_R13(%rsp), %r13 + movq VMXSTK_R14(%rsp), %r14 + movq VMXSTK_R15(%rsp), %r15 + + addq $VMXSTKSIZE, %rsp + popq %rbp + ret +SET_SIZE(vmx_exit_guest_flush_rsb) + +/* + * %rdi = trapno + * + * We need to do enough to convince cmnint - and its iretting tail - that we're + * a legit interrupt stack frame. 
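
The frame vmx_call_isr() builds just below can be pictured as a struct: it has the shape of a hardware interrupt frame with an error code, plus the trap number on top, which is what the common interrupt entry expects to unwind. A compile-and-run sketch of that layout:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct fake_intr_frame {
        uint64_t trapno;        /* pushed last by vmx_call_isr */
        uint64_t err;           /* always 0 here */
        uint64_t rip;           /* .iret_dest */
        uint64_t cs;            /* KCS_SEL */
        uint64_t rflags;
        uint64_t rsp;           /* caller %rsp saved in %r11 */
        uint64_t ss;            /* KDS_SEL, pushed first */
};

int
main(void)
{
        printf("frame size %zu bytes, rip at offset %zu\n",
            sizeof (struct fake_intr_frame),
            offsetof(struct fake_intr_frame, rip));
        return (0);
}
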
+ */ +ENTRY_NP(vmx_call_isr) + pushq %rbp + movq %rsp, %rbp + movq %rsp, %r11 + andq $~0xf, %rsp /* align stack */ + pushq $KDS_SEL /* %ss */ + pushq %r11 /* %rsp */ + pushfq /* %rflags */ + pushq $KCS_SEL /* %cs */ + leaq .iret_dest(%rip), %rcx + pushq %rcx /* %rip */ + pushq $0 /* err */ + pushq %rdi /* trapno */ + cli + jmp cmnint /* %rip (and call) */ +.iret_dest: + popq %rbp + ret +SET_SIZE(vmx_call_isr) + +#endif /* lint */ diff --git a/usr/src/uts/i86pc/io/vmm/intel/vtd.c b/usr/src/uts/i86pc/io/vmm/intel/vtd.c new file mode 100644 index 0000000000..902080e34c --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vtd.c @@ -0,0 +1,789 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <dev/pci/pcireg.h> + +#include <machine/vmparam.h> +#include <contrib/dev/acpica/include/acpi.h> + +#include <sys/sunndi.h> + +#include "io/iommu.h" + +/* + * Documented in the "Intel Virtualization Technology for Directed I/O", + * Architecture Spec, September 2008. 
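
As a quick illustration of the capability fields defined just below, a made-up capability value decoded with the same macros:

#include <stdint.h>
#include <stdio.h>

#define VTD_CAP_SAGAW(cap)      (((cap) >> 8) & 0x1F)
#define VTD_CAP_ND(cap)         ((cap) & 0x7)
#define VTD_CAP_CM(cap)         (((cap) >> 7) & 0x1)
#define VTD_CAP_RWBF(cap)       (((cap) >> 4) & 0x1)

int
main(void)
{
        uint64_t cap = 0x00000000C0000666ULL;   /* illustrative value only */

        printf("SAGAW %#llx ND %llu CM %llu RWBF %llu\n",
            (unsigned long long)VTD_CAP_SAGAW(cap),
            (unsigned long long)VTD_CAP_ND(cap),
            (unsigned long long)VTD_CAP_CM(cap),
            (unsigned long long)VTD_CAP_RWBF(cap));
        return (0);
}
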
+ */ + +/* Section 10.4 "Register Descriptions" */ +struct vtdmap { + volatile uint32_t version; + volatile uint32_t res0; + volatile uint64_t cap; + volatile uint64_t ext_cap; + volatile uint32_t gcr; + volatile uint32_t gsr; + volatile uint64_t rta; + volatile uint64_t ccr; +}; + +#define VTD_CAP_SAGAW(cap) (((cap) >> 8) & 0x1F) +#define VTD_CAP_ND(cap) ((cap) & 0x7) +#define VTD_CAP_CM(cap) (((cap) >> 7) & 0x1) +#define VTD_CAP_SPS(cap) (((cap) >> 34) & 0xF) +#define VTD_CAP_RWBF(cap) (((cap) >> 4) & 0x1) + +#define VTD_ECAP_DI(ecap) (((ecap) >> 2) & 0x1) +#define VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1) +#define VTD_ECAP_IRO(ecap) (((ecap) >> 8) & 0x3FF) + +#define VTD_GCR_WBF (1 << 27) +#define VTD_GCR_SRTP (1 << 30) +#define VTD_GCR_TE (1U << 31) + +#define VTD_GSR_WBFS (1 << 27) +#define VTD_GSR_RTPS (1 << 30) +#define VTD_GSR_TES (1U << 31) + +#define VTD_CCR_ICC (1UL << 63) /* invalidate context cache */ +#define VTD_CCR_CIRG_GLOBAL (1UL << 61) /* global invalidation */ + +#define VTD_IIR_IVT (1UL << 63) /* invalidation IOTLB */ +#define VTD_IIR_IIRG_GLOBAL (1ULL << 60) /* global IOTLB invalidation */ +#define VTD_IIR_IIRG_DOMAIN (2ULL << 60) /* domain IOTLB invalidation */ +#define VTD_IIR_IIRG_PAGE (3ULL << 60) /* page IOTLB invalidation */ +#define VTD_IIR_DRAIN_READS (1ULL << 49) /* drain pending DMA reads */ +#define VTD_IIR_DRAIN_WRITES (1ULL << 48) /* drain pending DMA writes */ +#define VTD_IIR_DOMAIN_P 32 + +#define VTD_ROOT_PRESENT 0x1 +#define VTD_CTX_PRESENT 0x1 +#define VTD_CTX_TT_ALL (1UL << 2) + +#define VTD_PTE_RD (1UL << 0) +#define VTD_PTE_WR (1UL << 1) +#define VTD_PTE_SUPERPAGE (1UL << 7) +#define VTD_PTE_ADDR_M (0x000FFFFFFFFFF000UL) + +#define VTD_RID2IDX(rid) (((rid) & 0xff) * 2) + +struct domain { + uint64_t *ptp; /* first level page table page */ + int pt_levels; /* number of page table levels */ + int addrwidth; /* 'AW' field in context entry */ + int spsmask; /* supported super page sizes */ + u_int id; /* domain id */ + vm_paddr_t maxaddr; /* highest address to be mapped */ + SLIST_ENTRY(domain) next; +}; + +static SLIST_HEAD(, domain) domhead; + +#define DRHD_MAX_UNITS 8 +static int drhd_num; +static struct vtdmap *vtdmaps[DRHD_MAX_UNITS]; +static int max_domains; +typedef int (*drhd_ident_func_t)(void); +#ifndef __FreeBSD__ +static dev_info_t *vtddips[DRHD_MAX_UNITS]; +#endif + +static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); +static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); + +static MALLOC_DEFINE(M_VTD, "vtd", "vtd"); + +static int +vtd_max_domains(struct vtdmap *vtdmap) +{ + int nd; + + nd = VTD_CAP_ND(vtdmap->cap); + + switch (nd) { + case 0: + return (16); + case 1: + return (64); + case 2: + return (256); + case 3: + return (1024); + case 4: + return (4 * 1024); + case 5: + return (16 * 1024); + case 6: + return (64 * 1024); + default: + panic("vtd_max_domains: invalid value of nd (0x%0x)", nd); + } +} + +static u_int +domain_id(void) +{ + u_int id; + struct domain *dom; + + /* Skip domain id 0 - it is reserved when Caching Mode field is set */ + for (id = 1; id < max_domains; id++) { + SLIST_FOREACH(dom, &domhead, next) { + if (dom->id == id) + break; + } + if (dom == NULL) + break; /* found it */ + } + + if (id >= max_domains) + panic("domain ids exhausted"); + + return (id); +} + +static void +vtd_wbflush(struct vtdmap *vtdmap) +{ + + if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0) + pmap_invalidate_cache(); + + if (VTD_CAP_RWBF(vtdmap->cap)) { + vtdmap->gcr = VTD_GCR_WBF; + while 
((vtdmap->gsr & VTD_GSR_WBFS) != 0) + ; + } +} + +static void +vtd_ctx_global_invalidate(struct vtdmap *vtdmap) +{ + + vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL; + while ((vtdmap->ccr & VTD_CCR_ICC) != 0) + ; +} + +static void +vtd_iotlb_global_invalidate(struct vtdmap *vtdmap) +{ + int offset; + volatile uint64_t *iotlb_reg, val; + + vtd_wbflush(vtdmap); + + offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16; + iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8); + + *iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL | + VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES; + + while (1) { + val = *iotlb_reg; + if ((val & VTD_IIR_IVT) == 0) + break; + } +} + +static void +vtd_translation_enable(struct vtdmap *vtdmap) +{ + + vtdmap->gcr = VTD_GCR_TE; + while ((vtdmap->gsr & VTD_GSR_TES) == 0) + ; +} + +static void +vtd_translation_disable(struct vtdmap *vtdmap) +{ + + vtdmap->gcr = 0; + while ((vtdmap->gsr & VTD_GSR_TES) != 0) + ; +} + +static void * +vtd_map(dev_info_t *dip) +{ + caddr_t regs; + ddi_acc_handle_t hdl; + int error; + + static ddi_device_acc_attr_t regs_attr = { + DDI_DEVICE_ATTR_V0, + DDI_NEVERSWAP_ACC, + DDI_STRICTORDER_ACC, + }; + + error = ddi_regs_map_setup(dip, 0, ®s, 0, PAGE_SIZE, ®s_attr, + &hdl); + + if (error != DDI_SUCCESS) + return (NULL); + + ddi_set_driver_private(dip, hdl); + + return (regs); +} + +static void +vtd_unmap(dev_info_t *dip) +{ + ddi_acc_handle_t hdl = ddi_get_driver_private(dip); + + if (hdl != NULL) + ddi_regs_map_free(&hdl); +} + +#ifndef __FreeBSD__ +/* + * This lives in vtd_sol.c for license reasons. + */ +extern dev_info_t *vtd_get_dip(ACPI_DMAR_HARDWARE_UNIT *, int); +#endif + +static int +vtd_init(void) +{ + int i, units, remaining; + struct vtdmap *vtdmap; + vm_paddr_t ctx_paddr; + char *end; +#ifdef __FreeBSD__ + char envname[32]; + unsigned long mapaddr; +#endif + ACPI_STATUS status; + ACPI_TABLE_DMAR *dmar; + ACPI_DMAR_HEADER *hdr; + ACPI_DMAR_HARDWARE_UNIT *drhd; + +#ifdef __FreeBSD__ + /* + * Allow the user to override the ACPI DMAR table by specifying the + * physical address of each remapping unit. + * + * The following example specifies two remapping units at + * physical addresses 0xfed90000 and 0xfeda0000 respectively. + * set vtd.regmap.0.addr=0xfed90000 + * set vtd.regmap.1.addr=0xfeda0000 + */ + for (units = 0; units < DRHD_MAX_UNITS; units++) { + snprintf(envname, sizeof(envname), "vtd.regmap.%d.addr", units); + if (getenv_ulong(envname, &mapaddr) == 0) + break; + vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(mapaddr); + } + + if (units > 0) + goto skip_dmar; +#else + units = 0; +#endif + /* Search for DMAR table. */ + status = AcpiGetTable(ACPI_SIG_DMAR, 0, (ACPI_TABLE_HEADER **)&dmar); + if (ACPI_FAILURE(status)) + return (ENXIO); + + end = (char *)dmar + dmar->Header.Length; + remaining = dmar->Header.Length - sizeof(ACPI_TABLE_DMAR); + while (remaining > sizeof(ACPI_DMAR_HEADER)) { + hdr = (ACPI_DMAR_HEADER *)(end - remaining); + if (hdr->Length > remaining) + break; + /* + * From Intel VT-d arch spec, version 1.3: + * BIOS implementations must report mapping structures + * in numerical order, i.e. All remapping structures of + * type 0 (DRHD) enumerated before remapping structures of + * type 1 (RMRR) and so forth. 
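
The surrounding walk over the DMAR remapping structures reduces to stepping through length-prefixed subtables and stopping at the first non-DRHD entry. A minimal model with fabricated table contents (real DRHD structures are larger; only the type/length walk is mirrored):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct dmar_subhdr {
        uint16_t type;
        uint16_t length;
};

int
main(void)
{
        /* Fabricated table body: two 16-byte DRHDs, one 24-byte RMRR. */
        uint8_t body[56];
        struct dmar_subhdr hdr;
        int remaining, units;

        memset(body, 0, sizeof (body));
        hdr.type = 0; hdr.length = 16;
        memcpy(&body[0], &hdr, sizeof (hdr));
        memcpy(&body[16], &hdr, sizeof (hdr));
        hdr.type = 1; hdr.length = 24;
        memcpy(&body[32], &hdr, sizeof (hdr));

        units = 0;
        remaining = sizeof (body);
        while (remaining > (int)sizeof (struct dmar_subhdr)) {
                memcpy(&hdr, &body[sizeof (body) - remaining], sizeof (hdr));
                if (hdr.length > remaining || hdr.type != 0)
                        break;  /* DRHDs are required to come first */
                units++;
                remaining -= hdr.length;
        }
        printf("found %d DRHD unit(s)\n", units);       /* 2 */
        return (0);
}
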
+ */ + if (hdr->Type != ACPI_DMAR_TYPE_HARDWARE_UNIT) + break; + + drhd = (ACPI_DMAR_HARDWARE_UNIT *)hdr; +#ifdef __FreeBSD__ + vtdmaps[units++] = (struct vtdmap *)PHYS_TO_DMAP(drhd->Address); +#else + vtddips[units] = vtd_get_dip(drhd, units); + vtdmaps[units] = (struct vtdmap *)vtd_map(vtddips[units]); + if (vtdmaps[units] == NULL) + goto fail; + units++; +#endif + if (units >= DRHD_MAX_UNITS) + break; + remaining -= hdr->Length; + } + + if (units <= 0) + return (ENXIO); + +#ifdef __FreeBSD__ +skip_dmar: +#endif + drhd_num = units; + vtdmap = vtdmaps[0]; + + if (VTD_CAP_CM(vtdmap->cap) != 0) + panic("vtd_init: invalid caching mode"); + + max_domains = vtd_max_domains(vtdmap); + + /* + * Set up the root-table to point to the context-entry tables + */ + for (i = 0; i < 256; i++) { + ctx_paddr = vtophys(ctx_tables[i]); + if (ctx_paddr & PAGE_MASK) + panic("ctx table (0x%0lx) not page aligned", ctx_paddr); + + root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT; + } + + return (0); + +#ifndef __FreeBSD__ +fail: + for (i = 0; i <= units; i++) + vtd_unmap(vtddips[i]); + return (ENXIO); +#endif +} + +static void +vtd_cleanup(void) +{ +#ifndef __FreeBSD__ + int i; + + KASSERT(SLIST_EMPTY(&domhead), ("domain list not empty")); + + bzero(root_table, sizeof (root_table)); + + for (i = 0; i <= drhd_num; i++) { + vtdmaps[i] = NULL; + /* + * Unmap the vtd registers. Note that the devinfo nodes + * themselves aren't removed, they are considered system state + * and can be reused when the module is reloaded. + */ + if (vtddips[i] != NULL) + vtd_unmap(vtddips[i]); + } +#endif +} + +static void +vtd_enable(void) +{ + int i; + struct vtdmap *vtdmap; + + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_wbflush(vtdmap); + + /* Update the root table address */ + vtdmap->rta = vtophys(root_table); + vtdmap->gcr = VTD_GCR_SRTP; + while ((vtdmap->gsr & VTD_GSR_RTPS) == 0) + ; + + vtd_ctx_global_invalidate(vtdmap); + vtd_iotlb_global_invalidate(vtdmap); + + vtd_translation_enable(vtdmap); + } +} + +static void +vtd_disable(void) +{ + int i; + struct vtdmap *vtdmap; + + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_translation_disable(vtdmap); + } +} + +static void +vtd_add_device(void *arg, uint16_t rid) +{ + int idx; + uint64_t *ctxp; + struct domain *dom = arg; + vm_paddr_t pt_paddr; + struct vtdmap *vtdmap; + uint8_t bus; + + vtdmap = vtdmaps[0]; + bus = PCI_RID2BUS(rid); + ctxp = ctx_tables[bus]; + pt_paddr = vtophys(dom->ptp); + idx = VTD_RID2IDX(rid); + + if (ctxp[idx] & VTD_CTX_PRESENT) { + panic("vtd_add_device: device %x is already owned by " + "domain %d", rid, + (uint16_t)(ctxp[idx + 1] >> 8)); + } + + /* + * Order is important. The 'present' bit is set only after all fields + * of the context pointer are initialized. + */ + ctxp[idx + 1] = dom->addrwidth | (dom->id << 8); + + if (VTD_ECAP_DI(vtdmap->ext_cap)) + ctxp[idx] = VTD_CTX_TT_ALL; + else + ctxp[idx] = 0; + + ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT; + + /* + * 'Not Present' entries are not cached in either the Context Cache + * or in the IOTLB, so there is no need to invalidate either of them. + */ +} + +static void +vtd_remove_device(void *arg, uint16_t rid) +{ + int i, idx; + uint64_t *ctxp; + struct vtdmap *vtdmap; + uint8_t bus; + + bus = PCI_RID2BUS(rid); + ctxp = ctx_tables[bus]; + idx = VTD_RID2IDX(rid); + + /* + * Order is important. The 'present' bit is must be cleared first. + */ + ctxp[idx] = 0; + ctxp[idx + 1] = 0; + + /* + * Invalidate the Context Cache and the IOTLB. 
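+ * Stale cached translations for this device would otherwise survive the context-entry teardown above.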
+ * + * XXX use device-selective invalidation for Context Cache + * XXX use domain-selective invalidation for IOTLB + */ + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_ctx_global_invalidate(vtdmap); + vtd_iotlb_global_invalidate(vtdmap); + } +} + +#define CREATE_MAPPING 0 +#define REMOVE_MAPPING 1 + +static uint64_t +vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len, + int remove) +{ + struct domain *dom; + int i, spshift, ptpshift, ptpindex, nlevels; + uint64_t spsize, *ptp; + + dom = arg; + ptpindex = 0; + ptpshift = 0; + + KASSERT(gpa + len > gpa, ("%s: invalid gpa range %#lx/%#lx", __func__, + gpa, len)); + KASSERT(gpa + len <= dom->maxaddr, ("%s: gpa range %#lx/%#lx beyond " + "domain maxaddr %#lx", __func__, gpa, len, dom->maxaddr)); + + if (gpa & PAGE_MASK) + panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa); + + if (hpa & PAGE_MASK) + panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa); + + if (len & PAGE_MASK) + panic("vtd_create_mapping: unaligned len 0x%0lx", len); + + /* + * Compute the size of the mapping that we can accommodate. + * + * This is based on three factors: + * - supported super page size + * - alignment of the region starting at 'gpa' and 'hpa' + * - length of the region 'len' + */ + spshift = 48; + for (i = 3; i >= 0; i--) { + spsize = 1UL << spshift; + if ((dom->spsmask & (1 << i)) != 0 && + (gpa & (spsize - 1)) == 0 && + (hpa & (spsize - 1)) == 0 && + (len >= spsize)) { + break; + } + spshift -= 9; + } + + ptp = dom->ptp; + nlevels = dom->pt_levels; + while (--nlevels >= 0) { + ptpshift = 12 + nlevels * 9; + ptpindex = (gpa >> ptpshift) & 0x1FF; + + /* We have reached the leaf mapping */ + if (spshift >= ptpshift) { + break; + } + + /* + * We are working on a non-leaf page table page. + * + * Create a downstream page table page if necessary and point + * to it from the current page table. + */ + if (ptp[ptpindex] == 0) { + void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO); + ptp[ptpindex] = vtophys(nlp)| VTD_PTE_RD | VTD_PTE_WR; + } + + ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M); + } + + if ((gpa & ((1UL << ptpshift) - 1)) != 0) + panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift); + + /* + * Update the 'gpa' -> 'hpa' mapping + */ + if (remove) { + ptp[ptpindex] = 0; + } else { + ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR; + + if (nlevels > 0) + ptp[ptpindex] |= VTD_PTE_SUPERPAGE; + } + + return (1UL << ptpshift); +} + +static uint64_t +vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +{ + + return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING)); +} + +static uint64_t +vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len) +{ + + return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING)); +} + +static void +vtd_invalidate_tlb(void *dom) +{ + int i; + struct vtdmap *vtdmap; + + /* + * Invalidate the IOTLB. + * XXX use domain-selective invalidation for IOTLB + */ + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_iotlb_global_invalidate(vtdmap); + } +} + +static void * +vtd_create_domain(vm_paddr_t maxaddr) +{ + struct domain *dom; + vm_paddr_t addr; + int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth; + struct vtdmap *vtdmap; + + if (drhd_num <= 0) + panic("vtd_create_domain: no dma remapping hardware available"); + + vtdmap = vtdmaps[0]; + + /* + * Calculate AGAW. + * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec. 
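+ * The guest address width needed to cover 'maxaddr' is rounded up until (AGAW - 12) is a multiple of 9, i.e. until a whole number of 4KB page-table levels can translate it; a 4GB 'maxaddr', for example, rounds up to an AGAW of 39 and a 3-level table.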
+ */ + addr = 0; + for (gaw = 0; addr < maxaddr; gaw++) + addr = 1ULL << gaw; + + res = (gaw - 12) % 9; + if (res == 0) + agaw = gaw; + else + agaw = gaw + 9 - res; + + if (agaw > 64) + agaw = 64; + + /* + * Select the smallest Supported AGAW and the corresponding number + * of page table levels. + */ + pt_levels = 2; + sagaw = 30; + addrwidth = 0; + tmp = VTD_CAP_SAGAW(vtdmap->cap); + for (i = 0; i < 5; i++) { + if ((tmp & (1 << i)) != 0 && sagaw >= agaw) + break; + pt_levels++; + addrwidth++; + sagaw += 9; + if (sagaw > 64) + sagaw = 64; + } + + if (i >= 5) { + panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d", + VTD_CAP_SAGAW(vtdmap->cap), agaw); + } + + dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK); + dom->pt_levels = pt_levels; + dom->addrwidth = addrwidth; + dom->id = domain_id(); + dom->maxaddr = maxaddr; + dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK); + if ((uintptr_t)dom->ptp & PAGE_MASK) + panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp); + +#ifdef __FreeBSD__ +#ifdef notyet + /* + * XXX superpage mappings for the iommu do not work correctly. + * + * By default all physical memory is mapped into the host_domain. + * When a VM is allocated wired memory the pages belonging to it + * are removed from the host_domain and added to the vm's domain. + * + * If the page being removed was mapped using a superpage mapping + * in the host_domain then we need to demote the mapping before + * removing the page. + * + * There is not any code to deal with the demotion at the moment + * so we disable superpage mappings altogether. + */ + dom->spsmask = VTD_CAP_SPS(vtdmap->cap); +#endif +#else + /* + * On illumos we decidedly do not remove memory mapped to a VM's domain + * from the host_domain, so we don't have to deal with page demotion and + * can just use large pages. + * + * Since VM memory is currently allocated as 4k pages and mapped into + * the VM domain page by page, the use of large pages is essentially + * limited to the host_domain. + */ + dom->spsmask = VTD_CAP_SPS(vtdmap->cap); +#endif + + SLIST_INSERT_HEAD(&domhead, dom, next); + + return (dom); +} + +static void +vtd_free_ptp(uint64_t *ptp, int level) +{ + int i; + uint64_t *nlp; + + if (level > 1) { + for (i = 0; i < 512; i++) { + if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0) + continue; + if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0) + continue; + nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M); + vtd_free_ptp(nlp, level - 1); + } + } + + bzero(ptp, PAGE_SIZE); + free(ptp, M_VTD); +} + +static void +vtd_destroy_domain(void *arg) +{ + struct domain *dom; + + dom = arg; + + SLIST_REMOVE(&domhead, dom, domain, next); + vtd_free_ptp(dom->ptp, dom->pt_levels); + free(dom, M_VTD); +} + +struct iommu_ops iommu_ops_intel = { + vtd_init, + vtd_cleanup, + vtd_enable, + vtd_disable, + vtd_create_domain, + vtd_destroy_domain, + vtd_create_mapping, + vtd_remove_mapping, + vtd_add_device, + vtd_remove_device, + vtd_invalidate_tlb, +}; diff --git a/usr/src/uts/i86pc/io/vmm/intel/vtd_sol.c b/usr/src/uts/i86pc/io/vmm/intel/vtd_sol.c new file mode 100644 index 0000000000..1dbe8ffa48 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vtd_sol.c @@ -0,0 +1,83 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/sunndi.h> +#include <contrib/dev/acpica/include/acpi.h> + +dev_info_t * +vtd_get_dip(ACPI_DMAR_HARDWARE_UNIT *drhd, int unit) +{ + dev_info_t *dip; + struct ddi_parent_private_data *pdptr; + struct regspec reg; + int circ; + + /* + * Try to find an existing devinfo node for this vtd unit. + */ + ndi_devi_enter(ddi_root_node(), &circ); + dip = ddi_find_devinfo("vtd", unit, 0); + ndi_devi_exit(ddi_root_node(), circ); + + if (dip != NULL) + return (dip); + + /* + * None found, construct a devinfo node for this vtd unit. + */ + dip = ddi_add_child(ddi_root_node(), "vtd", + DEVI_SID_NODEID, unit); + + reg.regspec_bustype = 0; + reg.regspec_addr = drhd->Address; + reg.regspec_size = PAGE_SIZE; + + /* + * update the reg properties + * + * reg property will be used for register + * set access + * + * refer to the bus_map of root nexus driver + * I/O or memory mapping: + * + * <bustype=0, addr=x, len=x>: memory + * <bustype=1, addr=x, len=x>: i/o + * <bustype>1, addr=0, len=x>: x86-compatibility i/o + */ + (void) ndi_prop_update_int_array(DDI_DEV_T_NONE, + dip, "reg", (int *)®, + sizeof (struct regspec) / sizeof (int)); + + /* + * This is an artificially constructed dev_info, and we + * need to set a few more things to be able to use it + * for ddi_dma_alloc_handle/free_handle. + */ + ddi_set_driver(dip, ddi_get_driver(ddi_root_node())); + DEVI(dip)->devi_bus_dma_allochdl = + DEVI(ddi_get_driver((ddi_root_node()))); + + pdptr = kmem_zalloc(sizeof (struct ddi_parent_private_data) + + sizeof (struct regspec), KM_SLEEP); + pdptr->par_nreg = 1; + pdptr->par_reg = (struct regspec *)(pdptr + 1); + pdptr->par_reg->regspec_bustype = 0; + pdptr->par_reg->regspec_addr = drhd->Address; + pdptr->par_reg->regspec_size = PAGE_SIZE; + ddi_set_parent_data(dip, pdptr); + + return (dip); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/iommu.c b/usr/src/uts/i86pc/io/vmm/io/iommu.c new file mode 100644 index 0000000000..b949573fe2 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/iommu.c @@ -0,0 +1,383 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/eventhandler.h> +#include <sys/sysctl.h> +#include <sys/systm.h> + +#include <dev/pci/pcivar.h> +#include <dev/pci/pcireg.h> + +#include <machine/cpu.h> +#include <machine/md_var.h> + +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/pci.h> + +#include "vmm_util.h" +#include "vmm_mem.h" +#include "iommu.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, iommu, CTLFLAG_RW, 0, "bhyve iommu parameters"); + +static int iommu_avail; +SYSCTL_INT(_hw_vmm_iommu, OID_AUTO, initialized, CTLFLAG_RD, &iommu_avail, + 0, "bhyve iommu initialized?"); + +static int iommu_enable = 1; +SYSCTL_INT(_hw_vmm_iommu, OID_AUTO, enable, CTLFLAG_RDTUN, &iommu_enable, 0, + "Enable use of I/O MMU (required for PCI passthrough)."); + +static struct iommu_ops *ops; +static void *host_domain; +#ifdef __FreeBSD__ +static eventhandler_tag add_tag, delete_tag; +#endif + +#ifndef __FreeBSD__ +static volatile u_int iommu_initted; +#endif + +static __inline int +IOMMU_INIT(void) +{ + if (ops != NULL) + return ((*ops->init)()); + else + return (ENXIO); +} + +static __inline void +IOMMU_CLEANUP(void) +{ + if (ops != NULL && iommu_avail) + (*ops->cleanup)(); +} + +static __inline void * +IOMMU_CREATE_DOMAIN(vm_paddr_t maxaddr) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->create_domain)(maxaddr)); + else + return (NULL); +} + +static __inline void +IOMMU_DESTROY_DOMAIN(void *dom) +{ + + if (ops != NULL && iommu_avail) + (*ops->destroy_domain)(dom); +} + +static __inline uint64_t +IOMMU_CREATE_MAPPING(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->create_mapping)(domain, gpa, hpa, len)); + else + return (len); /* XXX */ +} + +static __inline uint64_t +IOMMU_REMOVE_MAPPING(void *domain, vm_paddr_t gpa, uint64_t len) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->remove_mapping)(domain, gpa, len)); + else + return (len); /* XXX */ +} + +static __inline void +IOMMU_ADD_DEVICE(void *domain, uint16_t rid) +{ + + if (ops != NULL && iommu_avail) + (*ops->add_device)(domain, rid); +} + +static __inline void +IOMMU_REMOVE_DEVICE(void *domain, uint16_t rid) +{ + + if (ops != NULL && iommu_avail) + (*ops->remove_device)(domain, rid); +} + +static __inline void +IOMMU_INVALIDATE_TLB(void *domain) +{ + + if (ops != NULL && iommu_avail) + (*ops->invalidate_tlb)(domain); +} + +static __inline void +IOMMU_ENABLE(void) +{ + + if (ops != NULL && iommu_avail) + (*ops->enable)(); +} + +static __inline void +IOMMU_DISABLE(void) +{ + + if (ops != NULL && iommu_avail) + (*ops->disable)(); +} + +#ifdef __FreeBSD__ +static void +iommu_pci_add(void *arg, device_t dev) +{ + + /* Add new devices to the host domain. 
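+ * A device that arrives after iommu_init() is given the same 1:1 mapping of host physical memory that boot-time devices received.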
*/ + iommu_add_device(host_domain, pci_get_rid(dev)); +} + +static void +iommu_pci_delete(void *arg, device_t dev) +{ + + iommu_remove_device(host_domain, pci_get_rid(dev)); +} +#endif + +#ifndef __FreeBSD__ +static int +iommu_find_device(dev_info_t *dip, void *arg) +{ + boolean_t add = (boolean_t)arg; + + if (pcie_is_pci_device(dip)) { + if (add) + iommu_add_device(host_domain, pci_get_rid(dip)); + else + iommu_remove_device(host_domain, pci_get_rid(dip)); + } + + return (DDI_WALK_CONTINUE); +} +#endif + +static void +iommu_init(void) +{ + int error, bus, slot, func; + vm_paddr_t maxaddr; +#ifdef __FreeBSD__ + devclass_t dc; +#endif + device_t dev; + + if (!iommu_enable) + return; + + if (vmm_is_intel()) + ops = &iommu_ops_intel; + else if (vmm_is_amd()) + ops = &iommu_ops_amd; + else + ops = NULL; + + error = IOMMU_INIT(); + if (error) + return; + + iommu_avail = 1; + + /* + * Create a domain for the devices owned by the host + */ + maxaddr = vmm_mem_maxaddr(); + host_domain = IOMMU_CREATE_DOMAIN(maxaddr); + if (host_domain == NULL) { + printf("iommu_init: unable to create a host domain"); + IOMMU_CLEANUP(); + ops = NULL; + iommu_avail = 0; + return; + } + + /* + * Create 1:1 mappings from '0' to 'maxaddr' for devices assigned to + * the host + */ + iommu_create_mapping(host_domain, 0, 0, maxaddr); + +#ifdef __FreeBSD__ + add_tag = EVENTHANDLER_REGISTER(pci_add_device, iommu_pci_add, NULL, 0); + delete_tag = EVENTHANDLER_REGISTER(pci_delete_device, iommu_pci_delete, + NULL, 0); + dc = devclass_find("ppt"); + for (bus = 0; bus <= PCI_BUSMAX; bus++) { + for (slot = 0; slot <= PCI_SLOTMAX; slot++) { + for (func = 0; func <= PCI_FUNCMAX; func++) { + dev = pci_find_dbsf(0, bus, slot, func); + if (dev == NULL) + continue; + + /* Skip passthrough devices. */ + if (dc != NULL && + device_get_devclass(dev) == dc) + continue; + + /* + * Everything else belongs to the host + * domain. 
+ */ + iommu_add_device(host_domain, + pci_get_rid(dev)); + } + } + } +#else + ddi_walk_devs(ddi_root_node(), iommu_find_device, (void *)B_TRUE); +#endif + IOMMU_ENABLE(); + +} + +void +iommu_cleanup(void) +{ +#ifdef __FreeBSD__ + if (add_tag != NULL) { + EVENTHANDLER_DEREGISTER(pci_add_device, add_tag); + add_tag = NULL; + } + if (delete_tag != NULL) { + EVENTHANDLER_DEREGISTER(pci_delete_device, delete_tag); + delete_tag = NULL; + } +#else + atomic_store_rel_int(&iommu_initted, 0); +#endif + IOMMU_DISABLE(); +#ifndef __FreeBSD__ + ddi_walk_devs(ddi_root_node(), iommu_find_device, (void *)B_FALSE); +#endif + IOMMU_DESTROY_DOMAIN(host_domain); + IOMMU_CLEANUP(); +#ifndef __FreeBSD__ + ops = NULL; +#endif +} + +void * +iommu_create_domain(vm_paddr_t maxaddr) +{ + if (iommu_initted < 2) { + if (atomic_cmpset_int(&iommu_initted, 0, 1)) { + iommu_init(); + atomic_store_rel_int(&iommu_initted, 2); + } else + while (iommu_initted == 1) + cpu_spinwait(); + } + return (IOMMU_CREATE_DOMAIN(maxaddr)); +} + +void +iommu_destroy_domain(void *dom) +{ + + IOMMU_DESTROY_DOMAIN(dom); +} + +void +iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len) +{ + uint64_t mapped, remaining; + + remaining = len; + + while (remaining > 0) { + mapped = IOMMU_CREATE_MAPPING(dom, gpa, hpa, remaining); + gpa += mapped; + hpa += mapped; + remaining -= mapped; + } +} + +void +iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len) +{ + uint64_t unmapped, remaining; + + remaining = len; + + while (remaining > 0) { + unmapped = IOMMU_REMOVE_MAPPING(dom, gpa, remaining); + gpa += unmapped; + remaining -= unmapped; + } +} + +void * +iommu_host_domain(void) +{ + + return (host_domain); +} + +void +iommu_add_device(void *dom, uint16_t rid) +{ + + IOMMU_ADD_DEVICE(dom, rid); +} + +void +iommu_remove_device(void *dom, uint16_t rid) +{ + + IOMMU_REMOVE_DEVICE(dom, rid); +} + +void +iommu_invalidate_tlb(void *domain) +{ + + IOMMU_INVALIDATE_TLB(domain); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/iommu.h b/usr/src/uts/i86pc/io/vmm/io/iommu.h new file mode 100644 index 0000000000..f8003a5d45 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/iommu.h @@ -0,0 +1,76 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IO_IOMMU_H_ +#define _IO_IOMMU_H_ + +typedef int (*iommu_init_func_t)(void); +typedef void (*iommu_cleanup_func_t)(void); +typedef void (*iommu_enable_func_t)(void); +typedef void (*iommu_disable_func_t)(void); +typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr); +typedef void (*iommu_destroy_domain_t)(void *domain); +typedef uint64_t (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa, + vm_paddr_t hpa, uint64_t len); +typedef uint64_t (*iommu_remove_mapping_t)(void *domain, vm_paddr_t gpa, + uint64_t len); +typedef void (*iommu_add_device_t)(void *domain, uint16_t rid); +typedef void (*iommu_remove_device_t)(void *dom, uint16_t rid); +typedef void (*iommu_invalidate_tlb_t)(void *dom); + +struct iommu_ops { + iommu_init_func_t init; /* module wide */ + iommu_cleanup_func_t cleanup; + iommu_enable_func_t enable; + iommu_disable_func_t disable; + + iommu_create_domain_t create_domain; /* domain-specific */ + iommu_destroy_domain_t destroy_domain; + iommu_create_mapping_t create_mapping; + iommu_remove_mapping_t remove_mapping; + iommu_add_device_t add_device; + iommu_remove_device_t remove_device; + iommu_invalidate_tlb_t invalidate_tlb; +}; + +extern struct iommu_ops iommu_ops_intel; +extern struct iommu_ops iommu_ops_amd; + +void iommu_cleanup(void); +void *iommu_host_domain(void); +void *iommu_create_domain(vm_paddr_t maxaddr); +void iommu_destroy_domain(void *dom); +void iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, + size_t len); +void iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len); +void iommu_add_device(void *dom, uint16_t rid); +void iommu_remove_device(void *dom, uint16_t rid); +void iommu_invalidate_tlb(void *domain); +#endif diff --git a/usr/src/uts/i86pc/io/vmm/io/ppt.c b/usr/src/uts/i86pc/io/vmm/io/ppt.c new file mode 100644 index 0000000000..a71ce86c2d --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/ppt.c @@ -0,0 +1,1436 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/bus.h> +#include <sys/pciio.h> +#include <sys/smp.h> +#include <sys/sysctl.h> + +#include <dev/pci/pcivar.h> +#include <dev/pci/pcireg.h> + +#include <machine/vmm.h> +#include <machine/vmm_dev.h> + +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/stat.h> +#include <sys/sunddi.h> +#include <sys/pci.h> +#include <sys/pci_cap.h> +#include <sys/pcie_impl.h> +#include <sys/ppt_dev.h> +#include <sys/mkdev.h> +#include <sys/sysmacros.h> + +#include "vmm_lapic.h" +#include "vmm_ktr.h" + +#include "iommu.h" +#include "ppt.h" + +#define MAX_MSIMSGS 32 + +/* + * If the MSI-X table is located in the middle of a BAR then that MMIO + * region gets split into two segments - one segment above the MSI-X table + * and the other segment below the MSI-X table - with a hole in place of + * the MSI-X table so accesses to it can be trapped and emulated. + * + * So, allocate a MMIO segment for each BAR register + 1 additional segment. + */ +#define MAX_MMIOSEGS ((PCIR_MAX_BAR_0 + 1) + 1) + +struct pptintr_arg { + struct pptdev *pptdev; + uint64_t addr; + uint64_t msg_data; +}; + +struct pptseg { + vm_paddr_t gpa; + size_t len; + int wired; +}; + +struct pptbar { + uint64_t base; + uint64_t size; + uint_t type; + ddi_acc_handle_t io_handle; + caddr_t io_ptr; +}; + +struct pptdev { + dev_info_t *pptd_dip; + list_node_t pptd_node; + ddi_acc_handle_t pptd_cfg; + struct pptbar pptd_bars[PCI_BASE_NUM]; + struct vm *vm; + struct pptseg mmio[MAX_MMIOSEGS]; + struct { + int num_msgs; /* guest state */ + boolean_t is_fixed; + size_t inth_sz; + ddi_intr_handle_t *inth; + struct pptintr_arg arg[MAX_MSIMSGS]; + } msi; + + struct { + int num_msgs; + size_t inth_sz; + size_t arg_sz; + ddi_intr_handle_t *inth; + struct pptintr_arg *arg; + } msix; +}; + + +static major_t ppt_major; +static void *ppt_state; +static kmutex_t pptdev_mtx; +static list_t pptdev_list; + +#define PPT_MINOR_NAME "ppt" + +static ddi_device_acc_attr_t ppt_attr = { + DDI_DEVICE_ATTR_V0, + DDI_NEVERSWAP_ACC, + DDI_STORECACHING_OK_ACC, + DDI_DEFAULT_ACC +}; + +static int +ppt_open(dev_t *devp, int flag, int otyp, cred_t *cr) +{ + /* XXX: require extra privs? 
*/ + return (0); +} + +#define BAR_TO_IDX(bar) (((bar) - PCI_CONF_BASE0) / PCI_BAR_SZ_32) +#define BAR_VALID(b) ( \ + (b) >= PCI_CONF_BASE0 && \ + (b) <= PCI_CONF_BASE5 && \ + ((b) & (PCI_BAR_SZ_32-1)) == 0) + +static int +ppt_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) +{ + minor_t minor = getminor(dev); + struct pptdev *ppt; + void *data = (void *)arg; + + if ((ppt = ddi_get_soft_state(ppt_state, minor)) == NULL) { + return (ENOENT); + } + + switch (cmd) { + case PPT_CFG_READ: { + struct ppt_cfg_io cio; + ddi_acc_handle_t cfg = ppt->pptd_cfg; + + if (ddi_copyin(data, &cio, sizeof (cio), md) != 0) { + return (EFAULT); + } + switch (cio.pci_width) { + case 4: + cio.pci_data = pci_config_get32(cfg, cio.pci_off); + break; + case 2: + cio.pci_data = pci_config_get16(cfg, cio.pci_off); + break; + case 1: + cio.pci_data = pci_config_get8(cfg, cio.pci_off); + break; + default: + return (EINVAL); + } + + if (ddi_copyout(&cio, data, sizeof (cio), md) != 0) { + return (EFAULT); + } + return (0); + } + case PPT_CFG_WRITE: { + struct ppt_cfg_io cio; + ddi_acc_handle_t cfg = ppt->pptd_cfg; + + if (ddi_copyin(data, &cio, sizeof (cio), md) != 0) { + return (EFAULT); + } + switch (cio.pci_width) { + case 4: + pci_config_put32(cfg, cio.pci_off, cio.pci_data); + break; + case 2: + pci_config_put16(cfg, cio.pci_off, cio.pci_data); + break; + case 1: + pci_config_put8(cfg, cio.pci_off, cio.pci_data); + break; + default: + return (EINVAL); + } + + return (0); + } + case PPT_BAR_QUERY: { + struct ppt_bar_query barg; + struct pptbar *pbar; + + if (ddi_copyin(data, &barg, sizeof (barg), md) != 0) { + return (EFAULT); + } + if (barg.pbq_baridx >= PCI_BASE_NUM) { + return (EINVAL); + } + pbar = &ppt->pptd_bars[barg.pbq_baridx]; + + if (pbar->base == 0 || pbar->size == 0) { + return (ENOENT); + } + barg.pbq_type = pbar->type; + barg.pbq_base = pbar->base; + barg.pbq_size = pbar->size; + + if (ddi_copyout(&barg, data, sizeof (barg), md) != 0) { + return (EFAULT); + } + return (0); + } + case PPT_BAR_READ: { + struct ppt_bar_io bio; + struct pptbar *pbar; + void *addr; + uint_t rnum; + ddi_acc_handle_t cfg; + + if (ddi_copyin(data, &bio, sizeof (bio), md) != 0) { + return (EFAULT); + } + rnum = bio.pbi_bar; + if (rnum >= PCI_BASE_NUM) { + return (EINVAL); + } + pbar = &ppt->pptd_bars[rnum]; + if (pbar->type != PCI_ADDR_IO || pbar->io_handle == NULL) { + return (EINVAL); + } + addr = pbar->io_ptr + bio.pbi_off; + + switch (bio.pbi_width) { + case 4: + bio.pbi_data = ddi_get32(pbar->io_handle, addr); + break; + case 2: + bio.pbi_data = ddi_get16(pbar->io_handle, addr); + break; + case 1: + bio.pbi_data = ddi_get8(pbar->io_handle, addr); + break; + default: + return (EINVAL); + } + + if (ddi_copyout(&bio, data, sizeof (bio), md) != 0) { + return (EFAULT); + } + return (0); + } + case PPT_BAR_WRITE: { + struct ppt_bar_io bio; + struct pptbar *pbar; + void *addr; + uint_t rnum; + ddi_acc_handle_t cfg; + + if (ddi_copyin(data, &bio, sizeof (bio), md) != 0) { + return (EFAULT); + } + rnum = bio.pbi_bar; + if (rnum >= PCI_BASE_NUM) { + return (EINVAL); + } + pbar = &ppt->pptd_bars[rnum]; + if (pbar->type != PCI_ADDR_IO || pbar->io_handle == NULL) { + return (EINVAL); + } + addr = pbar->io_ptr + bio.pbi_off; + + switch (bio.pbi_width) { + case 4: + ddi_put32(pbar->io_handle, addr, bio.pbi_data); + break; + case 2: + ddi_put16(pbar->io_handle, addr, bio.pbi_data); + break; + case 1: + ddi_put8(pbar->io_handle, addr, bio.pbi_data); + break; + default: + return (EINVAL); + } + + return (0); + } + + 
default: + return (ENOTTY); + } + + return (0); +} + +static int +ppt_find_pba_bar(struct pptdev *ppt) +{ + uint16_t base; + uint32_t pba_off; + + if (PCI_CAP_LOCATE(ppt->pptd_cfg, PCI_CAP_ID_MSI_X, &base) != + DDI_SUCCESS) + return (-1); + + pba_off = pci_config_get32(ppt->pptd_cfg, base + PCI_MSIX_PBA_OFFSET); + + if (pba_off == PCI_EINVAL32) + return (-1); + + return (pba_off & PCI_MSIX_PBA_BIR_MASK); +} + +static int +ppt_devmap(dev_t dev, devmap_cookie_t dhp, offset_t off, size_t len, + size_t *maplen, uint_t model) +{ + minor_t minor; + struct pptdev *ppt; + int err; + int bar; + + minor = getminor(dev); + + if ((ppt = ddi_get_soft_state(ppt_state, minor)) == NULL) + return (ENXIO); + +#ifdef _MULTI_DATAMODEL + if (ddi_model_convert_from(model) != DDI_MODEL_NONE) + return (ENXIO); +#endif + + if (off < 0 || off != P2ALIGN(off, PAGESIZE)) + return (EINVAL); + + if ((bar = ppt_find_pba_bar(ppt)) == -1) + return (EINVAL); + + /* + * Add 1 to the BAR number to get the register number used by DDI. + * Register 0 corresponds to PCI config space, the PCI BARs start at 1. + */ + bar += 1; + + err = devmap_devmem_setup(dhp, ppt->pptd_dip, NULL, bar, off, len, + PROT_USER | PROT_READ | PROT_WRITE, IOMEM_DATA_CACHED, &ppt_attr); + + if (err == DDI_SUCCESS) + *maplen = len; + + return (err); +} + + +static void +ppt_bar_wipe(struct pptdev *ppt) +{ + uint_t i; + + for (i = 0; i < PCI_BASE_NUM; i++) { + struct pptbar *pbar = &ppt->pptd_bars[i]; + if (pbar->type == PCI_ADDR_IO && pbar->io_handle != NULL) { + ddi_regs_map_free(&pbar->io_handle); + } + } + bzero(&ppt->pptd_bars, sizeof (ppt->pptd_bars)); +} + +static int +ppt_bar_crawl(struct pptdev *ppt) +{ + pci_regspec_t *regs; + uint_t rcount, i; + int err = 0, rlen; + + if (ddi_getlongprop(DDI_DEV_T_ANY, ppt->pptd_dip, DDI_PROP_DONTPASS, + "assigned-addresses", (caddr_t)®s, &rlen) != DDI_PROP_SUCCESS) { + return (EIO); + } + + VERIFY3S(rlen, >, 0); + rcount = rlen / sizeof (pci_regspec_t); + for (i = 0; i < rcount; i++) { + pci_regspec_t *reg = ®s[i]; + struct pptbar *pbar; + uint_t bar, rnum; + + DTRACE_PROBE1(ppt__crawl__reg, pci_regspec_t *, reg); + bar = PCI_REG_REG_G(reg->pci_phys_hi); + if (!BAR_VALID(bar)) { + continue; + } + + rnum = BAR_TO_IDX(bar); + pbar = &ppt->pptd_bars[rnum]; + /* is this somehow already populated? 
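+ * The same BAR appearing twice in "assigned-addresses" would mean the property is malformed, so fail rather than guess.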
*/ + if (pbar->base != 0 || pbar->size != 0) { + err = EEXIST; + break; + } + + pbar->type = reg->pci_phys_hi & PCI_ADDR_MASK; + pbar->base = ((uint64_t)reg->pci_phys_mid << 32) | + (uint64_t)reg->pci_phys_low; + pbar->size = ((uint64_t)reg->pci_size_hi << 32) | + (uint64_t)reg->pci_size_low; + if (pbar->type == PCI_ADDR_IO) { + err = ddi_regs_map_setup(ppt->pptd_dip, rnum, + &pbar->io_ptr, 0, 0, &ppt_attr, &pbar->io_handle); + if (err != 0) { + break; + } + } + } + kmem_free(regs, rlen); + + if (err != 0) { + ppt_bar_wipe(ppt); + } + return (err); +} + +static boolean_t +ppt_bar_verify_mmio(struct pptdev *ppt, uint64_t base, uint64_t size) +{ + const uint64_t map_end = base + size; + + /* Zero-length or overflow mappings are not valid */ + if (map_end <= base) { + return (B_FALSE); + } + /* MMIO bounds should be page-aligned */ + if ((base & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) { + return (B_FALSE); + } + + for (uint_t i = 0; i < PCI_BASE_NUM; i++) { + const struct pptbar *bar = &ppt->pptd_bars[i]; + const uint64_t bar_end = bar->base + bar->size; + + /* Only memory BARs can be mapped */ + if (bar->type != PCI_ADDR_MEM32 && + bar->type != PCI_ADDR_MEM64) { + continue; + } + + /* Does the mapping fit within this BAR? */ + if (base < bar->base || base >= bar_end || + map_end < bar->base || map_end > bar_end) { + continue; + } + + /* This BAR satisfies the provided map */ + return (B_TRUE); + } + return (B_FALSE); +} + +static int +ppt_ddi_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + struct pptdev *ppt = NULL; + char name[PPT_MAXNAMELEN]; + int inst; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + inst = ddi_get_instance(dip); + + if (ddi_soft_state_zalloc(ppt_state, inst) != DDI_SUCCESS) { + goto fail; + } + VERIFY(ppt = ddi_get_soft_state(ppt_state, inst)); + ppt->pptd_dip = dip; + ddi_set_driver_private(dip, ppt); + + if (pci_config_setup(dip, &ppt->pptd_cfg) != DDI_SUCCESS) { + goto fail; + } + if (ppt_bar_crawl(ppt) != 0) { + goto fail; + } + if (ddi_create_minor_node(dip, PPT_MINOR_NAME, S_IFCHR, inst, + DDI_PSEUDO, 0) != DDI_SUCCESS) { + goto fail; + } + + mutex_enter(&pptdev_mtx); + list_insert_tail(&pptdev_list, ppt); + mutex_exit(&pptdev_mtx); + + return (DDI_SUCCESS); + +fail: + if (ppt != NULL) { + ddi_remove_minor_node(dip, NULL); + if (ppt->pptd_cfg != NULL) { + pci_config_teardown(&ppt->pptd_cfg); + } + ppt_bar_wipe(ppt); + ddi_soft_state_free(ppt_state, inst); + } + return (DDI_FAILURE); +} + +static int +ppt_ddi_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + struct pptdev *ppt; + int inst; + + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + ppt = ddi_get_driver_private(dip); + inst = ddi_get_instance(dip); + + ASSERT3P(ddi_get_soft_state(ppt_state, inst), ==, ppt); + + mutex_enter(&pptdev_mtx); + if (ppt->vm != NULL) { + mutex_exit(&pptdev_mtx); + return (DDI_FAILURE); + } + list_remove(&pptdev_list, ppt); + mutex_exit(&pptdev_mtx); + + ddi_remove_minor_node(dip, PPT_MINOR_NAME); + ppt_bar_wipe(ppt); + pci_config_teardown(&ppt->pptd_cfg); + ddi_set_driver_private(dip, NULL); + ddi_soft_state_free(ppt_state, inst); + + return (DDI_SUCCESS); +} + +static int +ppt_ddi_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) +{ + int error = DDI_FAILURE; + int inst = getminor((dev_t)arg); + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: { + struct pptdev *ppt = ddi_get_soft_state(ppt_state, inst); + + if (ppt != NULL) { + *result = (void *)ppt->pptd_dip; + error = DDI_SUCCESS; + } + break; + } + case DDI_INFO_DEVT2INSTANCE: { + *result = 
(void *)(uintptr_t)inst; + error = DDI_SUCCESS; + break; + } + default: + break; + } + return (error); +} + +static struct cb_ops ppt_cb_ops = { + ppt_open, + nulldev, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + ppt_ioctl, + ppt_devmap, /* devmap */ + NULL, /* mmap */ + NULL, /* segmap */ + nochpoll, /* poll */ + ddi_prop_op, + NULL, + D_NEW | D_MP | D_64BIT | D_DEVMAP, + CB_REV +}; + +static struct dev_ops ppt_ops = { + DEVO_REV, + 0, + ppt_ddi_info, + nulldev, /* identify */ + nulldev, /* probe */ + ppt_ddi_attach, + ppt_ddi_detach, + nodev, /* reset */ + &ppt_cb_ops, + (struct bus_ops *)NULL +}; + +static struct modldrv modldrv = { + &mod_driverops, + "bhyve pci pass-thru", + &ppt_ops +}; + +static struct modlinkage modlinkage = { + MODREV_1, + &modldrv, + NULL +}; + +int +_init(void) +{ + int error; + + mutex_init(&pptdev_mtx, NULL, MUTEX_DRIVER, NULL); + list_create(&pptdev_list, sizeof (struct pptdev), + offsetof(struct pptdev, pptd_node)); + + error = ddi_soft_state_init(&ppt_state, sizeof (struct pptdev), 0); + if (error) { + goto fail; + } + + error = mod_install(&modlinkage); + + ppt_major = ddi_name_to_major("ppt"); +fail: + if (error) { + ddi_soft_state_fini(&ppt_state); + } + return (error); +} + +int +_fini(void) +{ + int error; + + error = mod_remove(&modlinkage); + if (error) + return (error); + ddi_soft_state_fini(&ppt_state); + + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +static boolean_t +ppt_wait_for_pending_txn(dev_info_t *dip, uint_t max_delay_us) +{ + uint16_t cap_ptr, devsts; + ddi_acc_handle_t hdl; + + if (pci_config_setup(dip, &hdl) != DDI_SUCCESS) + return (B_FALSE); + + if (PCI_CAP_LOCATE(hdl, PCI_CAP_ID_PCI_E, &cap_ptr) != DDI_SUCCESS) { + pci_config_teardown(&hdl); + return (B_FALSE); + } + + devsts = PCI_CAP_GET16(hdl, 0, cap_ptr, PCIE_DEVSTS); + while ((devsts & PCIE_DEVSTS_TRANS_PENDING) != 0) { + if (max_delay_us == 0) { + pci_config_teardown(&hdl); + return (B_FALSE); + } + + /* Poll once every 100 milliseconds up to the timeout. 
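+ * The remaining budget is decremented on every sleep, so a device that never goes idle cannot stall the caller beyond the timeout.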
*/ + if (max_delay_us > 100000) { + delay(drv_usectohz(100000)); + max_delay_us -= 100000; + } else { + delay(drv_usectohz(max_delay_us)); + max_delay_us = 0; + } + devsts = PCI_CAP_GET16(hdl, 0, cap_ptr, PCIE_DEVSTS); + } + + pci_config_teardown(&hdl); + return (B_TRUE); +} + +static uint_t +ppt_max_completion_tmo_us(dev_info_t *dip) +{ + uint_t timo = 0; + uint16_t cap_ptr; + ddi_acc_handle_t hdl; + uint_t timo_ranges[] = { /* timeout ranges */ + 50000, /* 50ms */ + 100, /* 100us */ + 10000, /* 10ms */ + 0, + 0, + 55000, /* 55ms */ + 210000, /* 210ms */ + 0, + 0, + 900000, /* 900ms */ + 3500000, /* 3.5s */ + 0, + 0, + 13000000, /* 13s */ + 64000000, /* 64s */ + 0 + }; + + if (pci_config_setup(dip, &hdl) != DDI_SUCCESS) + return (50000); /* default 50ms */ + + if (PCI_CAP_LOCATE(hdl, PCI_CAP_ID_PCI_E, &cap_ptr) != DDI_SUCCESS) + goto out; + + if ((PCI_CAP_GET16(hdl, 0, cap_ptr, PCIE_PCIECAP) & + PCIE_PCIECAP_VER_MASK) < PCIE_PCIECAP_VER_2_0) + goto out; + + if ((PCI_CAP_GET16(hdl, 0, cap_ptr, PCIE_DEVCAP2) & + PCIE_DEVCTL2_COM_TO_RANGE_MASK) == 0) + goto out; + + timo = timo_ranges[PCI_CAP_GET16(hdl, 0, cap_ptr, PCIE_DEVCTL2) & + PCIE_DEVCAP2_COM_TO_RANGE_MASK]; + +out: + if (timo == 0) + timo = 50000; /* default 50ms */ + + pci_config_teardown(&hdl); + return (timo); +} + +static boolean_t +ppt_flr(dev_info_t *dip, boolean_t force) +{ + uint16_t cap_ptr, ctl, cmd; + ddi_acc_handle_t hdl; + uint_t compl_delay = 0, max_delay_us; + + if (pci_config_setup(dip, &hdl) != DDI_SUCCESS) + return (B_FALSE); + + if (PCI_CAP_LOCATE(hdl, PCI_CAP_ID_PCI_E, &cap_ptr) != DDI_SUCCESS) + goto fail; + + if ((PCI_CAP_GET16(hdl, 0, cap_ptr, PCIE_DEVCAP) & PCIE_DEVCAP_FLR) + == 0) + goto fail; + + max_delay_us = MAX(ppt_max_completion_tmo_us(dip), 10000); + + /* + * Disable busmastering to prevent generation of new transactions while + * waiting for the device to go idle. If the idle timeout fails, the + * command register is restored which will re-enable busmastering. + */ + cmd = pci_config_get16(hdl, PCI_CONF_COMM); + pci_config_put16(hdl, PCI_CONF_COMM, cmd & ~PCI_COMM_ME); + if (!ppt_wait_for_pending_txn(dip, max_delay_us)) { + if (!force) { + pci_config_put16(hdl, PCI_CONF_COMM, cmd); + goto fail; + } + dev_err(dip, CE_WARN, + "?Resetting with transactions pending after %u us\n", + max_delay_us); + + /* + * Extend the post-FLR delay to cover the maximum Completion + * Timeout delay of anything in flight during the FLR delay. + * Enforce a minimum delay of at least 10ms. + */ + compl_delay = MAX(10, (ppt_max_completion_tmo_us(dip) / 1000)); + } + + /* Initiate the reset. */ + ctl = PCI_CAP_GET16(hdl, 0, cap_ptr, PCIE_DEVCTL); + (void) PCI_CAP_PUT16(hdl, 0, cap_ptr, PCIE_DEVCTL, + ctl | PCIE_DEVCTL_INITIATE_FLR); + + /* Wait for at least 100ms */ + delay(drv_usectohz((100 + compl_delay) * 1000)); + + pci_config_teardown(&hdl); + return (B_TRUE); + +fail: + /* + * TODO: If the FLR fails for some reason, we should attempt a reset + * using the PCI power management facilities (if possible). 
+ */ + pci_config_teardown(&hdl); + return (B_FALSE); +} + + +static struct pptdev * +ppt_findf(int fd) +{ + struct pptdev *ppt = NULL; + file_t *fp; + vattr_t va; + + if ((fp = getf(fd)) == NULL) { + return (NULL); + } + + va.va_mask = AT_RDEV; + if (VOP_GETATTR(fp->f_vnode, &va, NO_FOLLOW, fp->f_cred, NULL) != 0 || + getmajor(va.va_rdev) != ppt_major) + goto fail; + + ppt = ddi_get_soft_state(ppt_state, getminor(va.va_rdev)); + + if (ppt != NULL) + return (ppt); + +fail: + releasef(fd); + return (NULL); +} + +static void +ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt) +{ + int i; + struct pptseg *seg; + + for (i = 0; i < MAX_MMIOSEGS; i++) { + seg = &ppt->mmio[i]; + if (seg->len == 0) + continue; + (void) vm_unmap_mmio(vm, seg->gpa, seg->len); + bzero(seg, sizeof (struct pptseg)); + } +} + +static void +ppt_teardown_msi(struct pptdev *ppt) +{ + int i; + + if (ppt->msi.num_msgs == 0) + return; + + for (i = 0; i < ppt->msi.num_msgs; i++) { + int intr_cap; + + (void) ddi_intr_get_cap(ppt->msi.inth[i], &intr_cap); + if (intr_cap & DDI_INTR_FLAG_BLOCK) + ddi_intr_block_disable(&ppt->msi.inth[i], 1); + else + ddi_intr_disable(ppt->msi.inth[i]); + + ddi_intr_remove_handler(ppt->msi.inth[i]); + ddi_intr_free(ppt->msi.inth[i]); + + ppt->msi.inth[i] = NULL; + } + + kmem_free(ppt->msi.inth, ppt->msi.inth_sz); + ppt->msi.inth = NULL; + ppt->msi.inth_sz = 0; + ppt->msi.is_fixed = B_FALSE; + + ppt->msi.num_msgs = 0; +} + +static void +ppt_teardown_msix_intr(struct pptdev *ppt, int idx) +{ + if (ppt->msix.inth != NULL && ppt->msix.inth[idx] != NULL) { + int intr_cap; + + (void) ddi_intr_get_cap(ppt->msix.inth[idx], &intr_cap); + if (intr_cap & DDI_INTR_FLAG_BLOCK) + ddi_intr_block_disable(&ppt->msix.inth[idx], 1); + else + ddi_intr_disable(ppt->msix.inth[idx]); + + ddi_intr_remove_handler(ppt->msix.inth[idx]); + } +} + +static void +ppt_teardown_msix(struct pptdev *ppt) +{ + uint_t i; + + if (ppt->msix.num_msgs == 0) + return; + + for (i = 0; i < ppt->msix.num_msgs; i++) + ppt_teardown_msix_intr(ppt, i); + + if (ppt->msix.inth) { + for (i = 0; i < ppt->msix.num_msgs; i++) + ddi_intr_free(ppt->msix.inth[i]); + kmem_free(ppt->msix.inth, ppt->msix.inth_sz); + ppt->msix.inth = NULL; + ppt->msix.inth_sz = 0; + kmem_free(ppt->msix.arg, ppt->msix.arg_sz); + ppt->msix.arg = NULL; + ppt->msix.arg_sz = 0; + } + + ppt->msix.num_msgs = 0; +} + +int +ppt_assigned_devices(struct vm *vm) +{ + struct pptdev *ppt; + uint_t num = 0; + + mutex_enter(&pptdev_mtx); + for (ppt = list_head(&pptdev_list); ppt != NULL; + ppt = list_next(&pptdev_list, ppt)) { + if (ppt->vm == vm) { + num++; + } + } + mutex_exit(&pptdev_mtx); + return (num); +} + +boolean_t +ppt_is_mmio(struct vm *vm, vm_paddr_t gpa) +{ + struct pptdev *ppt = list_head(&pptdev_list); + + /* XXX: this should probably be restructured to avoid the lock */ + mutex_enter(&pptdev_mtx); + for (ppt = list_head(&pptdev_list); ppt != NULL; + ppt = list_next(&pptdev_list, ppt)) { + if (ppt->vm != vm) { + continue; + } + + for (uint_t i = 0; i < MAX_MMIOSEGS; i++) { + struct pptseg *seg = &ppt->mmio[i]; + + if (seg->len == 0) + continue; + if (gpa >= seg->gpa && gpa < seg->gpa + seg->len) { + mutex_exit(&pptdev_mtx); + return (B_TRUE); + } + } + } + + mutex_exit(&pptdev_mtx); + return (B_FALSE); +} + +int +ppt_assign_device(struct vm *vm, int pptfd) +{ + struct pptdev *ppt; + int err = 0; + + mutex_enter(&pptdev_mtx); + ppt = ppt_findf(pptfd); + if (ppt == NULL) { + mutex_exit(&pptdev_mtx); + return (EBADF); + } + + /* Only one VM may own a device at any given time */ + 
if (ppt->vm != NULL && ppt->vm != vm) { + err = EBUSY; + goto done; + } + + if (pci_save_config_regs(ppt->pptd_dip) != DDI_SUCCESS) { + err = EIO; + goto done; + } + ppt_flr(ppt->pptd_dip, B_TRUE); + + /* + * Restore the device state after reset and then perform another save + * so the "pristine" state can be restored when the device is removed + * from the guest. + */ + if (pci_restore_config_regs(ppt->pptd_dip) != DDI_SUCCESS || + pci_save_config_regs(ppt->pptd_dip) != DDI_SUCCESS) { + err = EIO; + goto done; + } + + ppt->vm = vm; + iommu_remove_device(iommu_host_domain(), pci_get_bdf(ppt->pptd_dip)); + iommu_add_device(vm_iommu_domain(vm), pci_get_bdf(ppt->pptd_dip)); + pf_set_passthru(ppt->pptd_dip, B_TRUE); + +done: + releasef(pptfd); + mutex_exit(&pptdev_mtx); + return (err); +} + +static void +ppt_reset_pci_power_state(dev_info_t *dip) +{ + ddi_acc_handle_t cfg; + uint16_t cap_ptr; + + if (pci_config_setup(dip, &cfg) != DDI_SUCCESS) + return; + + if (PCI_CAP_LOCATE(cfg, PCI_CAP_ID_PM, &cap_ptr) == DDI_SUCCESS) { + uint16_t val; + + val = PCI_CAP_GET16(cfg, 0, cap_ptr, PCI_PMCSR); + if ((val & PCI_PMCSR_STATE_MASK) != PCI_PMCSR_D0) { + val = (val & ~PCI_PMCSR_STATE_MASK) | PCI_PMCSR_D0; + (void) PCI_CAP_PUT16(cfg, 0, cap_ptr, PCI_PMCSR, + val); + } + } + + pci_config_teardown(&cfg); +} + +static void +ppt_do_unassign(struct pptdev *ppt) +{ + struct vm *vm = ppt->vm; + + ASSERT3P(vm, !=, NULL); + ASSERT(MUTEX_HELD(&pptdev_mtx)); + + + ppt_flr(ppt->pptd_dip, B_TRUE); + + /* + * Restore from the state saved during device assignment. + * If the device power state has been altered, that must be remedied + * first, as it will reset register state during the transition. + */ + ppt_reset_pci_power_state(ppt->pptd_dip); + (void) pci_restore_config_regs(ppt->pptd_dip); + + pf_set_passthru(ppt->pptd_dip, B_FALSE); + + ppt_unmap_mmio(vm, ppt); + ppt_teardown_msi(ppt); + ppt_teardown_msix(ppt); + iommu_remove_device(vm_iommu_domain(vm), pci_get_bdf(ppt->pptd_dip)); + iommu_add_device(iommu_host_domain(), pci_get_bdf(ppt->pptd_dip)); + ppt->vm = NULL; +} + +int +ppt_unassign_device(struct vm *vm, int pptfd) +{ + struct pptdev *ppt; + int err = 0; + + mutex_enter(&pptdev_mtx); + ppt = ppt_findf(pptfd); + if (ppt == NULL) { + mutex_exit(&pptdev_mtx); + return (EBADF); + } + + /* If this device is not owned by this 'vm' then bail out. */ + if (ppt->vm != vm) { + err = EBUSY; + goto done; + } + ppt_do_unassign(ppt); + +done: + releasef(pptfd); + mutex_exit(&pptdev_mtx); + return (err); +} + +int +ppt_unassign_all(struct vm *vm) +{ + struct pptdev *ppt; + + mutex_enter(&pptdev_mtx); + for (ppt = list_head(&pptdev_list); ppt != NULL; + ppt = list_next(&pptdev_list, ppt)) { + if (ppt->vm == vm) { + ppt_do_unassign(ppt); + } + } + mutex_exit(&pptdev_mtx); + + return (0); +} + +int +ppt_map_mmio(struct vm *vm, int pptfd, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa) +{ + struct pptdev *ppt; + int err = 0; + + mutex_enter(&pptdev_mtx); + ppt = ppt_findf(pptfd); + if (ppt == NULL) { + mutex_exit(&pptdev_mtx); + return (EBADF); + } + if (ppt->vm != vm) { + err = EBUSY; + goto done; + } + + /* + * Ensure that the host-physical range of the requested mapping fits + * within one of the MMIO BARs of the device. 
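+ * This keeps a guest from being handed a mapping of host physical memory that lies outside the pass-through device's own BARs.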
+ */ + if (!ppt_bar_verify_mmio(ppt, hpa, len)) { + err = EINVAL; + goto done; + } + + for (uint_t i = 0; i < MAX_MMIOSEGS; i++) { + struct pptseg *seg = &ppt->mmio[i]; + + if (seg->len == 0) { + err = vm_map_mmio(vm, gpa, len, hpa); + if (err == 0) { + seg->gpa = gpa; + seg->len = len; + } + goto done; + } + } + err = ENOSPC; + +done: + releasef(pptfd); + mutex_exit(&pptdev_mtx); + return (err); +} + +static uint_t +pptintr(caddr_t arg, caddr_t unused) +{ + struct pptintr_arg *pptarg = (struct pptintr_arg *)arg; + struct pptdev *ppt = pptarg->pptdev; + + if (ppt->vm != NULL) { + lapic_intr_msi(ppt->vm, pptarg->addr, pptarg->msg_data); + } else { + /* + * XXX + * This is not expected to happen - panic? + */ + } + + /* + * For legacy interrupts give other filters a chance in case + * the interrupt was not generated by the passthrough device. + */ + return (ppt->msi.is_fixed ? DDI_INTR_UNCLAIMED : DDI_INTR_CLAIMED); +} + +int +ppt_setup_msi(struct vm *vm, int vcpu, int pptfd, uint64_t addr, uint64_t msg, + int numvec) +{ + int i, msi_count, intr_type; + struct pptdev *ppt; + int err = 0; + + if (numvec < 0 || numvec > MAX_MSIMSGS) + return (EINVAL); + + mutex_enter(&pptdev_mtx); + ppt = ppt_findf(pptfd); + if (ppt == NULL) { + mutex_exit(&pptdev_mtx); + return (EBADF); + } + if (ppt->vm != vm) { + /* Make sure we own this device */ + err = EBUSY; + goto done; + } + + /* Free any allocated resources */ + ppt_teardown_msi(ppt); + + if (numvec == 0) { + /* nothing more to do */ + goto done; + } + + if (ddi_intr_get_navail(ppt->pptd_dip, DDI_INTR_TYPE_MSI, + &msi_count) != DDI_SUCCESS) { + if (ddi_intr_get_navail(ppt->pptd_dip, DDI_INTR_TYPE_FIXED, + &msi_count) != DDI_SUCCESS) { + err = EINVAL; + goto done; + } + + intr_type = DDI_INTR_TYPE_FIXED; + ppt->msi.is_fixed = B_TRUE; + } else { + intr_type = DDI_INTR_TYPE_MSI; + } + + /* + * The device must be capable of supporting the number of vectors + * the guest wants to allocate. + */ + if (numvec > msi_count) { + err = EINVAL; + goto done; + } + + ppt->msi.inth_sz = numvec * sizeof (ddi_intr_handle_t); + ppt->msi.inth = kmem_zalloc(ppt->msi.inth_sz, KM_SLEEP); + if (ddi_intr_alloc(ppt->pptd_dip, ppt->msi.inth, intr_type, 0, + numvec, &msi_count, 0) != DDI_SUCCESS) { + kmem_free(ppt->msi.inth, ppt->msi.inth_sz); + err = EINVAL; + goto done; + } + + /* Verify that we got as many vectors as the guest requested */ + if (numvec != msi_count) { + ppt_teardown_msi(ppt); + err = EINVAL; + goto done; + } + + /* Set up & enable interrupt handler for each vector. 
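+ * If any add or enable step fails part way through, everything set up so far is torn down again once the loop exits.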
*/ + for (i = 0; i < numvec; i++) { + int res, intr_cap = 0; + + ppt->msi.num_msgs = i + 1; + ppt->msi.arg[i].pptdev = ppt; + ppt->msi.arg[i].addr = addr; + ppt->msi.arg[i].msg_data = msg + i; + + if (ddi_intr_add_handler(ppt->msi.inth[i], pptintr, + &ppt->msi.arg[i], NULL) != DDI_SUCCESS) + break; + + (void) ddi_intr_get_cap(ppt->msi.inth[i], &intr_cap); + if (intr_cap & DDI_INTR_FLAG_BLOCK) + res = ddi_intr_block_enable(&ppt->msi.inth[i], 1); + else + res = ddi_intr_enable(ppt->msi.inth[i]); + + if (res != DDI_SUCCESS) + break; + } + if (i < numvec) { + ppt_teardown_msi(ppt); + err = ENXIO; + } + +done: + releasef(pptfd); + mutex_exit(&pptdev_mtx); + return (err); +} + +int +ppt_setup_msix(struct vm *vm, int vcpu, int pptfd, int idx, uint64_t addr, + uint64_t msg, uint32_t vector_control) +{ + struct pptdev *ppt; + int numvec, alloced; + int err = 0; + + mutex_enter(&pptdev_mtx); + ppt = ppt_findf(pptfd); + if (ppt == NULL) { + mutex_exit(&pptdev_mtx); + return (EBADF); + } + /* Make sure we own this device */ + if (ppt->vm != vm) { + err = EBUSY; + goto done; + } + + /* + * First-time configuration: + * Allocate the MSI-X table + * Allocate the IRQ resources + * Set up some variables in ppt->msix + */ + if (ppt->msix.num_msgs == 0) { + dev_info_t *dip = ppt->pptd_dip; + + if (ddi_intr_get_navail(dip, DDI_INTR_TYPE_MSIX, + &numvec) != DDI_SUCCESS) { + err = EINVAL; + goto done; + } + + ppt->msix.num_msgs = numvec; + + ppt->msix.arg_sz = numvec * sizeof (ppt->msix.arg[0]); + ppt->msix.arg = kmem_zalloc(ppt->msix.arg_sz, KM_SLEEP); + ppt->msix.inth_sz = numvec * sizeof (ddi_intr_handle_t); + ppt->msix.inth = kmem_zalloc(ppt->msix.inth_sz, KM_SLEEP); + + if (ddi_intr_alloc(dip, ppt->msix.inth, DDI_INTR_TYPE_MSIX, 0, + numvec, &alloced, 0) != DDI_SUCCESS) { + kmem_free(ppt->msix.arg, ppt->msix.arg_sz); + kmem_free(ppt->msix.inth, ppt->msix.inth_sz); + ppt->msix.arg = NULL; + ppt->msix.inth = NULL; + ppt->msix.arg_sz = ppt->msix.inth_sz = 0; + err = EINVAL; + goto done; + } + + if (numvec != alloced) { + ppt_teardown_msix(ppt); + err = EINVAL; + goto done; + } + } + + if (idx >= ppt->msix.num_msgs) { + err = EINVAL; + goto done; + } + + if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { + int intr_cap, res; + + /* Tear down the IRQ if it's already set up */ + ppt_teardown_msix_intr(ppt, idx); + + ppt->msix.arg[idx].pptdev = ppt; + ppt->msix.arg[idx].addr = addr; + ppt->msix.arg[idx].msg_data = msg; + + /* Setup the MSI-X interrupt */ + if (ddi_intr_add_handler(ppt->msix.inth[idx], pptintr, + &ppt->msix.arg[idx], NULL) != DDI_SUCCESS) { + err = ENXIO; + goto done; + } + + (void) ddi_intr_get_cap(ppt->msix.inth[idx], &intr_cap); + if (intr_cap & DDI_INTR_FLAG_BLOCK) + res = ddi_intr_block_enable(&ppt->msix.inth[idx], 1); + else + res = ddi_intr_enable(ppt->msix.inth[idx]); + + if (res != DDI_SUCCESS) { + ddi_intr_remove_handler(ppt->msix.inth[idx]); + err = ENXIO; + goto done; + } + } else { + /* Masked, tear it down if it's already been set up */ + ppt_teardown_msix_intr(ppt, idx); + } + +done: + releasef(pptfd); + mutex_exit(&pptdev_mtx); + return (err); +} + +int +ppt_get_limits(struct vm *vm, int pptfd, int *msilimit, int *msixlimit) +{ + struct pptdev *ppt; + int err = 0; + + mutex_enter(&pptdev_mtx); + ppt = ppt_findf(pptfd); + if (ppt == NULL) { + mutex_exit(&pptdev_mtx); + return (EBADF); + } + if (ppt->vm != vm) { + err = EBUSY; + goto done; + } + + if (ddi_intr_get_navail(ppt->pptd_dip, DDI_INTR_TYPE_MSI, + msilimit) != DDI_SUCCESS) { + *msilimit = -1; + } + if 
(ddi_intr_get_navail(ppt->pptd_dip, DDI_INTR_TYPE_MSIX, + msixlimit) != DDI_SUCCESS) { + *msixlimit = -1; + } + +done: + releasef(pptfd); + mutex_exit(&pptdev_mtx); + return (err); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/ppt.conf b/usr/src/uts/i86pc/io/vmm/io/ppt.conf new file mode 100644 index 0000000000..698cecb6f8 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/ppt.conf @@ -0,0 +1,14 @@ +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2017 Joyent, Inc. +# + diff --git a/usr/src/uts/i86pc/io/vmm/io/ppt.h b/usr/src/uts/i86pc/io/vmm/io/ppt.h new file mode 100644 index 0000000000..979c0e18ac --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/ppt.h @@ -0,0 +1,51 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IO_PPT_H_ +#define _IO_PPT_H_ + +int ppt_unassign_all(struct vm *vm); +int ppt_map_mmio(struct vm *vm, int pptfd, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa); +int ppt_setup_msi(struct vm *vm, int vcpu, int pptfd, uint64_t addr, + uint64_t msg, int numvec); +int ppt_setup_msix(struct vm *vm, int vcpu, int pptfd, int idx, uint64_t addr, + uint64_t msg, uint32_t vector_control); +int ppt_assigned_devices(struct vm *vm); +boolean_t ppt_is_mmio(struct vm *vm, vm_paddr_t gpa); +int ppt_get_limits(struct vm *vm, int pptfd, int *msilimit, int *msixlimit); + +/* + * The following functions should never be called directly. + * Use 'vm_assign_pptdev()' and 'vm_unassign_pptdev()' instead. 
+ */ +int ppt_assign_device(struct vm *vm, int pptfd); +int ppt_unassign_device(struct vm *vm, int pptfd); +#endif diff --git a/usr/src/uts/i86pc/io/vmm/io/ppt.mapfile b/usr/src/uts/i86pc/io/vmm/io/ppt.mapfile new file mode 100644 index 0000000000..aac896e89e --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/ppt.mapfile @@ -0,0 +1,52 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION ILLUMOSprivate { + global: + # DDI Interfaces + _fini; + _init; + _info; + + # PCI pass-thru API for bhyve + ppt_assigned_devices; + ppt_is_mmio; + ppt_assign_device; + ppt_unassign_device; + ppt_unassign_all; + ppt_map_mmio; + ppt_setup_msi; + ppt_setup_msix; + ppt_get_limits; + + local: + *; +}; diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpic.c b/usr/src/uts/i86pc/io/vmm/io/vatpic.c new file mode 100644 index 0000000000..ba4cd7785e --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vatpic.c @@ -0,0 +1,810 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/queue.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/systm.h> + +#include <x86/apicreg.h> +#include <dev/ic/i8259.h> + +#include <machine/vmm.h> + +#include "vmm_ktr.h" +#include "vmm_lapic.h" +#include "vioapic.h" +#include "vatpic.h" + +static MALLOC_DEFINE(M_VATPIC, "atpic", "bhyve virtual atpic (8259)"); + +#define VATPIC_LOCK(vatpic) mtx_lock_spin(&((vatpic)->mtx)) +#define VATPIC_UNLOCK(vatpic) mtx_unlock_spin(&((vatpic)->mtx)) +#define VATPIC_LOCKED(vatpic) mtx_owned(&((vatpic)->mtx)) + +enum irqstate { + IRQSTATE_ASSERT, + IRQSTATE_DEASSERT, + IRQSTATE_PULSE +}; + +struct atpic { + bool ready; + int icw_num; + int rd_cmd_reg; + + bool aeoi; + bool poll; + bool rotate; + bool sfn; /* special fully-nested mode */ + + int irq_base; + uint8_t request; /* Interrupt Request Register (IIR) */ + uint8_t service; /* Interrupt Service (ISR) */ + uint8_t mask; /* Interrupt Mask Register (IMR) */ + uint8_t smm; /* special mask mode */ + + int acnt[8]; /* sum of pin asserts and deasserts */ + int lowprio; /* lowest priority irq */ + + bool intr_raised; +}; + +struct vatpic { + struct vm *vm; + struct mtx mtx; + struct atpic atpic[2]; + uint8_t elc[2]; +}; + +#define VATPIC_CTR0(vatpic, fmt) \ + VM_CTR0((vatpic)->vm, fmt) + +#define VATPIC_CTR1(vatpic, fmt, a1) \ + VM_CTR1((vatpic)->vm, fmt, a1) + +#define VATPIC_CTR2(vatpic, fmt, a1, a2) \ + VM_CTR2((vatpic)->vm, fmt, a1, a2) + +#define VATPIC_CTR3(vatpic, fmt, a1, a2, a3) \ + VM_CTR3((vatpic)->vm, fmt, a1, a2, a3) + +#define VATPIC_CTR4(vatpic, fmt, a1, a2, a3, a4) \ + VM_CTR4((vatpic)->vm, fmt, a1, a2, a3, a4) + +/* + * Loop over all the pins in priority order from highest to lowest. + */ +#define ATPIC_PIN_FOREACH(pinvar, atpic, tmpvar) \ + for (tmpvar = 0, pinvar = (atpic->lowprio + 1) & 0x7; \ + tmpvar < 8; \ + tmpvar++, pinvar = (pinvar + 1) & 0x7) + +static void vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate); + +static __inline bool +master_atpic(struct vatpic *vatpic, struct atpic *atpic) +{ + + if (atpic == &vatpic->atpic[0]) + return (true); + else + return (false); +} + +static __inline int +vatpic_get_highest_isrpin(struct atpic *atpic) +{ + int bit, pin; + int i; + + ATPIC_PIN_FOREACH(pin, atpic, i) { + bit = (1 << pin); + + if (atpic->service & bit) { + /* + * An IS bit that is masked by an IMR bit will not be + * cleared by a non-specific EOI in Special Mask Mode. + */ + if (atpic->smm && (atpic->mask & bit) != 0) + continue; + else + return (pin); + } + } + + return (-1); +} + +static __inline int +vatpic_get_highest_irrpin(struct atpic *atpic) +{ + int serviced; + int bit, pin, tmp; + + /* + * In 'Special Fully-Nested Mode' when an interrupt request from + * a slave is in service, the slave is not locked out from the + * master's priority logic. + */ + serviced = atpic->service; + if (atpic->sfn) + serviced &= ~(1 << 2); + + /* + * In 'Special Mask Mode', when a mask bit is set in OCW1 it inhibits + * further interrupts at that level and enables interrupts from all + * other levels that are not masked. In other words the ISR has no + * bearing on the levels that can generate interrupts. + */ + if (atpic->smm) + serviced = 0; + + ATPIC_PIN_FOREACH(pin, atpic, tmp) { + bit = 1 << pin; + + /* + * If there is already an interrupt in service at the same + * or higher priority then bail. 
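+	 * ATPIC_PIN_FOREACH walks the pins from highest to lowest
+	 * priority (starting at lowprio + 1), so once an in-service
+	 * pin is reached any remaining request would be of equal or
+	 * lower priority and must wait.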
+ */ + if ((serviced & bit) != 0) + break; + + /* + * If an interrupt is asserted and not masked then return + * the corresponding 'pin' to the caller. + */ + if ((atpic->request & bit) != 0 && (atpic->mask & bit) == 0) + return (pin); + } + + return (-1); +} + +static void +vatpic_notify_intr(struct vatpic *vatpic) +{ + struct atpic *atpic; + int pin; + + KASSERT(VATPIC_LOCKED(vatpic), ("vatpic_notify_intr not locked")); + + /* + * First check the slave. + */ + atpic = &vatpic->atpic[1]; + if (!atpic->intr_raised && + (pin = vatpic_get_highest_irrpin(atpic)) != -1) { + VATPIC_CTR4(vatpic, "atpic slave notify pin = %d " + "(imr 0x%x irr 0x%x isr 0x%x)", pin, + atpic->mask, atpic->request, atpic->service); + + /* + * Cascade the request from the slave to the master. + */ + atpic->intr_raised = true; + vatpic_set_pinstate(vatpic, 2, true); + vatpic_set_pinstate(vatpic, 2, false); + } else { + VATPIC_CTR3(vatpic, "atpic slave no eligible interrupts " + "(imr 0x%x irr 0x%x isr 0x%x)", + atpic->mask, atpic->request, atpic->service); + } + + /* + * Then check the master. + */ + atpic = &vatpic->atpic[0]; + if (!atpic->intr_raised && + (pin = vatpic_get_highest_irrpin(atpic)) != -1) { + VATPIC_CTR4(vatpic, "atpic master notify pin = %d " + "(imr 0x%x irr 0x%x isr 0x%x)", pin, + atpic->mask, atpic->request, atpic->service); + + /* + * From Section 3.6.2, "Interrupt Modes", in the + * MPtable Specification, Version 1.4 + * + * PIC interrupts are routed to both the Local APIC + * and the I/O APIC to support operation in 1 of 3 + * modes. + * + * 1. Legacy PIC Mode: the PIC effectively bypasses + * all APIC components. In this mode the local APIC is + * disabled and LINT0 is reconfigured as INTR to + * deliver the PIC interrupt directly to the CPU. + * + * 2. Virtual Wire Mode: the APIC is treated as a + * virtual wire which delivers interrupts from the PIC + * to the CPU. In this mode LINT0 is programmed as + * ExtINT to indicate that the PIC is the source of + * the interrupt. + * + * 3. Virtual Wire Mode via I/O APIC: PIC interrupts are + * fielded by the I/O APIC and delivered to the appropriate + * CPU. In this mode the I/O APIC input 0 is programmed + * as ExtINT to indicate that the PIC is the source of the + * interrupt. 
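+		 *
+		 * The code below therefore notifies the guest through
+		 * both paths: LINT0 is raised on the virtual local APICs
+		 * and I/O APIC pin 0 is pulsed, covering the virtual
+		 * wire modes described above.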
+ */ + atpic->intr_raised = true; + lapic_set_local_intr(vatpic->vm, -1, APIC_LVT_LINT0); + vioapic_pulse_irq(vatpic->vm, 0); + } else { + VATPIC_CTR3(vatpic, "atpic master no eligible interrupts " + "(imr 0x%x irr 0x%x isr 0x%x)", + atpic->mask, atpic->request, atpic->service); + } +} + +static int +vatpic_icw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic icw1 0x%x", val); + + atpic->ready = false; + + atpic->icw_num = 1; + atpic->request = 0; + atpic->mask = 0; + atpic->lowprio = 7; + atpic->rd_cmd_reg = 0; + atpic->poll = 0; + atpic->smm = 0; + + if ((val & ICW1_SNGL) != 0) { + VATPIC_CTR0(vatpic, "vatpic cascade mode required"); + return (-1); + } + + if ((val & ICW1_IC4) == 0) { + VATPIC_CTR0(vatpic, "vatpic icw4 required"); + return (-1); + } + + atpic->icw_num++; + + return (0); +} + +static int +vatpic_icw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic icw2 0x%x", val); + + atpic->irq_base = val & 0xf8; + + atpic->icw_num++; + + return (0); +} + +static int +vatpic_icw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic icw3 0x%x", val); + + atpic->icw_num++; + + return (0); +} + +static int +vatpic_icw4(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic icw4 0x%x", val); + + if ((val & ICW4_8086) == 0) { + VATPIC_CTR0(vatpic, "vatpic microprocessor mode required"); + return (-1); + } + + if ((val & ICW4_AEOI) != 0) + atpic->aeoi = true; + + if ((val & ICW4_SFNM) != 0) { + if (master_atpic(vatpic, atpic)) { + atpic->sfn = true; + } else { + VATPIC_CTR1(vatpic, "Ignoring special fully nested " + "mode on slave atpic: %#x", val); + } + } + + atpic->icw_num = 0; + atpic->ready = true; + + return (0); +} + +static int +vatpic_ocw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic ocw1 0x%x", val); + + atpic->mask = val & 0xff; + + return (0); +} + +static int +vatpic_ocw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic ocw2 0x%x", val); + + atpic->rotate = ((val & OCW2_R) != 0); + + if ((val & OCW2_EOI) != 0) { + int isr_bit; + + if ((val & OCW2_SL) != 0) { + /* specific EOI */ + isr_bit = val & 0x7; + } else { + /* non-specific EOI */ + isr_bit = vatpic_get_highest_isrpin(atpic); + } + + if (isr_bit != -1) { + atpic->service &= ~(1 << isr_bit); + + if (atpic->rotate) + atpic->lowprio = isr_bit; + } + } else if ((val & OCW2_SL) != 0 && atpic->rotate == true) { + /* specific priority */ + atpic->lowprio = val & 0x7; + } + + return (0); +} + +static int +vatpic_ocw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic ocw3 0x%x", val); + + if (val & OCW3_ESMM) { + atpic->smm = val & OCW3_SMM ? 1 : 0; + VATPIC_CTR2(vatpic, "%s atpic special mask mode %s", + master_atpic(vatpic, atpic) ? "master" : "slave", + atpic->smm ? 
"enabled" : "disabled"); + } + + if (val & OCW3_RR) { + /* read register command */ + atpic->rd_cmd_reg = val & OCW3_RIS; + + /* Polling mode */ + atpic->poll = ((val & OCW3_P) != 0); + } + + return (0); +} + +static void +vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate) +{ + struct atpic *atpic; + int oldcnt, newcnt; + bool level; + + KASSERT(pin >= 0 && pin < 16, + ("vatpic_set_pinstate: invalid pin number %d", pin)); + KASSERT(VATPIC_LOCKED(vatpic), + ("vatpic_set_pinstate: vatpic is not locked")); + + atpic = &vatpic->atpic[pin >> 3]; + + oldcnt = atpic->acnt[pin & 0x7]; + if (newstate) + atpic->acnt[pin & 0x7]++; + else + atpic->acnt[pin & 0x7]--; + newcnt = atpic->acnt[pin & 0x7]; + + if (newcnt < 0) { + VATPIC_CTR2(vatpic, "atpic pin%d: bad acnt %d", pin, newcnt); + } + + level = ((vatpic->elc[pin >> 3] & (1 << (pin & 0x7))) != 0); + + if ((oldcnt == 0 && newcnt == 1) || (newcnt > 0 && level == true)) { + /* rising edge or level */ + VATPIC_CTR1(vatpic, "atpic pin%d: asserted", pin); + atpic->request |= (1 << (pin & 0x7)); + } else if (oldcnt == 1 && newcnt == 0) { + /* falling edge */ + VATPIC_CTR1(vatpic, "atpic pin%d: deasserted", pin); + if (level) + atpic->request &= ~(1 << (pin & 0x7)); + } else { + VATPIC_CTR3(vatpic, "atpic pin%d: %s, ignored, acnt %d", + pin, newstate ? "asserted" : "deasserted", newcnt); + } + + vatpic_notify_intr(vatpic); +} + +static int +vatpic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate) +{ + struct vatpic *vatpic; + struct atpic *atpic; + + if (irq < 0 || irq > 15) + return (EINVAL); + + vatpic = vm_atpic(vm); + atpic = &vatpic->atpic[irq >> 3]; + + if (atpic->ready == false) + return (0); + + VATPIC_LOCK(vatpic); + switch (irqstate) { + case IRQSTATE_ASSERT: + vatpic_set_pinstate(vatpic, irq, true); + break; + case IRQSTATE_DEASSERT: + vatpic_set_pinstate(vatpic, irq, false); + break; + case IRQSTATE_PULSE: + vatpic_set_pinstate(vatpic, irq, true); + vatpic_set_pinstate(vatpic, irq, false); + break; + default: + panic("vatpic_set_irqstate: invalid irqstate %d", irqstate); + } + VATPIC_UNLOCK(vatpic); + + return (0); +} + +int +vatpic_assert_irq(struct vm *vm, int irq) +{ + return (vatpic_set_irqstate(vm, irq, IRQSTATE_ASSERT)); +} + +int +vatpic_deassert_irq(struct vm *vm, int irq) +{ + return (vatpic_set_irqstate(vm, irq, IRQSTATE_DEASSERT)); +} + +int +vatpic_pulse_irq(struct vm *vm, int irq) +{ + return (vatpic_set_irqstate(vm, irq, IRQSTATE_PULSE)); +} + +int +vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger) +{ + struct vatpic *vatpic; + + if (irq < 0 || irq > 15) + return (EINVAL); + + /* + * See comment in vatpic_elc_handler. These IRQs must be + * edge triggered. + */ + if (trigger == LEVEL_TRIGGER) { + switch (irq) { + case 0: + case 1: + case 2: + case 8: + case 13: + return (EINVAL); + } + } + + vatpic = vm_atpic(vm); + + VATPIC_LOCK(vatpic); + + if (trigger == LEVEL_TRIGGER) + vatpic->elc[irq >> 3] |= 1 << (irq & 0x7); + else + vatpic->elc[irq >> 3] &= ~(1 << (irq & 0x7)); + + VATPIC_UNLOCK(vatpic); + + return (0); +} + +void +vatpic_pending_intr(struct vm *vm, int *vecptr) +{ + struct vatpic *vatpic; + struct atpic *atpic; + int pin; + + vatpic = vm_atpic(vm); + + atpic = &vatpic->atpic[0]; + + VATPIC_LOCK(vatpic); + + pin = vatpic_get_highest_irrpin(atpic); + if (pin == 2) { + atpic = &vatpic->atpic[1]; + pin = vatpic_get_highest_irrpin(atpic); + } + + /* + * If there are no pins active at this moment then return the spurious + * interrupt vector instead. 
+ */ + if (pin == -1) + pin = 7; + + KASSERT(pin >= 0 && pin <= 7, ("%s: invalid pin %d", __func__, pin)); + *vecptr = atpic->irq_base + pin; + + VATPIC_UNLOCK(vatpic); +} + +static void +vatpic_pin_accepted(struct atpic *atpic, int pin) +{ + atpic->intr_raised = false; + + if (atpic->acnt[pin] == 0) + atpic->request &= ~(1 << pin); + + if (atpic->aeoi == true) { + if (atpic->rotate == true) + atpic->lowprio = pin; + } else { + atpic->service |= (1 << pin); + } +} + +void +vatpic_intr_accepted(struct vm *vm, int vector) +{ + struct vatpic *vatpic; + int pin; + + vatpic = vm_atpic(vm); + + VATPIC_LOCK(vatpic); + + pin = vector & 0x7; + + if ((vector & ~0x7) == vatpic->atpic[1].irq_base) { + vatpic_pin_accepted(&vatpic->atpic[1], pin); + /* + * If this vector originated from the slave, + * accept the cascaded interrupt too. + */ + vatpic_pin_accepted(&vatpic->atpic[0], 2); + } else { + vatpic_pin_accepted(&vatpic->atpic[0], pin); + } + + vatpic_notify_intr(vatpic); + + VATPIC_UNLOCK(vatpic); +} + +static int +vatpic_read(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, + int bytes, uint32_t *eax) +{ + int pin; + + VATPIC_LOCK(vatpic); + + if (atpic->poll) { + atpic->poll = 0; + pin = vatpic_get_highest_irrpin(atpic); + if (pin >= 0) { + vatpic_pin_accepted(atpic, pin); + *eax = 0x80 | pin; + } else { + *eax = 0; + } + } else { + if (port & ICU_IMR_OFFSET) { + /* read interrrupt mask register */ + *eax = atpic->mask; + } else { + if (atpic->rd_cmd_reg == OCW3_RIS) { + /* read interrupt service register */ + *eax = atpic->service; + } else { + /* read interrupt request register */ + *eax = atpic->request; + } + } + } + + VATPIC_UNLOCK(vatpic); + + return (0); + +} + +static int +vatpic_write(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, + int bytes, uint32_t *eax) +{ + int error; + uint8_t val; + + error = 0; + val = *eax; + + VATPIC_LOCK(vatpic); + + if (port & ICU_IMR_OFFSET) { + switch (atpic->icw_num) { + case 2: + error = vatpic_icw2(vatpic, atpic, val); + break; + case 3: + error = vatpic_icw3(vatpic, atpic, val); + break; + case 4: + error = vatpic_icw4(vatpic, atpic, val); + break; + default: + error = vatpic_ocw1(vatpic, atpic, val); + break; + } + } else { + if (val & (1 << 4)) + error = vatpic_icw1(vatpic, atpic, val); + + if (atpic->ready) { + if (val & (1 << 3)) + error = vatpic_ocw3(vatpic, atpic, val); + else + error = vatpic_ocw2(vatpic, atpic, val); + } + } + + if (atpic->ready) + vatpic_notify_intr(vatpic); + + VATPIC_UNLOCK(vatpic); + + return (error); +} + +int +vatpic_master_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpic *vatpic; + struct atpic *atpic; + + vatpic = vm_atpic(vm); + atpic = &vatpic->atpic[0]; + + if (bytes != 1) + return (-1); + + if (in) { + return (vatpic_read(vatpic, atpic, in, port, bytes, eax)); + } + + return (vatpic_write(vatpic, atpic, in, port, bytes, eax)); +} + +int +vatpic_slave_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpic *vatpic; + struct atpic *atpic; + + vatpic = vm_atpic(vm); + atpic = &vatpic->atpic[1]; + + if (bytes != 1) + return (-1); + + if (in) { + return (vatpic_read(vatpic, atpic, in, port, bytes, eax)); + } + + return (vatpic_write(vatpic, atpic, in, port, bytes, eax)); +} + +int +vatpic_elc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpic *vatpic; + bool is_master; + + vatpic = vm_atpic(vm); + is_master = (port == IO_ELCR1); + + if (bytes != 
1) + return (-1); + + VATPIC_LOCK(vatpic); + + if (in) { + if (is_master) + *eax = vatpic->elc[0]; + else + *eax = vatpic->elc[1]; + } else { + /* + * For the master PIC the cascade channel (IRQ2), the + * heart beat timer (IRQ0), and the keyboard + * controller (IRQ1) cannot be programmed for level + * mode. + * + * For the slave PIC the real time clock (IRQ8) and + * the floating point error interrupt (IRQ13) cannot + * be programmed for level mode. + */ + if (is_master) + vatpic->elc[0] = (*eax & 0xf8); + else + vatpic->elc[1] = (*eax & 0xde); + } + + VATPIC_UNLOCK(vatpic); + + return (0); +} + +struct vatpic * +vatpic_init(struct vm *vm) +{ + struct vatpic *vatpic; + + vatpic = malloc(sizeof(struct vatpic), M_VATPIC, M_WAITOK | M_ZERO); + vatpic->vm = vm; + + mtx_init(&vatpic->mtx, "vatpic lock", NULL, MTX_SPIN); + + return (vatpic); +} + +void +vatpic_cleanup(struct vatpic *vatpic) +{ + free(vatpic, M_VATPIC); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpic.h b/usr/src/uts/i86pc/io/vmm/io/vatpic.h new file mode 100644 index 0000000000..d4a1be1820 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vatpic.h @@ -0,0 +1,57 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VATPIC_H_ +#define _VATPIC_H_ + +#include <isa/isareg.h> + +#define ICU_IMR_OFFSET 1 + +#define IO_ELCR1 0x4d0 +#define IO_ELCR2 0x4d1 + +struct vatpic *vatpic_init(struct vm *vm); +void vatpic_cleanup(struct vatpic *vatpic); + +int vatpic_master_handler(struct vm *vm, int vcpuid, bool in, int port, + int bytes, uint32_t *eax); +int vatpic_slave_handler(struct vm *vm, int vcpuid, bool in, int port, + int bytes, uint32_t *eax); +int vatpic_elc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax); + +int vatpic_assert_irq(struct vm *vm, int irq); +int vatpic_deassert_irq(struct vm *vm, int irq); +int vatpic_pulse_irq(struct vm *vm, int irq); +int vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger); + +void vatpic_pending_intr(struct vm *vm, int *vecptr); +void vatpic_intr_accepted(struct vm *vm, int vector); + +#endif /* _VATPIC_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpit.c b/usr/src/uts/i86pc/io/vmm/io/vatpit.c new file mode 100644 index 0000000000..9b3e7376d5 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vatpit.c @@ -0,0 +1,487 @@ +/*- + * Copyright (c) 2018 Joyent, Inc. + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/queue.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/systm.h> + +#include <machine/vmm.h> + +#include "vmm_ktr.h" +#include "vatpic.h" +#include "vioapic.h" +#include "vatpit.h" + +static MALLOC_DEFINE(M_VATPIT, "atpit", "bhyve virtual atpit (8254)"); + +#define VATPIT_LOCK(vatpit) mtx_lock_spin(&((vatpit)->mtx)) +#define VATPIT_UNLOCK(vatpit) mtx_unlock_spin(&((vatpit)->mtx)) +#define VATPIT_LOCKED(vatpit) mtx_owned(&((vatpit)->mtx)) + +#define TIMER_SEL_MASK 0xc0 +#define TIMER_RW_MASK 0x30 +#define TIMER_MODE_MASK 0x0f +#define TIMER_SEL_READBACK 0xc0 + +#define TIMER_STS_OUT 0x80 +#define TIMER_STS_NULLCNT 0x40 + +#define TIMER_RB_LCTR 0x20 +#define TIMER_RB_LSTATUS 0x10 +#define TIMER_RB_CTR_2 0x08 +#define TIMER_RB_CTR_1 0x04 +#define TIMER_RB_CTR_0 0x02 + +#define TMR2_OUT_STS 0x20 + +#define PIT_8254_FREQ 1193182 +#define TIMER_DIV(freq, hz) (((freq) + (hz) / 2) / (hz)) + +struct vatpit_callout_arg { + struct vatpit *vatpit; + int channel_num; +}; + + +struct channel { + int mode; + uint16_t initial; /* initial counter value */ + struct bintime now_bt; /* uptime when counter was loaded */ + uint8_t cr[2]; + uint8_t ol[2]; + bool slatched; /* status latched */ + uint8_t status; + int crbyte; + int olbyte; + int frbyte; + struct callout callout; + struct bintime callout_bt; /* target time */ + struct vatpit_callout_arg callout_arg; +}; + +struct vatpit { + struct vm *vm; + struct mtx mtx; + + struct bintime freq_bt; + + struct channel channel[3]; +}; + +static void pit_timer_start_cntr0(struct vatpit *vatpit); + +static uint64_t +vatpit_delta_ticks(struct vatpit *vatpit, struct channel *c) +{ + struct bintime delta; + uint64_t result; + + binuptime(&delta); + bintime_sub(&delta, &c->now_bt); + + result = delta.sec * PIT_8254_FREQ; + result += delta.frac / vatpit->freq_bt.frac; + + return (result); +} + +static int +vatpit_get_out(struct vatpit *vatpit, int channel) +{ + struct channel *c; + uint64_t delta_ticks; + int out; + + c = &vatpit->channel[channel]; + + switch (c->mode) { + case TIMER_INTTC: + delta_ticks = vatpit_delta_ticks(vatpit, c); + out = (delta_ticks >= c->initial); + break; + default: + out = 0; + break; + } + + return (out); +} + +static void +vatpit_callout_handler(void *a) +{ + struct vatpit_callout_arg *arg = a; + struct vatpit *vatpit; + struct callout *callout; + struct channel *c; + + vatpit = arg->vatpit; + c = &vatpit->channel[arg->channel_num]; + callout = &c->callout; + + VM_CTR1(vatpit->vm, "atpit t%d fired", arg->channel_num); + + VATPIT_LOCK(vatpit); + + if (callout_pending(callout)) /* callout was reset */ + goto done; + + if (!callout_active(callout)) /* callout was stopped */ + goto done; + + callout_deactivate(callout); + + if (c->mode == TIMER_RATEGEN) { + pit_timer_start_cntr0(vatpit); + } + + vatpic_pulse_irq(vatpit->vm, 0); + vioapic_pulse_irq(vatpit->vm, 2); + +done: + VATPIT_UNLOCK(vatpit); + return; +} + +static void +pit_timer_start_cntr0(struct vatpit *vatpit) +{ + struct channel *c; + + c = &vatpit->channel[0]; + if (c->initial != 0) { + sbintime_t precision; + struct bintime now, delta; + + delta.sec = 0; + delta.frac = vatpit->freq_bt.frac * c->initial; + bintime_add(&c->callout_bt, &delta); + precision = bttosbt(delta) >> tc_precexp; + + /* + * Reset 'callout_bt' if the time that the callout was supposed + * to fire is more than 'c->initial' ticks in 
the past. + */ + binuptime(&now); + if (bintime_cmp(&c->callout_bt, &now, <)) { + c->callout_bt = now; + bintime_add(&c->callout_bt, &delta); + } + + callout_reset_sbt(&c->callout, bttosbt(c->callout_bt), + precision, vatpit_callout_handler, &c->callout_arg, + C_ABSOLUTE); + } +} + +static uint16_t +pit_update_counter(struct vatpit *vatpit, struct channel *c, bool latch) +{ + uint16_t lval; + uint64_t delta_ticks; + + /* cannot latch a new value until the old one has been consumed */ + if (latch && c->olbyte != 0) + return (0); + + if (c->initial == 0) { + /* + * This is possibly an o/s bug - reading the value of + * the timer without having set up the initial value. + * + * The original user-space version of this code set + * the timer to 100hz in this condition; do the same + * here. + */ + c->initial = TIMER_DIV(PIT_8254_FREQ, 100); + binuptime(&c->now_bt); + c->status &= ~TIMER_STS_NULLCNT; + } + + delta_ticks = vatpit_delta_ticks(vatpit, c); + lval = c->initial - delta_ticks % c->initial; + + if (latch) { + c->olbyte = 2; + c->ol[1] = lval; /* LSB */ + c->ol[0] = lval >> 8; /* MSB */ + } + + return (lval); +} + +static int +pit_readback1(struct vatpit *vatpit, int channel, uint8_t cmd) +{ + struct channel *c; + + c = &vatpit->channel[channel]; + + /* + * Latch the count/status of the timer if not already latched. + * N.B. that the count/status latch-select bits are active-low. + */ + if (!(cmd & TIMER_RB_LCTR) && !c->olbyte) { + (void) pit_update_counter(vatpit, c, true); + } + + if (!(cmd & TIMER_RB_LSTATUS) && !c->slatched) { + c->slatched = true; + /* + * For mode 0, see if the elapsed time is greater + * than the initial value - this results in the + * output pin being set to 1 in the status byte. + */ + if (c->mode == TIMER_INTTC && vatpit_get_out(vatpit, channel)) + c->status |= TIMER_STS_OUT; + else + c->status &= ~TIMER_STS_OUT; + } + + return (0); +} + +static int +pit_readback(struct vatpit *vatpit, uint8_t cmd) +{ + int error; + + /* + * The readback command can apply to all timers. + */ + error = 0; + if (cmd & TIMER_RB_CTR_0) + error = pit_readback1(vatpit, 0, cmd); + if (!error && cmd & TIMER_RB_CTR_1) + error = pit_readback1(vatpit, 1, cmd); + if (!error && cmd & TIMER_RB_CTR_2) + error = pit_readback1(vatpit, 2, cmd); + + return (error); +} + + +static int +vatpit_update_mode(struct vatpit *vatpit, uint8_t val) +{ + struct channel *c; + int sel, rw, mode; + + sel = val & TIMER_SEL_MASK; + rw = val & TIMER_RW_MASK; + mode = val & TIMER_MODE_MASK; + + if (sel == TIMER_SEL_READBACK) + return (pit_readback(vatpit, val)); + + if (rw != TIMER_LATCH && rw != TIMER_16BIT) + return (-1); + + if (rw != TIMER_LATCH) { + /* + * Counter mode is not affected when issuing a + * latch command. 
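+		 * A counter-latch command only snapshots the current
+		 * count (via the pit_update_counter() call below); the
+		 * channel keeps running in the mode it was already
+		 * programmed with.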
+ */ + if (mode != TIMER_INTTC && + mode != TIMER_RATEGEN && + mode != TIMER_SQWAVE && + mode != TIMER_SWSTROBE) + return (-1); + } + + c = &vatpit->channel[sel >> 6]; + if (rw == TIMER_LATCH) + pit_update_counter(vatpit, c, true); + else { + c->mode = mode; + c->olbyte = 0; /* reset latch after reprogramming */ + c->status |= TIMER_STS_NULLCNT; + } + + return (0); +} + +int +vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpit *vatpit; + struct channel *c; + uint8_t val; + int error; + + vatpit = vm_atpit(vm); + + if (bytes != 1) + return (-1); + + val = *eax; + + if (port == TIMER_MODE) { + if (in) { + VM_CTR0(vatpit->vm, "vatpit attempt to read mode"); + return (-1); + } + + VATPIT_LOCK(vatpit); + error = vatpit_update_mode(vatpit, val); + VATPIT_UNLOCK(vatpit); + + return (error); + } + + /* counter ports */ + KASSERT(port >= TIMER_CNTR0 && port <= TIMER_CNTR2, + ("invalid port 0x%x", port)); + c = &vatpit->channel[port - TIMER_CNTR0]; + + VATPIT_LOCK(vatpit); + if (in && c->slatched) { + /* + * Return the status byte if latched + */ + *eax = c->status; + c->slatched = false; + c->status = 0; + } else if (in) { + /* + * The spec says that once the output latch is completely + * read it should revert to "following" the counter. Use + * the free running counter for this case (i.e. Linux + * TSC calibration). Assuming the access mode is 16-bit, + * toggle the MSB/LSB bit on each read. + */ + if (c->olbyte == 0) { + uint16_t tmp; + + tmp = pit_update_counter(vatpit, c, false); + if (c->frbyte) + tmp >>= 8; + tmp &= 0xff; + *eax = tmp; + c->frbyte ^= 1; + } else + *eax = c->ol[--c->olbyte]; + } else { + c->cr[c->crbyte++] = *eax; + if (c->crbyte == 2) { + c->status &= ~TIMER_STS_NULLCNT; + c->frbyte = 0; + c->crbyte = 0; + c->initial = c->cr[0] | (uint16_t)c->cr[1] << 8; + binuptime(&c->now_bt); + /* Start an interval timer for channel 0 */ + if (port == TIMER_CNTR0) { + c->callout_bt = c->now_bt; + pit_timer_start_cntr0(vatpit); + } + if (c->initial == 0) + c->initial = 0xffff; + } + } + VATPIT_UNLOCK(vatpit); + + return (0); +} + +int +vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpit *vatpit; + + vatpit = vm_atpit(vm); + + if (in) { + VATPIT_LOCK(vatpit); + if (vatpit_get_out(vatpit, 2)) + *eax = TMR2_OUT_STS; + else + *eax = 0; + + VATPIT_UNLOCK(vatpit); + } + + return (0); +} + +struct vatpit * +vatpit_init(struct vm *vm) +{ + struct vatpit *vatpit; + struct vatpit_callout_arg *arg; + int i; + + vatpit = malloc(sizeof(struct vatpit), M_VATPIT, M_WAITOK | M_ZERO); + vatpit->vm = vm; + + mtx_init(&vatpit->mtx, "vatpit lock", NULL, MTX_SPIN); + + FREQ2BT(PIT_8254_FREQ, &vatpit->freq_bt); + + for (i = 0; i < 3; i++) { + callout_init(&vatpit->channel[i].callout, 1); + arg = &vatpit->channel[i].callout_arg; + arg->vatpit = vatpit; + arg->channel_num = i; + } + + return (vatpit); +} + +void +vatpit_cleanup(struct vatpit *vatpit) +{ + int i; + + for (i = 0; i < 3; i++) + callout_drain(&vatpit->channel[i].callout); + + free(vatpit, M_VATPIT); +} + +#ifndef __FreeBSD__ +void +vatpit_localize_resources(struct vatpit *vatpit) +{ + for (uint_t i = 0; i < 3; i++) { + /* Only localize channels which might be running */ + if (vatpit->channel[i].mode != 0) { + vmm_glue_callout_localize(&vatpit->channel[i].callout); + } + } +} +#endif /* __FreeBSD */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpit.h b/usr/src/uts/i86pc/io/vmm/io/vatpit.h new file mode 100644 index 0000000000..4bf9fe048d 
--- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vatpit.h @@ -0,0 +1,51 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VATPIT_H_ +#define _VATPIT_H_ + +#include <machine/timerreg.h> + +#define NMISC_PORT 0x61 + +struct vatpit *vatpit_init(struct vm *vm); +void vatpit_cleanup(struct vatpit *vatpit); + +int vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax); +int vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port, + int bytes, uint32_t *eax); + +#ifndef __FreeBSD__ +void vatpit_localize_resources(struct vatpit *); +#endif + +#endif /* _VATPIT_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vdev.c b/usr/src/uts/i86pc/io/vmm/io/vdev.c new file mode 100644 index 0000000000..0f835625f3 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vdev.c @@ -0,0 +1,282 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/io/vdev.c 245678 2013-01-20 03:42:49Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vdev.c 245678 2013-01-20 03:42:49Z neel $"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> + +#include "vdev.h" + +struct vdev { + SLIST_ENTRY(vdev) entry; + struct vdev_ops *ops; + void *dev; +}; +static SLIST_HEAD(, vdev) vdev_head; +static int vdev_count; + +struct vdev_region { + SLIST_ENTRY(vdev_region) entry; + struct vdev_ops *ops; + void *dev; + struct io_region *io; +}; +static SLIST_HEAD(, vdev_region) region_head; +static int region_count; + +static MALLOC_DEFINE(M_VDEV, "vdev", "vdev"); + +#define VDEV_INIT (0) +#define VDEV_RESET (1) +#define VDEV_HALT (2) + +// static const char* vdev_event_str[] = {"VDEV_INIT", "VDEV_RESET", "VDEV_HALT"}; + +static int +vdev_system_event(int event) +{ + struct vdev *vd; + int rc; + + // TODO: locking + SLIST_FOREACH(vd, &vdev_head, entry) { + // printf("%s : %s Device %s\n", __func__, vdev_event_str[event], vd->ops->name); + switch (event) { + case VDEV_INIT: + rc = vd->ops->init(vd->dev); + break; + case VDEV_RESET: + rc = vd->ops->reset(vd->dev); + break; + case VDEV_HALT: + rc = vd->ops->halt(vd->dev); + break; + default: + break; + } + if (rc) { + printf("vdev %s init failed rc=%d\n", + vd->ops->name, rc); + return rc; + } + } + return 0; +} + +int +vdev_init(void) +{ + return vdev_system_event(VDEV_INIT); +} + +int +vdev_reset(void) +{ + return vdev_system_event(VDEV_RESET); +} + +int +vdev_halt(void) +{ + return vdev_system_event(VDEV_HALT); +} + +void +vdev_vm_init(void) +{ + SLIST_INIT(&vdev_head); + vdev_count = 0; + + SLIST_INIT(®ion_head); + region_count = 0; +} +void +vdev_vm_cleanup(void) +{ + struct vdev *vd; + + // TODO: locking + while (!SLIST_EMPTY(&vdev_head)) { + vd = SLIST_FIRST(&vdev_head); + SLIST_REMOVE_HEAD(&vdev_head, entry); + free(vd, M_VDEV); + vdev_count--; + } +} + +int +vdev_register(struct vdev_ops *ops, void *dev) +{ + struct vdev *vd; + vd = malloc(sizeof(*vd), M_VDEV, M_WAITOK | M_ZERO); + vd->ops = ops; + vd->dev = dev; + + // TODO: locking + SLIST_INSERT_HEAD(&vdev_head, vd, entry); + vdev_count++; + return 0; +} + +void +vdev_unregister(void *dev) +{ + struct vdev *vd, *found; + + found = NULL; + // TODO: locking + SLIST_FOREACH(vd, &vdev_head, entry) { + if (vd->dev == dev) { + found = vd; + } + } + + if (found) { + SLIST_REMOVE(&vdev_head, found, vdev, entry); + free(found, M_VDEV); + } +} + +#define IN_RANGE(val, 
start, end) \ + (((val) >= (start)) && ((val) < (end))) + +static struct vdev_region* +vdev_find_region(struct io_region *io, void *dev) +{ + struct vdev_region *region, *found; + uint64_t region_base; + uint64_t region_end; + + found = NULL; + + // TODO: locking + // FIXME: we should verify we are in the context the current + // vcpu here as well. + SLIST_FOREACH(region, ®ion_head, entry) { + region_base = region->io->base; + region_end = region_base + region->io->len; + if (IN_RANGE(io->base, region_base, region_end) && + IN_RANGE(io->base+io->len, region_base, region_end+1) && + (dev && dev == region->dev)) { + found = region; + break; + } + } + return found; +} + +int +vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io) +{ + struct vdev_region *region; + + region = vdev_find_region(io, dev); + if (region) { + return -EEXIST; + } + + region = malloc(sizeof(*region), M_VDEV, M_WAITOK | M_ZERO); + region->io = io; + region->ops = ops; + region->dev = dev; + + // TODO: locking + SLIST_INSERT_HEAD(®ion_head, region, entry); + region_count++; + + return 0; +} + +void +vdev_unregister_region(void *dev, struct io_region *io) +{ + struct vdev_region *region; + + region = vdev_find_region(io, dev); + + if (region) { + SLIST_REMOVE(®ion_head, region, vdev_region, entry); + free(region, M_VDEV); + region_count--; + } +} + +static int +vdev_memrw(uint64_t gpa, opsize_t size, uint64_t *data, int read) +{ + struct vdev_region *region; + struct io_region io; + region_attr_t attr; + int rc; + + io.base = gpa; + io.len = size; + + region = vdev_find_region(&io, NULL); + if (!region) + return -EINVAL; + + attr = (read) ? MMIO_READ : MMIO_WRITE; + if (!(region->io->attr & attr)) + return -EPERM; + + if (read) + rc = region->ops->memread(region->dev, gpa, size, data); + else + rc = region->ops->memwrite(region->dev, gpa, size, *data); + + return rc; +} + +int +vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data) +{ + return vdev_memrw(gpa, size, data, 1); +} + +int +vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data) +{ + return vdev_memrw(gpa, size, &data, 0); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/vdev.h b/usr/src/uts/i86pc/io/vmm/io/vdev.h new file mode 100644 index 0000000000..dd2df75ad8 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vdev.h @@ -0,0 +1,96 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/io/vdev.h 245678 2013-01-20 03:42:49Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _VDEV_H_ +#define _VDEV_H_ + +typedef enum { + BYTE = 1, + WORD = 2, + DWORD = 4, + QWORD = 8, +} opsize_t; + +typedef enum { + MMIO_READ = 1, + MMIO_WRITE = 2, +} region_attr_t; + +struct io_region { + uint64_t base; + uint64_t len; + region_attr_t attr; + int vcpu; +}; + +typedef int (*vdev_init_t)(void* dev); +typedef int (*vdev_reset_t)(void* dev); +typedef int (*vdev_halt_t)(void* dev); +typedef int (*vdev_memread_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t *data); +typedef int (*vdev_memwrite_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t data); + + +struct vdev_ops { + const char *name; + vdev_init_t init; + vdev_reset_t reset; + vdev_halt_t halt; + vdev_memread_t memread; + vdev_memwrite_t memwrite; +}; + + +void vdev_vm_init(void); +void vdev_vm_cleanup(void); + +int vdev_register(struct vdev_ops *ops, void *dev); +void vdev_unregister(void *dev); + +int vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io); +void vdev_unregister_region(void *dev, struct io_region *io); + +int vdev_init(void); +int vdev_reset(void); +int vdev_halt(void); +int vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data); +int vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data); + +#endif /* _VDEV_H_ */ + diff --git a/usr/src/uts/i86pc/io/vmm/io/vhpet.c b/usr/src/uts/i86pc/io/vmm/io/vhpet.c new file mode 100644 index 0000000000..c82b4626bd --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vhpet.c @@ -0,0 +1,781 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/systm.h> + +#include <dev/acpica/acpi_hpet.h> + +#include <machine/vmm.h> +#include <machine/vmm_dev.h> + +#include "vmm_lapic.h" +#include "vatpic.h" +#include "vioapic.h" +#include "vhpet.h" + +#include "vmm_ktr.h" + +static MALLOC_DEFINE(M_VHPET, "vhpet", "bhyve virtual hpet"); + +#define HPET_FREQ 16777216 /* 16.7 (2^24) Mhz */ +#define FS_PER_S 1000000000000000ul + +/* Timer N Configuration and Capabilities Register */ +#define HPET_TCAP_RO_MASK (HPET_TCAP_INT_ROUTE | \ + HPET_TCAP_FSB_INT_DEL | \ + HPET_TCAP_SIZE | \ + HPET_TCAP_PER_INT) +/* + * HPET requires at least 3 timers and up to 32 timers per block. + */ +#define VHPET_NUM_TIMERS 8 +CTASSERT(VHPET_NUM_TIMERS >= 3 && VHPET_NUM_TIMERS <= 32); + +struct vhpet_callout_arg { + struct vhpet *vhpet; + int timer_num; +}; + +struct vhpet { + struct vm *vm; + struct mtx mtx; + sbintime_t freq_sbt; + + uint64_t config; /* Configuration */ + uint64_t isr; /* Interrupt Status */ + uint32_t countbase; /* HPET counter base value */ + sbintime_t countbase_sbt; /* uptime corresponding to base value */ + + struct { + uint64_t cap_config; /* Configuration */ + uint64_t msireg; /* FSB interrupt routing */ + uint32_t compval; /* Comparator */ + uint32_t comprate; + struct callout callout; + sbintime_t callout_sbt; /* time when counter==compval */ + struct vhpet_callout_arg arg; + } timer[VHPET_NUM_TIMERS]; +}; + +#define VHPET_LOCK(vhp) mtx_lock(&((vhp)->mtx)) +#define VHPET_UNLOCK(vhp) mtx_unlock(&((vhp)->mtx)) + +static void vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter, + sbintime_t now); + +static uint64_t +vhpet_capabilities(void) +{ + uint64_t cap = 0; + + cap |= 0x8086 << 16; /* vendor id */ + cap |= (VHPET_NUM_TIMERS - 1) << 8; /* number of timers */ + cap |= 1; /* revision */ + cap &= ~HPET_CAP_COUNT_SIZE; /* 32-bit timer */ + + cap &= 0xffffffff; + cap |= (FS_PER_S / HPET_FREQ) << 32; /* tick period in fs */ + + return (cap); +} + +static __inline bool +vhpet_counter_enabled(struct vhpet *vhpet) +{ + + return ((vhpet->config & HPET_CNF_ENABLE) ? true : false); +} + +static __inline bool +vhpet_timer_msi_enabled(struct vhpet *vhpet, int n) +{ + const uint64_t msi_enable = HPET_TCAP_FSB_INT_DEL | HPET_TCNF_FSB_EN; + + if ((vhpet->timer[n].cap_config & msi_enable) == msi_enable) + return (true); + else + return (false); +} + +static __inline int +vhpet_timer_ioapic_pin(struct vhpet *vhpet, int n) +{ + /* + * If the timer is configured to use MSI then treat it as if the + * timer is not connected to the ioapic. 
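+	 * A return value of 0 means "not routed"; callers such as
+	 * vhpet_timer_interrupt() check for it before touching the
+	 * ioapic.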
+ */ + if (vhpet_timer_msi_enabled(vhpet, n)) + return (0); + + return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ROUTE) >> 9); +} + +static uint32_t +vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr) +{ + uint32_t val; + sbintime_t now, delta; + + val = vhpet->countbase; + if (vhpet_counter_enabled(vhpet)) { + now = sbinuptime(); + delta = now - vhpet->countbase_sbt; +#ifdef __FreeBSD__ + KASSERT(delta >= 0, ("vhpet_counter: uptime went backwards: " + "%#lx to %#lx", vhpet->countbase_sbt, now)); +#else + KASSERT(delta >= 0, ("vhpet_counter: uptime went backwards: " + "%lx to %lx", vhpet->countbase_sbt, now)); +#endif + val += delta / vhpet->freq_sbt; + if (nowptr != NULL) + *nowptr = now; + } else { + /* + * The sbinuptime corresponding to the 'countbase' is + * meaningless when the counter is disabled. Make sure + * that the caller doesn't want to use it. + */ + KASSERT(nowptr == NULL, ("vhpet_counter: nowptr must be NULL")); + } + return (val); +} + +static void +vhpet_timer_clear_isr(struct vhpet *vhpet, int n) +{ + int pin; + + if (vhpet->isr & (1 << n)) { + pin = vhpet_timer_ioapic_pin(vhpet, n); + KASSERT(pin != 0, ("vhpet timer %d irq incorrectly routed", n)); + vioapic_deassert_irq(vhpet->vm, pin); + vhpet->isr &= ~(1 << n); + } +} + +static __inline bool +vhpet_periodic_timer(struct vhpet *vhpet, int n) +{ + + return ((vhpet->timer[n].cap_config & HPET_TCNF_TYPE) != 0); +} + +static __inline bool +vhpet_timer_interrupt_enabled(struct vhpet *vhpet, int n) +{ + + return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ENB) != 0); +} + +static __inline bool +vhpet_timer_edge_trig(struct vhpet *vhpet, int n) +{ + + KASSERT(!vhpet_timer_msi_enabled(vhpet, n), ("vhpet_timer_edge_trig: " + "timer %d is using MSI", n)); + + if ((vhpet->timer[n].cap_config & HPET_TCNF_INT_TYPE) == 0) + return (true); + else + return (false); +} + +static void +vhpet_timer_interrupt(struct vhpet *vhpet, int n) +{ + int pin; + + /* If interrupts are not enabled for this timer then just return. */ + if (!vhpet_timer_interrupt_enabled(vhpet, n)) + return; + + /* + * If a level triggered interrupt is already asserted then just return. + */ + if ((vhpet->isr & (1 << n)) != 0) { + VM_CTR1(vhpet->vm, "hpet t%d intr is already asserted", n); + return; + } + + if (vhpet_timer_msi_enabled(vhpet, n)) { + lapic_intr_msi(vhpet->vm, vhpet->timer[n].msireg >> 32, + vhpet->timer[n].msireg & 0xffffffff); + return; + } + + pin = vhpet_timer_ioapic_pin(vhpet, n); + if (pin == 0) { + VM_CTR1(vhpet->vm, "hpet t%d intr is not routed to ioapic", n); + return; + } + + if (vhpet_timer_edge_trig(vhpet, n)) { + vioapic_pulse_irq(vhpet->vm, pin); + } else { + vhpet->isr |= 1 << n; + vioapic_assert_irq(vhpet->vm, pin); + } +} + +static void +vhpet_adjust_compval(struct vhpet *vhpet, int n, uint32_t counter) +{ + uint32_t compval, comprate, compnext; + + KASSERT(vhpet->timer[n].comprate != 0, ("hpet t%d is not periodic", n)); + + compval = vhpet->timer[n].compval; + comprate = vhpet->timer[n].comprate; + + /* + * Calculate the comparator value to be used for the next periodic + * interrupt. + * + * This function is commonly called from the callout handler. + * In this scenario the 'counter' is ahead of 'compval'. To find + * the next value to program into the accumulator we divide the + * number space between 'compval' and 'counter' into 'comprate' + * sized units. The 'compval' is rounded up such that is "ahead" + * of 'counter'. 
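+	 *
+	 * For example, with compval = 100, comprate = 50 and
+	 * counter = 215, the expression below yields
+	 * 100 + ((215 - 100) / 50 + 1) * 50 = 250, the first
+	 * comparator value beyond 'counter'.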
+ */ + compnext = compval + ((counter - compval) / comprate + 1) * comprate; + + vhpet->timer[n].compval = compnext; +} + +static void +vhpet_handler(void *a) +{ + int n; + uint32_t counter; + sbintime_t now; + struct vhpet *vhpet; + struct callout *callout; + struct vhpet_callout_arg *arg; + + arg = a; + vhpet = arg->vhpet; + n = arg->timer_num; + callout = &vhpet->timer[n].callout; + + VM_CTR1(vhpet->vm, "hpet t%d fired", n); + + VHPET_LOCK(vhpet); + + if (callout_pending(callout)) /* callout was reset */ + goto done; + + if (!callout_active(callout)) /* callout was stopped */ + goto done; + + callout_deactivate(callout); + + if (!vhpet_counter_enabled(vhpet)) + panic("vhpet(%p) callout with counter disabled", vhpet); + + counter = vhpet_counter(vhpet, &now); + vhpet_start_timer(vhpet, n, counter, now); + vhpet_timer_interrupt(vhpet, n); +done: + VHPET_UNLOCK(vhpet); + return; +} + +static void +vhpet_stop_timer(struct vhpet *vhpet, int n, sbintime_t now) +{ + + VM_CTR1(vhpet->vm, "hpet t%d stopped", n); + callout_stop(&vhpet->timer[n].callout); + + /* + * If the callout was scheduled to expire in the past but hasn't + * had a chance to execute yet then trigger the timer interrupt + * here. Failing to do so will result in a missed timer interrupt + * in the guest. This is especially bad in one-shot mode because + * the next interrupt has to wait for the counter to wrap around. + */ + if (vhpet->timer[n].callout_sbt < now) { + VM_CTR1(vhpet->vm, "hpet t%d interrupt triggered after " + "stopping timer", n); + vhpet_timer_interrupt(vhpet, n); + } +} + +static void +vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter, sbintime_t now) +{ + sbintime_t delta, precision; + + if (vhpet->timer[n].comprate != 0) + vhpet_adjust_compval(vhpet, n, counter); + else { + /* + * In one-shot mode it is the guest's responsibility to make + * sure that the comparator value is not in the "past". The + * hardware doesn't have any belt-and-suspenders to deal with + * this so we don't either. + */ + } + + delta = (vhpet->timer[n].compval - counter) * vhpet->freq_sbt; + precision = delta >> tc_precexp; + vhpet->timer[n].callout_sbt = now + delta; + callout_reset_sbt(&vhpet->timer[n].callout, vhpet->timer[n].callout_sbt, + precision, vhpet_handler, &vhpet->timer[n].arg, C_ABSOLUTE); +} + +static void +vhpet_start_counting(struct vhpet *vhpet) +{ + int i; + + vhpet->countbase_sbt = sbinuptime(); + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + /* + * Restart the timers based on the value of the main counter + * when it stopped counting. 
+ */ + vhpet_start_timer(vhpet, i, vhpet->countbase, + vhpet->countbase_sbt); + } +} + +static void +vhpet_stop_counting(struct vhpet *vhpet, uint32_t counter, sbintime_t now) +{ + int i; + + vhpet->countbase = counter; + for (i = 0; i < VHPET_NUM_TIMERS; i++) + vhpet_stop_timer(vhpet, i, now); +} + +static __inline void +update_register(uint64_t *regptr, uint64_t data, uint64_t mask) +{ + + *regptr &= ~mask; + *regptr |= (data & mask); +} + +static void +vhpet_timer_update_config(struct vhpet *vhpet, int n, uint64_t data, + uint64_t mask) +{ + bool clear_isr; + int old_pin, new_pin; + uint32_t allowed_irqs; + uint64_t oldval, newval; + + if (vhpet_timer_msi_enabled(vhpet, n) || + vhpet_timer_edge_trig(vhpet, n)) { + if (vhpet->isr & (1 << n)) + panic("vhpet timer %d isr should not be asserted", n); + } + old_pin = vhpet_timer_ioapic_pin(vhpet, n); + oldval = vhpet->timer[n].cap_config; + + newval = oldval; + update_register(&newval, data, mask); + newval &= ~(HPET_TCAP_RO_MASK | HPET_TCNF_32MODE); + newval |= oldval & HPET_TCAP_RO_MASK; + + if (newval == oldval) + return; + + vhpet->timer[n].cap_config = newval; + VM_CTR2(vhpet->vm, "hpet t%d cap_config set to 0x%016x", n, newval); + + /* + * Validate the interrupt routing in the HPET_TCNF_INT_ROUTE field. + * If it does not match the bits set in HPET_TCAP_INT_ROUTE then set + * it to the default value of 0. + */ + allowed_irqs = vhpet->timer[n].cap_config >> 32; + new_pin = vhpet_timer_ioapic_pin(vhpet, n); + if (new_pin != 0 && (allowed_irqs & (1 << new_pin)) == 0) { + VM_CTR3(vhpet->vm, "hpet t%d configured invalid irq %d, " + "allowed_irqs 0x%08x", n, new_pin, allowed_irqs); + new_pin = 0; + vhpet->timer[n].cap_config &= ~HPET_TCNF_INT_ROUTE; + } + + if (!vhpet_periodic_timer(vhpet, n)) + vhpet->timer[n].comprate = 0; + + /* + * If the timer's ISR bit is set then clear it in the following cases: + * - interrupt is disabled + * - interrupt type is changed from level to edge or fsb. + * - interrupt routing is changed + * + * This is to ensure that this timer's level triggered interrupt does + * not remain asserted forever. 
+ */ + if (vhpet->isr & (1 << n)) { + KASSERT(old_pin != 0, ("timer %d isr asserted to ioapic pin %d", + n, old_pin)); + if (!vhpet_timer_interrupt_enabled(vhpet, n)) + clear_isr = true; + else if (vhpet_timer_msi_enabled(vhpet, n)) + clear_isr = true; + else if (vhpet_timer_edge_trig(vhpet, n)) + clear_isr = true; + else if (vhpet_timer_ioapic_pin(vhpet, n) != old_pin) + clear_isr = true; + else + clear_isr = false; + + if (clear_isr) { + VM_CTR1(vhpet->vm, "hpet t%d isr cleared due to " + "configuration change", n); + vioapic_deassert_irq(vhpet->vm, old_pin); + vhpet->isr &= ~(1 << n); + } + } +} + +int +vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val, int size, + void *arg) +{ + struct vhpet *vhpet; + uint64_t data, mask, oldval, val64; + uint32_t isr_clear_mask, old_compval, old_comprate, counter; + sbintime_t now, *nowptr; + int i, offset; + + vhpet = vm_hpet(vm); + offset = gpa - VHPET_BASE; + + VHPET_LOCK(vhpet); + + /* Accesses to the HPET should be 4 or 8 bytes wide */ + switch (size) { + case 8: + mask = 0xffffffffffffffff; + data = val; + break; + case 4: + mask = 0xffffffff; + data = val; + if ((offset & 0x4) != 0) { + mask <<= 32; + data <<= 32; + } + break; + default: + VM_CTR2(vhpet->vm, "hpet invalid mmio write: " + "offset 0x%08x, size %d", offset, size); + goto done; + } + + /* Access to the HPET should be naturally aligned to its width */ + if (offset & (size - 1)) { + VM_CTR2(vhpet->vm, "hpet invalid mmio write: " + "offset 0x%08x, size %d", offset, size); + goto done; + } + + if (offset == HPET_CONFIG || offset == HPET_CONFIG + 4) { + /* + * Get the most recent value of the counter before updating + * the 'config' register. If the HPET is going to be disabled + * then we need to update 'countbase' with the value right + * before it is disabled. + */ + nowptr = vhpet_counter_enabled(vhpet) ? &now : NULL; + counter = vhpet_counter(vhpet, nowptr); + oldval = vhpet->config; + update_register(&vhpet->config, data, mask); + + /* + * LegacyReplacement Routing is not supported so clear the + * bit explicitly. 
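+	 * (Because the bit is stripped on every config write, guests
+	 * always read HPET_CNF_LEG_RT back as zero.)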
+ */ + vhpet->config &= ~HPET_CNF_LEG_RT; + + if ((oldval ^ vhpet->config) & HPET_CNF_ENABLE) { + if (vhpet_counter_enabled(vhpet)) { + vhpet_start_counting(vhpet); + VM_CTR0(vhpet->vm, "hpet enabled"); + } else { + vhpet_stop_counting(vhpet, counter, now); + VM_CTR0(vhpet->vm, "hpet disabled"); + } + } + goto done; + } + + if (offset == HPET_ISR || offset == HPET_ISR + 4) { + isr_clear_mask = vhpet->isr & data; + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + if ((isr_clear_mask & (1 << i)) != 0) { + VM_CTR1(vhpet->vm, "hpet t%d isr cleared", i); + vhpet_timer_clear_isr(vhpet, i); + } + } + goto done; + } + + if (offset == HPET_MAIN_COUNTER || offset == HPET_MAIN_COUNTER + 4) { + /* Zero-extend the counter to 64-bits before updating it */ + val64 = vhpet_counter(vhpet, NULL); + update_register(&val64, data, mask); + vhpet->countbase = val64; + if (vhpet_counter_enabled(vhpet)) + vhpet_start_counting(vhpet); + goto done; + } + + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + if (offset == HPET_TIMER_CAP_CNF(i) || + offset == HPET_TIMER_CAP_CNF(i) + 4) { + vhpet_timer_update_config(vhpet, i, data, mask); + break; + } + + if (offset == HPET_TIMER_COMPARATOR(i) || + offset == HPET_TIMER_COMPARATOR(i) + 4) { + old_compval = vhpet->timer[i].compval; + old_comprate = vhpet->timer[i].comprate; + if (vhpet_periodic_timer(vhpet, i)) { + /* + * In periodic mode writes to the comparator + * change the 'compval' register only if the + * HPET_TCNF_VAL_SET bit is set in the config + * register. + */ + val64 = vhpet->timer[i].comprate; + update_register(&val64, data, mask); + vhpet->timer[i].comprate = val64; + if ((vhpet->timer[i].cap_config & + HPET_TCNF_VAL_SET) != 0) { + vhpet->timer[i].compval = val64; + } + } else { + KASSERT(vhpet->timer[i].comprate == 0, + ("vhpet one-shot timer %d has invalid " + "rate %u", i, vhpet->timer[i].comprate)); + val64 = vhpet->timer[i].compval; + update_register(&val64, data, mask); + vhpet->timer[i].compval = val64; + } + vhpet->timer[i].cap_config &= ~HPET_TCNF_VAL_SET; + + if (vhpet->timer[i].compval != old_compval || + vhpet->timer[i].comprate != old_comprate) { + if (vhpet_counter_enabled(vhpet)) { + counter = vhpet_counter(vhpet, &now); + vhpet_start_timer(vhpet, i, counter, + now); + } + } + break; + } + + if (offset == HPET_TIMER_FSB_VAL(i) || + offset == HPET_TIMER_FSB_ADDR(i)) { + update_register(&vhpet->timer[i].msireg, data, mask); + break; + } + } +done: + VHPET_UNLOCK(vhpet); + return (0); +} + +int +vhpet_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval, int size, + void *arg) +{ + int i, offset; + struct vhpet *vhpet; + uint64_t data; + + vhpet = vm_hpet(vm); + offset = gpa - VHPET_BASE; + + VHPET_LOCK(vhpet); + + /* Accesses to the HPET should be 4 or 8 bytes wide */ + if (size != 4 && size != 8) { + VM_CTR2(vhpet->vm, "hpet invalid mmio read: " + "offset 0x%08x, size %d", offset, size); + data = 0; + goto done; + } + + /* Access to the HPET should be naturally aligned to its width */ + if (offset & (size - 1)) { + VM_CTR2(vhpet->vm, "hpet invalid mmio read: " + "offset 0x%08x, size %d", offset, size); + data = 0; + goto done; + } + + if (offset == HPET_CAPABILITIES || offset == HPET_CAPABILITIES + 4) { + data = vhpet_capabilities(); + goto done; + } + + if (offset == HPET_CONFIG || offset == HPET_CONFIG + 4) { + data = vhpet->config; + goto done; + } + + if (offset == HPET_ISR || offset == HPET_ISR + 4) { + data = vhpet->isr; + goto done; + } + + if (offset == HPET_MAIN_COUNTER || offset == HPET_MAIN_COUNTER + 4) { + data = vhpet_counter(vhpet, 
NULL); + goto done; + } + + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + if (offset == HPET_TIMER_CAP_CNF(i) || + offset == HPET_TIMER_CAP_CNF(i) + 4) { + data = vhpet->timer[i].cap_config; + break; + } + + if (offset == HPET_TIMER_COMPARATOR(i) || + offset == HPET_TIMER_COMPARATOR(i) + 4) { + data = vhpet->timer[i].compval; + break; + } + + if (offset == HPET_TIMER_FSB_VAL(i) || + offset == HPET_TIMER_FSB_ADDR(i)) { + data = vhpet->timer[i].msireg; + break; + } + } + + if (i >= VHPET_NUM_TIMERS) + data = 0; +done: + VHPET_UNLOCK(vhpet); + + if (size == 4) { + if (offset & 0x4) + data >>= 32; + } + *rval = data; + return (0); +} + +struct vhpet * +vhpet_init(struct vm *vm) +{ + int i, pincount; + struct vhpet *vhpet; + uint64_t allowed_irqs; + struct vhpet_callout_arg *arg; + struct bintime bt; + + vhpet = malloc(sizeof(struct vhpet), M_VHPET, M_WAITOK | M_ZERO); + vhpet->vm = vm; + mtx_init(&vhpet->mtx, "vhpet lock", NULL, MTX_DEF); + + FREQ2BT(HPET_FREQ, &bt); + vhpet->freq_sbt = bttosbt(bt); + + pincount = vioapic_pincount(vm); + if (pincount >= 32) + allowed_irqs = 0xff000000; /* irqs 24-31 */ + else if (pincount >= 20) + allowed_irqs = 0xf << (pincount - 4); /* 4 upper irqs */ + else + allowed_irqs = 0; + + /* + * Initialize HPET timer hardware state. + */ + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + vhpet->timer[i].cap_config = allowed_irqs << 32; + vhpet->timer[i].cap_config |= HPET_TCAP_PER_INT; + vhpet->timer[i].cap_config |= HPET_TCAP_FSB_INT_DEL; + + vhpet->timer[i].compval = 0xffffffff; + callout_init(&vhpet->timer[i].callout, 1); + + arg = &vhpet->timer[i].arg; + arg->vhpet = vhpet; + arg->timer_num = i; + } + + return (vhpet); +} + +void +vhpet_cleanup(struct vhpet *vhpet) +{ + int i; + + for (i = 0; i < VHPET_NUM_TIMERS; i++) + callout_drain(&vhpet->timer[i].callout); + + free(vhpet, M_VHPET); +} + +int +vhpet_getcap(struct vm_hpet_cap *cap) +{ + + cap->capabilities = vhpet_capabilities(); + return (0); +} +#ifndef __FreeBSD__ +void +vhpet_localize_resources(struct vhpet *vhpet) +{ + for (uint_t i = 0; i < VHPET_NUM_TIMERS; i++) { + vmm_glue_callout_localize(&vhpet->timer[i].callout); + } +} +#endif /* __FreeBSD */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vhpet.h b/usr/src/uts/i86pc/io/vmm/io/vhpet.h new file mode 100644 index 0000000000..8e28241b32 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vhpet.h @@ -0,0 +1,54 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _VHPET_H_ +#define _VHPET_H_ + +#define VHPET_BASE 0xfed00000 +#define VHPET_SIZE 1024 + +struct vhpet *vhpet_init(struct vm *vm); +void vhpet_cleanup(struct vhpet *vhpet); +int vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val, + int size, void *arg); +int vhpet_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *val, + int size, void *arg); +int vhpet_getcap(struct vm_hpet_cap *cap); + +#ifndef __FreeBSD__ +void vhpet_localize_resources(struct vhpet *vhpet); +#endif + +#endif /* _VHPET_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vioapic.c b/usr/src/uts/i86pc/io/vmm/io/vioapic.c new file mode 100644 index 0000000000..dbd3420420 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vioapic.c @@ -0,0 +1,602 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/queue.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/cpuset.h> + +#include <x86/apicreg.h> +#include <machine/vmm.h> + +#include "vmm_ktr.h" +#include "vmm_lapic.h" +#include "vlapic.h" +#include "vioapic.h" + +#define IOREGSEL 0x00 +#define IOWIN 0x10 + +#define REDIR_ENTRIES 32 +#define RTBL_RO_BITS ((uint64_t)(IOART_REM_IRR | IOART_DELIVS)) + +struct vioapic { + struct vm *vm; + struct mtx mtx; + uint32_t id; + uint32_t ioregsel; + struct { + uint64_t reg; + int acnt; /* sum of pin asserts (+1) and deasserts (-1) */ + } rtbl[REDIR_ENTRIES]; +}; + +#define VIOAPIC_LOCK(vioapic) mtx_lock_spin(&((vioapic)->mtx)) +#define VIOAPIC_UNLOCK(vioapic) mtx_unlock_spin(&((vioapic)->mtx)) +#define VIOAPIC_LOCKED(vioapic) mtx_owned(&((vioapic)->mtx)) + +static MALLOC_DEFINE(M_VIOAPIC, "vioapic", "bhyve virtual ioapic"); + +#define VIOAPIC_CTR1(vioapic, fmt, a1) \ + VM_CTR1((vioapic)->vm, fmt, a1) + +#define VIOAPIC_CTR2(vioapic, fmt, a1, a2) \ + VM_CTR2((vioapic)->vm, fmt, a1, a2) + +#define VIOAPIC_CTR3(vioapic, fmt, a1, a2, a3) \ + VM_CTR3((vioapic)->vm, fmt, a1, a2, a3) + +#define VIOAPIC_CTR4(vioapic, fmt, a1, a2, a3, a4) \ + VM_CTR4((vioapic)->vm, fmt, a1, a2, a3, a4) + +#ifdef KTR +static const char * +pinstate_str(bool asserted) +{ + + if (asserted) + return ("asserted"); + else + return ("deasserted"); +} +#endif + +static void +vioapic_send_intr(struct vioapic *vioapic, int pin) +{ + int vector, delmode; + uint32_t low, high, dest; + bool level, phys; + + KASSERT(pin >= 0 && pin < REDIR_ENTRIES, + ("vioapic_set_pinstate: invalid pin number %d", pin)); + + KASSERT(VIOAPIC_LOCKED(vioapic), + ("vioapic_set_pinstate: vioapic is not locked")); + + low = vioapic->rtbl[pin].reg; + high = vioapic->rtbl[pin].reg >> 32; + + if ((low & IOART_INTMASK) == IOART_INTMSET) { + VIOAPIC_CTR1(vioapic, "ioapic pin%d: masked", pin); + return; + } + + phys = ((low & IOART_DESTMOD) == IOART_DESTPHY); + delmode = low & IOART_DELMOD; + level = low & IOART_TRGRLVL ? 
true : false; + if (level) + vioapic->rtbl[pin].reg |= IOART_REM_IRR; + + vector = low & IOART_INTVEC; + dest = high >> APIC_ID_SHIFT; + vlapic_deliver_intr(vioapic->vm, level, dest, phys, delmode, vector); +} + +static void +vioapic_set_pinstate(struct vioapic *vioapic, int pin, bool newstate) +{ + int oldcnt, newcnt; + bool needintr; + + KASSERT(pin >= 0 && pin < REDIR_ENTRIES, + ("vioapic_set_pinstate: invalid pin number %d", pin)); + + KASSERT(VIOAPIC_LOCKED(vioapic), + ("vioapic_set_pinstate: vioapic is not locked")); + + oldcnt = vioapic->rtbl[pin].acnt; + if (newstate) + vioapic->rtbl[pin].acnt++; + else + vioapic->rtbl[pin].acnt--; + newcnt = vioapic->rtbl[pin].acnt; + + if (newcnt < 0) { + VIOAPIC_CTR2(vioapic, "ioapic pin%d: bad acnt %d", + pin, newcnt); + } + + needintr = false; + if (oldcnt == 0 && newcnt == 1) { + needintr = true; + VIOAPIC_CTR1(vioapic, "ioapic pin%d: asserted", pin); + } else if (oldcnt == 1 && newcnt == 0) { + VIOAPIC_CTR1(vioapic, "ioapic pin%d: deasserted", pin); + } else { + VIOAPIC_CTR3(vioapic, "ioapic pin%d: %s, ignored, acnt %d", + pin, pinstate_str(newstate), newcnt); + } + + if (needintr) + vioapic_send_intr(vioapic, pin); +} + +enum irqstate { + IRQSTATE_ASSERT, + IRQSTATE_DEASSERT, + IRQSTATE_PULSE +}; + +static int +vioapic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate) +{ + struct vioapic *vioapic; + + if (irq < 0 || irq >= REDIR_ENTRIES) + return (EINVAL); + + vioapic = vm_ioapic(vm); + + VIOAPIC_LOCK(vioapic); + switch (irqstate) { + case IRQSTATE_ASSERT: + vioapic_set_pinstate(vioapic, irq, true); + break; + case IRQSTATE_DEASSERT: + vioapic_set_pinstate(vioapic, irq, false); + break; + case IRQSTATE_PULSE: + vioapic_set_pinstate(vioapic, irq, true); + vioapic_set_pinstate(vioapic, irq, false); + break; + default: + panic("vioapic_set_irqstate: invalid irqstate %d", irqstate); + } + VIOAPIC_UNLOCK(vioapic); + + return (0); +} + +int +vioapic_assert_irq(struct vm *vm, int irq) +{ + + return (vioapic_set_irqstate(vm, irq, IRQSTATE_ASSERT)); +} + +int +vioapic_deassert_irq(struct vm *vm, int irq) +{ + + return (vioapic_set_irqstate(vm, irq, IRQSTATE_DEASSERT)); +} + +int +vioapic_pulse_irq(struct vm *vm, int irq) +{ + + return (vioapic_set_irqstate(vm, irq, IRQSTATE_PULSE)); +} + +#define REDIR_IS_PHYS(reg) (((reg) & IOART_DESTMOD) == IOART_DESTPHY) +#define REDIR_IS_LOWPRIO(reg) (((reg) & IOART_DELMOD) == IOART_DELLOPRI) +/* Level-triggered interrupts only valid in fixed and low-priority modes */ +#define REDIR_IS_LVLTRIG(reg) \ + (((reg) & IOART_TRGRLVL) != 0 && \ + (((reg) & IOART_DELMOD) == IOART_DELFIXED || REDIR_IS_LOWPRIO(reg))) +#define REDIR_DEST(reg) ((reg) >> (32 + APIC_ID_SHIFT)) +#define REDIR_VECTOR(reg) ((reg) & IOART_INTVEC) + +/* + * Given a redirection entry, determine which vCPUs would be targeted. + */ +static void +vioapic_calcdest(struct vioapic *vioapic, uint64_t redir_ent, cpuset_t *dmask) +{ + + /* + * When calculating interrupt destinations with vlapic_calcdest(), the + * legacy xAPIC format is assumed, since the system lacks interrupt + * redirection hardware. + * See vlapic_deliver_intr() for more details. + */ + vlapic_calcdest(vioapic->vm, dmask, REDIR_DEST(redir_ent), + REDIR_IS_PHYS(redir_ent), REDIR_IS_LOWPRIO(redir_ent), false); +} + +/* + * Across all redirection entries utilizing a specified vector, determine the + * set of vCPUs which would be targeted by a level-triggered interrupt. 
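+ * A vector of 0 is treated as "unused" and always yields an empty set.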
+ */ +static void +vioapic_tmr_active(struct vioapic *vioapic, uint8_t vec, cpuset_t *result) +{ + u_int i; + + CPU_ZERO(result); + if (vec == 0) { + return; + } + + for (i = 0; i < REDIR_ENTRIES; i++) { + cpuset_t dest; + const uint64_t val = vioapic->rtbl[i].reg; + + if (!REDIR_IS_LVLTRIG(val) || REDIR_VECTOR(val) != vec) { + continue; + } + + CPU_ZERO(&dest); + vioapic_calcdest(vioapic, val, &dest); + CPU_OR(result, &dest); + } +} + +/* + * Update TMR state in vLAPICs after changes to vIOAPIC pin configuration + */ +static void +vioapic_update_tmrs(struct vioapic *vioapic, int vcpuid, uint64_t oldval, + uint64_t newval) +{ + cpuset_t active, allset, newset, oldset; + struct vm *vm; + uint8_t newvec, oldvec; + + vm = vioapic->vm; + CPU_ZERO(&allset); + CPU_ZERO(&newset); + CPU_ZERO(&oldset); + newvec = oldvec = 0; + + if (REDIR_IS_LVLTRIG(oldval)) { + vioapic_calcdest(vioapic, oldval, &oldset); + CPU_OR(&allset, &oldset); + oldvec = REDIR_VECTOR(oldval); + } + + if (REDIR_IS_LVLTRIG(newval)) { + vioapic_calcdest(vioapic, newval, &newset); + CPU_OR(&allset, &newset); + newvec = REDIR_VECTOR(newval); + } + + if (CPU_EMPTY(&allset) || + (CPU_CMP(&oldset, &newset) == 0 && oldvec == newvec)) { + return; + } + + /* + * Since the write to the redirection table has already occurred, a + * scan of level-triggered entries referencing the old vector will find + * only entries which are now currently valid. + */ + vioapic_tmr_active(vioapic, oldvec, &active); + + while (!CPU_EMPTY(&allset)) { + struct vlapic *vlapic; + u_int i; + + i = CPU_FFS(&allset) - 1; + CPU_CLR(i, &allset); + + if (oldvec == newvec && + CPU_ISSET(i, &oldset) && CPU_ISSET(i, &newset)) { + continue; + } + + if (i != vcpuid) { + vcpu_block_run(vm, i); + } + + vlapic = vm_lapic(vm, i); + if (CPU_ISSET(i, &oldset)) { + /* + * Perform the deassertion if no other level-triggered + * IOAPIC entries target this vCPU with the old vector + * + * Note: Sharing of vectors like that should be + * extremely rare in modern operating systems and was + * previously unsupported by the bhyve vIOAPIC. 
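+	 * (Each target vCPU, other than the one performing this write,
+	 * is held via vcpu_block_run() while its TMR is changed.)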
+ */ + if (!CPU_ISSET(i, &active)) { + vlapic_tmr_set(vlapic, oldvec, false); + } + } + if (CPU_ISSET(i, &newset)) { + vlapic_tmr_set(vlapic, newvec, true); + } + + if (i != vcpuid) { + vcpu_unblock_run(vm, i); + } + } +} + +static uint32_t +vioapic_read(struct vioapic *vioapic, int vcpuid, uint32_t addr) +{ + int regnum, pin, rshift; + + regnum = addr & 0xff; + switch (regnum) { + case IOAPIC_ID: + return (vioapic->id); + break; + case IOAPIC_VER: + return (((REDIR_ENTRIES - 1) << MAXREDIRSHIFT) | 0x11); + break; + case IOAPIC_ARB: + return (vioapic->id); + break; + default: + break; + } + + /* redirection table entries */ + if (regnum >= IOAPIC_REDTBL && + regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) { + pin = (regnum - IOAPIC_REDTBL) / 2; + if ((regnum - IOAPIC_REDTBL) % 2) + rshift = 32; + else + rshift = 0; + + return (vioapic->rtbl[pin].reg >> rshift); + } + + return (0); +} + +static void +vioapic_write(struct vioapic *vioapic, int vcpuid, uint32_t addr, uint32_t data) +{ + uint64_t data64, mask64; + uint64_t last, changed; + int regnum, pin, lshift; + + regnum = addr & 0xff; + switch (regnum) { + case IOAPIC_ID: + vioapic->id = data & APIC_ID_MASK; + break; + case IOAPIC_VER: + case IOAPIC_ARB: + /* readonly */ + break; + default: + break; + } + + /* redirection table entries */ + if (regnum >= IOAPIC_REDTBL && + regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) { + pin = (regnum - IOAPIC_REDTBL) / 2; + if ((regnum - IOAPIC_REDTBL) % 2) + lshift = 32; + else + lshift = 0; + + last = vioapic->rtbl[pin].reg; + + data64 = (uint64_t)data << lshift; + mask64 = (uint64_t)0xffffffff << lshift; + vioapic->rtbl[pin].reg &= ~mask64 | RTBL_RO_BITS; + vioapic->rtbl[pin].reg |= data64 & ~RTBL_RO_BITS; + + VIOAPIC_CTR2(vioapic, "ioapic pin%d: redir table entry %#lx", + pin, vioapic->rtbl[pin].reg); + + /* + * If any fields in the redirection table entry (except mask + * or polarity) have changed then update the trigger-mode + * registers on all the vlapics. + */ + changed = last ^ vioapic->rtbl[pin].reg; + if (changed & ~(IOART_INTMASK | IOART_INTPOL)) { + VIOAPIC_CTR1(vioapic, "ioapic pin%d: recalculate " + "vlapic trigger-mode register", pin); + vioapic_update_tmrs(vioapic, vcpuid, last, + vioapic->rtbl[pin].reg); + } + + /* + * Generate an interrupt if the following conditions are met: + * - pin is not masked + * - previous interrupt has been EOIed + * - pin level is asserted + */ + if ((vioapic->rtbl[pin].reg & IOART_INTMASK) == IOART_INTMCLR && + (vioapic->rtbl[pin].reg & IOART_REM_IRR) == 0 && + (vioapic->rtbl[pin].acnt > 0)) { + VIOAPIC_CTR2(vioapic, "ioapic pin%d: asserted at rtbl " + "write, acnt %d", pin, vioapic->rtbl[pin].acnt); + vioapic_send_intr(vioapic, pin); + } + } +} + +static int +vioapic_mmio_rw(struct vioapic *vioapic, int vcpuid, uint64_t gpa, + uint64_t *data, int size, bool doread) +{ + uint64_t offset; + + offset = gpa - VIOAPIC_BASE; + + /* + * The IOAPIC specification allows 32-bit wide accesses to the + * IOREGSEL (offset 0) and IOWIN (offset 16) registers. 
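+	 * Any other access size or offset is ignored: writes are dropped
+	 * and reads return 0.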
+ */ + if (size != 4 || (offset != IOREGSEL && offset != IOWIN)) { + if (doread) + *data = 0; + return (0); + } + + VIOAPIC_LOCK(vioapic); + if (offset == IOREGSEL) { + if (doread) + *data = vioapic->ioregsel; + else + vioapic->ioregsel = *data; + } else { + if (doread) { + *data = vioapic_read(vioapic, vcpuid, + vioapic->ioregsel); + } else { + vioapic_write(vioapic, vcpuid, vioapic->ioregsel, + *data); + } + } + VIOAPIC_UNLOCK(vioapic); + + return (0); +} + +int +vioapic_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval, + int size, void *arg) +{ + int error; + struct vioapic *vioapic; + + vioapic = vm_ioapic(vm); + error = vioapic_mmio_rw(vioapic, vcpuid, gpa, rval, size, true); + return (error); +} + +int +vioapic_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t wval, + int size, void *arg) +{ + int error; + struct vioapic *vioapic; + + vioapic = vm_ioapic(vm); + error = vioapic_mmio_rw(vioapic, vcpuid, gpa, &wval, size, false); + return (error); +} + +void +vioapic_process_eoi(struct vm *vm, int vcpuid, int vector) +{ + struct vioapic *vioapic; + int pin; + + KASSERT(vector >= 0 && vector < 256, + ("vioapic_process_eoi: invalid vector %d", vector)); + + vioapic = vm_ioapic(vm); + VIOAPIC_CTR1(vioapic, "ioapic processing eoi for vector %d", vector); + + /* + * XXX keep track of the pins associated with this vector instead + * of iterating on every single pin each time. + */ + VIOAPIC_LOCK(vioapic); + for (pin = 0; pin < REDIR_ENTRIES; pin++) { + if ((vioapic->rtbl[pin].reg & IOART_REM_IRR) == 0) + continue; + if ((vioapic->rtbl[pin].reg & IOART_INTVEC) != vector) + continue; + vioapic->rtbl[pin].reg &= ~IOART_REM_IRR; + if (vioapic->rtbl[pin].acnt > 0) { + VIOAPIC_CTR2(vioapic, "ioapic pin%d: asserted at eoi, " + "acnt %d", pin, vioapic->rtbl[pin].acnt); + vioapic_send_intr(vioapic, pin); + } + } + VIOAPIC_UNLOCK(vioapic); +} + +struct vioapic * +vioapic_init(struct vm *vm) +{ + int i; + struct vioapic *vioapic; + + vioapic = malloc(sizeof(struct vioapic), M_VIOAPIC, M_WAITOK | M_ZERO); + + vioapic->vm = vm; + mtx_init(&vioapic->mtx, "vioapic lock", NULL, MTX_SPIN); + + /* Initialize all redirection entries to mask all interrupts */ + for (i = 0; i < REDIR_ENTRIES; i++) + vioapic->rtbl[i].reg = 0x0001000000010000UL; + + return (vioapic); +} + +void +vioapic_cleanup(struct vioapic *vioapic) +{ + + free(vioapic, M_VIOAPIC); +} + +int +vioapic_pincount(struct vm *vm) +{ + + return (REDIR_ENTRIES); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/vioapic.h b/usr/src/uts/i86pc/io/vmm/io/vioapic.h new file mode 100644 index 0000000000..6bf3e80e05 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vioapic.h @@ -0,0 +1,64 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _VIOAPIC_H_ +#define _VIOAPIC_H_ + +#define VIOAPIC_BASE 0xFEC00000 +#define VIOAPIC_SIZE 4096 + +struct vioapic *vioapic_init(struct vm *vm); +void vioapic_cleanup(struct vioapic *vioapic); + +int vioapic_assert_irq(struct vm *vm, int irq); +int vioapic_deassert_irq(struct vm *vm, int irq); +int vioapic_pulse_irq(struct vm *vm, int irq); + +int vioapic_mmio_write(void *vm, int vcpuid, uint64_t gpa, + uint64_t wval, int size, void *arg); +int vioapic_mmio_read(void *vm, int vcpuid, uint64_t gpa, + uint64_t *rval, int size, void *arg); + +int vioapic_pincount(struct vm *vm); +void vioapic_process_eoi(struct vm *vm, int vcpuid, int vector); +#endif diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.c b/usr/src/uts/i86pc/io/vmm/io/vlapic.c new file mode 100644 index 0000000000..4e58249c8d --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.c @@ -0,0 +1,1705 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * Copyright (c) 2019 Joyent, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/systm.h> +#include <sys/smp.h> + +#include <x86/specialreg.h> +#include <x86/apicreg.h> + +#include <machine/clock.h> +#include <machine/smp.h> + +#include <machine/vmm.h> + +#include "vmm_lapic.h" +#include "vmm_ktr.h" +#include "vmm_stat.h" + +#include "vlapic.h" +#include "vlapic_priv.h" +#include "vioapic.h" + +#define PRIO(x) ((x) >> 4) + +#define VLAPIC_VERSION (16) + +#define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0) + +/* + * The 'vlapic->timer_mtx' is used to provide mutual exclusion between the + * vlapic_callout_handler() and vcpu accesses to: + * - timer_freq_bt, timer_period_bt, timer_fire_bt + * - timer LVT register + */ +#define VLAPIC_TIMER_LOCK(vlapic) mtx_lock_spin(&((vlapic)->timer_mtx)) +#define VLAPIC_TIMER_UNLOCK(vlapic) mtx_unlock_spin(&((vlapic)->timer_mtx)) +#define VLAPIC_TIMER_LOCKED(vlapic) mtx_owned(&((vlapic)->timer_mtx)) + +/* + * APIC timer frequency: + * - arbitrary but chosen to be in the ballpark of contemporary hardware. + * - power-of-two to avoid loss of precision when converted to a bintime. 
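+ * - 128 * 1024 * 1024 works out to 134217728 Hz (~134 MHz).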
+ */ +#define VLAPIC_BUS_FREQ (128 * 1024 * 1024) + +static void vlapic_set_error(struct vlapic *, uint32_t, bool); +static void vlapic_tmr_reset(struct vlapic *); + +static __inline uint32_t +vlapic_get_id(struct vlapic *vlapic) +{ + + if (x2apic(vlapic)) + return (vlapic->vcpuid); + else + return (vlapic->vcpuid << 24); +} + +static uint32_t +x2apic_ldr(struct vlapic *vlapic) +{ + int apicid; + uint32_t ldr; + + apicid = vlapic_get_id(vlapic); + ldr = 1 << (apicid & 0xf); + ldr |= (apicid & 0xffff0) << 12; + return (ldr); +} + +void +vlapic_dfr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + if (x2apic(vlapic)) { + VM_CTR1(vlapic->vm, "ignoring write to DFR in x2apic mode: %#x", + lapic->dfr); + lapic->dfr = 0; + return; + } + + lapic->dfr &= APIC_DFR_MODEL_MASK; + lapic->dfr |= APIC_DFR_RESERVED; + + if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT) + VLAPIC_CTR0(vlapic, "vlapic DFR in Flat Model"); + else if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER) + VLAPIC_CTR0(vlapic, "vlapic DFR in Cluster Model"); + else + VLAPIC_CTR1(vlapic, "DFR in Unknown Model %#x", lapic->dfr); +} + +void +vlapic_ldr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + + /* LDR is read-only in x2apic mode */ + if (x2apic(vlapic)) { + VLAPIC_CTR1(vlapic, "ignoring write to LDR in x2apic mode: %#x", + lapic->ldr); + lapic->ldr = x2apic_ldr(vlapic); + } else { + lapic->ldr &= ~APIC_LDR_RESERVED; + VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr); + } +} + +void +vlapic_id_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + /* + * We don't allow the ID register to be modified so reset it back to + * its default value. + */ + lapic = vlapic->apic_page; + lapic->id = vlapic_get_id(vlapic); +} + +static int +vlapic_timer_divisor(uint32_t dcr) +{ + switch (dcr & 0xB) { + case APIC_TDCR_1: + return (1); + case APIC_TDCR_2: + return (2); + case APIC_TDCR_4: + return (4); + case APIC_TDCR_8: + return (8); + case APIC_TDCR_16: + return (16); + case APIC_TDCR_32: + return (32); + case APIC_TDCR_64: + return (64); + case APIC_TDCR_128: + return (128); + default: + panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr); + } +} + +#if 0 +static inline void +vlapic_dump_lvt(uint32_t offset, uint32_t *lvt) +{ + printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset, + *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS, + *lvt & APIC_LVTT_M); +} +#endif + +static uint32_t +vlapic_get_ccr(struct vlapic *vlapic) +{ + struct bintime bt_now, bt_rem; + struct LAPIC *lapic; + uint32_t ccr; + + ccr = 0; + lapic = vlapic->apic_page; + + VLAPIC_TIMER_LOCK(vlapic); + if (callout_active(&vlapic->callout)) { + /* + * If the timer is scheduled to expire in the future then + * compute the value of 'ccr' based on the remaining time. 
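+	 * (ccr = time_remaining * timer_freq, accumulated below from the
+	 * whole-second and fractional parts of the bintime delta.)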
+ */ + binuptime(&bt_now); + if (bintime_cmp(&vlapic->timer_fire_bt, &bt_now, >)) { + bt_rem = vlapic->timer_fire_bt; + bintime_sub(&bt_rem, &bt_now); + ccr += bt_rem.sec * BT2FREQ(&vlapic->timer_freq_bt); + ccr += bt_rem.frac / vlapic->timer_freq_bt.frac; + } + } +#ifdef __FreeBSD__ + KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %#x, " + "icr_timer is %#x", ccr, lapic->icr_timer)); +#else + KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %x, " + "icr_timer is %x", ccr, lapic->icr_timer)); +#endif + VLAPIC_CTR2(vlapic, "vlapic ccr_timer = %#x, icr_timer = %#x", + ccr, lapic->icr_timer); + VLAPIC_TIMER_UNLOCK(vlapic); + return (ccr); +} + +void +vlapic_dcr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + int divisor; + + lapic = vlapic->apic_page; + VLAPIC_TIMER_LOCK(vlapic); + + divisor = vlapic_timer_divisor(lapic->dcr_timer); + VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d", + lapic->dcr_timer, divisor); + + /* + * Update the timer frequency and the timer period. + * + * XXX changes to the frequency divider will not take effect until + * the timer is reloaded. + */ + FREQ2BT(VLAPIC_BUS_FREQ / divisor, &vlapic->timer_freq_bt); + vlapic->timer_period_bt = vlapic->timer_freq_bt; + bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer); + + VLAPIC_TIMER_UNLOCK(vlapic); +} + +void +vlapic_esr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + lapic->esr = vlapic->esr_pending; + vlapic->esr_pending = 0; +} + +int +vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level) +{ + struct LAPIC *lapic; + uint32_t *irrptr, *tmrptr, mask; + int idx; + + KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector)); + + lapic = vlapic->apic_page; + if (!(lapic->svr & APIC_SVR_ENABLE)) { + VLAPIC_CTR1(vlapic, "vlapic is software disabled, ignoring " + "interrupt %d", vector); + return (0); + } + + if (vector < 16) { + vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR, + false); + VLAPIC_CTR1(vlapic, "vlapic ignoring interrupt to vector %d", + vector); + return (1); + } + + if (vlapic->ops.set_intr_ready) + return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level)); + + idx = (vector / 32) * 4; + mask = 1 << (vector % 32); + + irrptr = &lapic->irr0; + atomic_set_int(&irrptr[idx], mask); + + /* + * Verify that the trigger-mode of the interrupt matches with + * the vlapic TMR registers. + */ + tmrptr = &lapic->tmr0; + if ((tmrptr[idx] & mask) != (level ? mask : 0)) { + VLAPIC_CTR3(vlapic, "vlapic TMR[%d] is 0x%08x but " + "interrupt is %s-triggered", idx / 4, tmrptr[idx], + level ? "level" : "edge"); + } + + VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready"); + return (1); +} + +static __inline uint32_t * +vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset) +{ + struct LAPIC *lapic = vlapic->apic_page; + int i; + + switch (offset) { + case APIC_OFFSET_CMCI_LVT: + return (&lapic->lvt_cmci); + case APIC_OFFSET_TIMER_LVT ... 
APIC_OFFSET_ERROR_LVT: + i = (offset - APIC_OFFSET_TIMER_LVT) >> 2; + return ((&lapic->lvt_timer) + i);; + default: + panic("vlapic_get_lvt: invalid LVT\n"); + } +} + +static __inline int +lvt_off_to_idx(uint32_t offset) +{ + int index; + + switch (offset) { + case APIC_OFFSET_CMCI_LVT: + index = APIC_LVT_CMCI; + break; + case APIC_OFFSET_TIMER_LVT: + index = APIC_LVT_TIMER; + break; + case APIC_OFFSET_THERM_LVT: + index = APIC_LVT_THERMAL; + break; + case APIC_OFFSET_PERF_LVT: + index = APIC_LVT_PMC; + break; + case APIC_OFFSET_LINT0_LVT: + index = APIC_LVT_LINT0; + break; + case APIC_OFFSET_LINT1_LVT: + index = APIC_LVT_LINT1; + break; + case APIC_OFFSET_ERROR_LVT: + index = APIC_LVT_ERROR; + break; + default: + index = -1; + break; + } +#ifdef __FreeBSD__ + KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: " + "invalid lvt index %d for offset %#x", index, offset)); +#else + KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: " + "invalid lvt index %d for offset %x", index, offset)); +#endif + + return (index); +} + +static __inline uint32_t +vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset) +{ + int idx; + uint32_t val; + + idx = lvt_off_to_idx(offset); + val = atomic_load_acq_32(&vlapic->lvt_last[idx]); + return (val); +} + +void +vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset) +{ + uint32_t *lvtptr, mask, val; + struct LAPIC *lapic; + int idx; + + lapic = vlapic->apic_page; + lvtptr = vlapic_get_lvtptr(vlapic, offset); + val = *lvtptr; + idx = lvt_off_to_idx(offset); + + if (!(lapic->svr & APIC_SVR_ENABLE)) + val |= APIC_LVT_M; + mask = APIC_LVT_M | APIC_LVT_DS | APIC_LVT_VECTOR; + switch (offset) { + case APIC_OFFSET_TIMER_LVT: + mask |= APIC_LVTT_TM; + break; + case APIC_OFFSET_ERROR_LVT: + break; + case APIC_OFFSET_LINT0_LVT: + case APIC_OFFSET_LINT1_LVT: + mask |= APIC_LVT_TM | APIC_LVT_RIRR | APIC_LVT_IIPP; + /* FALLTHROUGH */ + default: + mask |= APIC_LVT_DM; + break; + } + val &= mask; + *lvtptr = val; + atomic_store_rel_32(&vlapic->lvt_last[idx], val); +} + +static void +vlapic_mask_lvts(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + + lapic->lvt_cmci |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_CMCI_LVT); + + lapic->lvt_timer |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_TIMER_LVT); + + lapic->lvt_thermal |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_THERM_LVT); + + lapic->lvt_pcint |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_PERF_LVT); + + lapic->lvt_lint0 |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT0_LVT); + + lapic->lvt_lint1 |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT1_LVT); + + lapic->lvt_error |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_ERROR_LVT); +} + +static int +vlapic_fire_lvt(struct vlapic *vlapic, u_int lvt) +{ + uint32_t mode, reg, vec; + + reg = atomic_load_acq_32(&vlapic->lvt_last[lvt]); + + if (reg & APIC_LVT_M) + return (0); + vec = reg & APIC_LVT_VECTOR; + mode = reg & APIC_LVT_DM; + + switch (mode) { + case APIC_LVT_DM_FIXED: + if (vec < 16) { + vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, + lvt == APIC_LVT_ERROR); + return (0); + } + if (vlapic_set_intr_ready(vlapic, vec, false)) + vcpu_notify_event(vlapic->vm, vlapic->vcpuid, true); + break; + case APIC_LVT_DM_NMI: + vm_inject_nmi(vlapic->vm, vlapic->vcpuid); + break; + case APIC_LVT_DM_EXTINT: + vm_inject_extint(vlapic->vm, vlapic->vcpuid); + break; + default: + // Other modes 
ignored + return (0); + } + return (1); +} + +#if 1 +static void +dump_isrvec_stk(struct vlapic *vlapic) +{ + int i; + uint32_t *isrptr; + + isrptr = &vlapic->apic_page->isr0; + for (i = 0; i < 8; i++) + printf("ISR%d 0x%08x\n", i, isrptr[i * 4]); + + for (i = 0; i <= vlapic->isrvec_stk_top; i++) + printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]); +} +#endif + +/* + * Algorithm adopted from section "Interrupt, Task and Processor Priority" + * in Intel Architecture Manual Vol 3a. + */ +static void +vlapic_update_ppr(struct vlapic *vlapic) +{ + int isrvec, tpr, ppr; + + /* + * Note that the value on the stack at index 0 is always 0. + * + * This is a placeholder for the value of ISRV when none of the + * bits is set in the ISRx registers. + */ + isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top]; + tpr = vlapic->apic_page->tpr; + +#if 1 + { + int i, lastprio, curprio, vector, idx; + uint32_t *isrptr; + + if (vlapic->isrvec_stk_top == 0 && isrvec != 0) + panic("isrvec_stk is corrupted: %d", isrvec); + + /* + * Make sure that the priority of the nested interrupts is + * always increasing. + */ + lastprio = -1; + for (i = 1; i <= vlapic->isrvec_stk_top; i++) { + curprio = PRIO(vlapic->isrvec_stk[i]); + if (curprio <= lastprio) { + dump_isrvec_stk(vlapic); + panic("isrvec_stk does not satisfy invariant"); + } + lastprio = curprio; + } + + /* + * Make sure that each bit set in the ISRx registers has a + * corresponding entry on the isrvec stack. + */ + i = 1; + isrptr = &vlapic->apic_page->isr0; + for (vector = 0; vector < 256; vector++) { + idx = (vector / 32) * 4; + if (isrptr[idx] & (1 << (vector % 32))) { + if (i > vlapic->isrvec_stk_top || + vlapic->isrvec_stk[i] != vector) { + dump_isrvec_stk(vlapic); + panic("ISR and isrvec_stk out of sync"); + } + i++; + } + } + } +#endif + + if (PRIO(tpr) >= PRIO(isrvec)) + ppr = tpr; + else + ppr = isrvec & 0xf0; + + vlapic->apic_page->ppr = ppr; + VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); +} + +static VMM_STAT(VLAPIC_GRATUITOUS_EOI, "EOI without any in-service interrupt"); + +static void +vlapic_process_eoi(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *isrptr, *tmrptr; + int i, idx, bitpos, vector; + + isrptr = &lapic->isr0; + tmrptr = &lapic->tmr0; + + for (i = 7; i >= 0; i--) { + idx = i * 4; + bitpos = fls(isrptr[idx]); + if (bitpos-- != 0) { + if (vlapic->isrvec_stk_top <= 0) { + panic("invalid vlapic isrvec_stk_top %d", + vlapic->isrvec_stk_top); + } + isrptr[idx] &= ~(1 << bitpos); + vector = i * 32 + bitpos; + VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "EOI vector %d", + vector); + VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi"); + vlapic->isrvec_stk_top--; + vlapic_update_ppr(vlapic); + if ((tmrptr[idx] & (1 << bitpos)) != 0) { + vioapic_process_eoi(vlapic->vm, vlapic->vcpuid, + vector); + } + return; + } + } + VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "Gratuitous EOI"); + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_GRATUITOUS_EOI, 1); +} + +static __inline int +vlapic_get_lvt_field(uint32_t lvt, uint32_t mask) +{ + + return (lvt & mask); +} + +static __inline int +vlapic_periodic_timer(struct vlapic *vlapic) +{ + uint32_t lvt; + + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); + + return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC)); +} + +static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic"); + +static void +vlapic_set_error(struct vlapic *vlapic, uint32_t mask, bool lvt_error) +{ + + vlapic->esr_pending |= mask; + + /* + * Avoid infinite recursion if the error 
LVT itself is configured with + * an illegal vector. + */ + if (lvt_error) + return; + + if (vlapic_fire_lvt(vlapic, APIC_LVT_ERROR)) { + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_ERROR, 1); + } +} + +static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic"); + +static void +vlapic_fire_timer(struct vlapic *vlapic) +{ + + KASSERT(VLAPIC_TIMER_LOCKED(vlapic), ("vlapic_fire_timer not locked")); + + if (vlapic_fire_lvt(vlapic, APIC_LVT_TIMER)) { + VLAPIC_CTR0(vlapic, "vlapic timer fired"); + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1); + } +} + +static VMM_STAT(VLAPIC_INTR_CMC, + "corrected machine check interrupts generated by vlapic"); + +void +vlapic_fire_cmci(struct vlapic *vlapic) +{ + + if (vlapic_fire_lvt(vlapic, APIC_LVT_CMCI)) { + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_CMC, 1); + } +} + +static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_INDEX + 1, + "lvts triggered"); + +int +vlapic_trigger_lvt(struct vlapic *vlapic, int vector) +{ + + if (vlapic_enabled(vlapic) == false) { + /* + * When the local APIC is global/hardware disabled, + * LINT[1:0] pins are configured as INTR and NMI pins, + * respectively. + */ + switch (vector) { + case APIC_LVT_LINT0: + vm_inject_extint(vlapic->vm, vlapic->vcpuid); + break; + case APIC_LVT_LINT1: + vm_inject_nmi(vlapic->vm, vlapic->vcpuid); + break; + default: + break; + } + return (0); + } + + switch (vector) { + case APIC_LVT_LINT0: + case APIC_LVT_LINT1: + case APIC_LVT_TIMER: + case APIC_LVT_ERROR: + case APIC_LVT_PMC: + case APIC_LVT_THERMAL: + case APIC_LVT_CMCI: + if (vlapic_fire_lvt(vlapic, vector)) { + vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, + LVTS_TRIGGERRED, vector, 1); + } + break; + default: + return (EINVAL); + } + return (0); +} + +static void +vlapic_callout_handler(void *arg) +{ + struct vlapic *vlapic; + struct bintime bt, btnow; + sbintime_t rem_sbt; + + vlapic = arg; + + VLAPIC_TIMER_LOCK(vlapic); + if (callout_pending(&vlapic->callout)) /* callout was reset */ + goto done; + + if (!callout_active(&vlapic->callout)) /* callout was stopped */ + goto done; + + callout_deactivate(&vlapic->callout); + + vlapic_fire_timer(vlapic); + + if (vlapic_periodic_timer(vlapic)) { + binuptime(&btnow); +#ifdef __FreeBSD__ + KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=), + ("vlapic callout at %#lx.%#lx, expected at %#lx.#%lx", + btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec, + vlapic->timer_fire_bt.frac)); +#else + KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=), + ("vlapic callout at %lx.%lx, expected at %lx.%lx", + btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec, + vlapic->timer_fire_bt.frac)); +#endif + + /* + * Compute the delta between when the timer was supposed to + * fire and the present time. + */ + bt = btnow; + bintime_sub(&bt, &vlapic->timer_fire_bt); + + rem_sbt = bttosbt(vlapic->timer_period_bt); + if (bintime_cmp(&bt, &vlapic->timer_period_bt, <)) { + /* + * Adjust the time until the next countdown downward + * to account for the lost time. + */ + rem_sbt -= bttosbt(bt); + } else { + /* + * If the delta is greater than the timer period then + * just reset our time base instead of trying to catch + * up. 
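+	 * (The backlogged ticks are simply skipped; the next fire is
+	 * scheduled one full period from now.)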
+ */ + vlapic->timer_fire_bt = btnow; + VLAPIC_CTR2(vlapic, "vlapic timer lagging by %lu " + "usecs, period is %lu usecs - resetting time base", + bttosbt(bt) / SBT_1US, + bttosbt(vlapic->timer_period_bt) / SBT_1US); + } + + bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); + callout_reset_sbt(&vlapic->callout, rem_sbt, 0, + vlapic_callout_handler, vlapic, 0); + } +done: + VLAPIC_TIMER_UNLOCK(vlapic); +} + +void +vlapic_icrtmr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + sbintime_t sbt; + uint32_t icr_timer; + + VLAPIC_TIMER_LOCK(vlapic); + + lapic = vlapic->apic_page; + icr_timer = lapic->icr_timer; + + vlapic->timer_period_bt = vlapic->timer_freq_bt; + bintime_mul(&vlapic->timer_period_bt, icr_timer); + + if (icr_timer != 0) { + binuptime(&vlapic->timer_fire_bt); + bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); + + sbt = bttosbt(vlapic->timer_period_bt); + callout_reset_sbt(&vlapic->callout, sbt, 0, + vlapic_callout_handler, vlapic, 0); + } else + callout_stop(&vlapic->callout); + + VLAPIC_TIMER_UNLOCK(vlapic); +} + +/* + * This function populates 'dmask' with the set of vcpus that match the + * addressing specified by the (dest, phys, lowprio) tuple. + * + * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit) + * or xAPIC (8-bit) destination field. + */ +void +vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, + bool lowprio, bool x2apic_dest) +{ + struct vlapic *vlapic; + uint32_t dfr, ldr, ldest, cluster; + uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id; + cpuset_t amask; + int vcpuid; + + if ((x2apic_dest && dest == 0xffffffff) || + (!x2apic_dest && dest == 0xff)) { + /* + * Broadcast in both logical and physical modes. + */ + *dmask = vm_active_cpus(vm); + return; + } + + if (phys) { + /* + * Physical mode: destination is APIC ID. + */ + CPU_ZERO(dmask); + vcpuid = vm_apicid2vcpuid(vm, dest); + if (vcpuid < vm_get_maxcpus(vm)) + CPU_SET(vcpuid, dmask); + } else { + /* + * In the "Flat Model" the MDA is interpreted as an 8-bit wide + * bitmask. This model is only available in the xAPIC mode. + */ + mda_flat_ldest = dest & 0xff; + + /* + * In the "Cluster Model" the MDA is used to identify a + * specific cluster and a set of APICs in that cluster. + */ + if (x2apic_dest) { + mda_cluster_id = dest >> 16; + mda_cluster_ldest = dest & 0xffff; + } else { + mda_cluster_id = (dest >> 4) & 0xf; + mda_cluster_ldest = dest & 0xf; + } + + /* + * Logical mode: match each APIC that has a bit set + * in its LDR that matches a bit in the ldest. + */ + CPU_ZERO(dmask); + amask = vm_active_cpus(vm); + while ((vcpuid = CPU_FFS(&amask)) != 0) { + vcpuid--; + CPU_CLR(vcpuid, &amask); + + vlapic = vm_lapic(vm, vcpuid); + dfr = vlapic->apic_page->dfr; + ldr = vlapic->apic_page->ldr; + + if ((dfr & APIC_DFR_MODEL_MASK) == + APIC_DFR_MODEL_FLAT) { + ldest = ldr >> 24; + mda_ldest = mda_flat_ldest; + } else if ((dfr & APIC_DFR_MODEL_MASK) == + APIC_DFR_MODEL_CLUSTER) { + if (x2apic(vlapic)) { + cluster = ldr >> 16; + ldest = ldr & 0xffff; + } else { + cluster = ldr >> 28; + ldest = (ldr >> 24) & 0xf; + } + if (cluster != mda_cluster_id) + continue; + mda_ldest = mda_cluster_ldest; + } else { + /* + * Guest has configured a bad logical + * model for this vcpu - skip it. 
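+	 * (The DFR selects neither the flat nor the cluster model, so no
+	 * MDA comparison is possible for this vcpu.)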
+ */ + VLAPIC_CTR1(vlapic, "vlapic has bad logical " + "model %x - cannot deliver interrupt", dfr); + continue; + } + + if ((mda_ldest & ldest) != 0) { + CPU_SET(vcpuid, dmask); + if (lowprio) + break; + } + } + } +} + +static VMM_STAT_ARRAY(IPIS_SENT, VM_MAXCPU, "ipis sent to vcpu"); + +static void +vlapic_set_tpr(struct vlapic *vlapic, uint8_t val) +{ + struct LAPIC *lapic = vlapic->apic_page; + + if (lapic->tpr != val) { + VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vlapic TPR changed " + "from %#x to %#x", lapic->tpr, val); + lapic->tpr = val; + vlapic_update_ppr(vlapic); + } +} + +static uint8_t +vlapic_get_tpr(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + + return (lapic->tpr); +} + +void +vlapic_set_cr8(struct vlapic *vlapic, uint64_t val) +{ + uint8_t tpr; + + if (val & ~0xf) { + vm_inject_gp(vlapic->vm, vlapic->vcpuid); + return; + } + + tpr = val << 4; + vlapic_set_tpr(vlapic, tpr); +} + +uint64_t +vlapic_get_cr8(struct vlapic *vlapic) +{ + uint8_t tpr; + + tpr = vlapic_get_tpr(vlapic); + return (tpr >> 4); +} + +int +vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) +{ + int i; + bool phys; + cpuset_t dmask; + uint64_t icrval; + uint32_t dest, vec, mode; + struct vlapic *vlapic2; + struct vm_exit *vmexit; + struct LAPIC *lapic; + uint16_t maxcpus; + + lapic = vlapic->apic_page; + lapic->icr_lo &= ~APIC_DELSTAT_PEND; + icrval = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo; + + if (x2apic(vlapic)) + dest = icrval >> 32; + else + dest = icrval >> (32 + 24); + vec = icrval & APIC_VECTOR_MASK; + mode = icrval & APIC_DELMODE_MASK; + + if (mode == APIC_DELMODE_FIXED && vec < 16) { + vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, false); + VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec); + return (0); + } + + VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec); + + if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) { + switch (icrval & APIC_DEST_MASK) { + case APIC_DEST_DESTFLD: + phys = ((icrval & APIC_DESTMODE_LOG) == 0); + vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false, + x2apic(vlapic)); + break; + case APIC_DEST_SELF: + CPU_SETOF(vlapic->vcpuid, &dmask); + break; + case APIC_DEST_ALLISELF: + dmask = vm_active_cpus(vlapic->vm); + break; + case APIC_DEST_ALLESELF: + dmask = vm_active_cpus(vlapic->vm); + CPU_CLR(vlapic->vcpuid, &dmask); + break; + default: + CPU_ZERO(&dmask); /* satisfy gcc */ + break; + } + + while ((i = CPU_FFS(&dmask)) != 0) { + i--; + CPU_CLR(i, &dmask); + if (mode == APIC_DELMODE_FIXED) { + lapic_intr_edge(vlapic->vm, i, vec); + vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, + IPIS_SENT, i, 1); + VLAPIC_CTR2(vlapic, "vlapic sending ipi %d " + "to vcpuid %d", vec, i); + } else { + vm_inject_nmi(vlapic->vm, i); + VLAPIC_CTR1(vlapic, "vlapic sending ipi nmi " + "to vcpuid %d", i); + } + } + + return (0); /* handled completely in the kernel */ + } + + maxcpus = vm_get_maxcpus(vlapic->vm); + if (mode == APIC_DELMODE_INIT) { + if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) + return (0); + + if (vlapic->vcpuid == 0 && dest != 0 && dest < maxcpus) { + vlapic2 = vm_lapic(vlapic->vm, dest); + + /* move from INIT to waiting-for-SIPI state */ + if (vlapic2->boot_state == BS_INIT) { + vlapic2->boot_state = BS_SIPI; + } + + return (0); + } + } + + if (mode == APIC_DELMODE_STARTUP) { + if (vlapic->vcpuid == 0 && dest != 0 && dest < maxcpus) { + vlapic2 = vm_lapic(vlapic->vm, dest); + + /* + * Ignore SIPIs in any state other than wait-for-SIPI + */ + if (vlapic2->boot_state != BS_SIPI) + 
return (0); + + vlapic2->boot_state = BS_RUNNING; + + *retu = true; + vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); + vmexit->exitcode = VM_EXITCODE_SPINUP_AP; + vmexit->u.spinup_ap.vcpu = dest; + vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT; + + return (0); + } + } + + /* + * This will cause a return to userland. + */ + return (1); +} + +void +vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val) +{ + int vec; + + KASSERT(x2apic(vlapic), ("SELF_IPI does not exist in xAPIC mode")); + + vec = val & 0xff; + lapic_intr_edge(vlapic->vm, vlapic->vcpuid, vec); + vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, IPIS_SENT, + vlapic->vcpuid, 1); + VLAPIC_CTR1(vlapic, "vlapic self-ipi %d", vec); +} + +int +vlapic_pending_intr(struct vlapic *vlapic, int *vecptr) +{ + struct LAPIC *lapic = vlapic->apic_page; + int idx, i, bitpos, vector; + uint32_t *irrptr, val; + + if (vlapic->ops.pending_intr) + return ((*vlapic->ops.pending_intr)(vlapic, vecptr)); + + irrptr = &lapic->irr0; + + for (i = 7; i >= 0; i--) { + idx = i * 4; + val = atomic_load_acq_int(&irrptr[idx]); + bitpos = fls(val); + if (bitpos != 0) { + vector = i * 32 + (bitpos - 1); + if (PRIO(vector) > PRIO(lapic->ppr)) { + VLAPIC_CTR1(vlapic, "pending intr %d", vector); + if (vecptr != NULL) + *vecptr = vector; + return (1); + } else + break; + } + } + return (0); +} + +void +vlapic_intr_accepted(struct vlapic *vlapic, int vector) +{ + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *irrptr, *isrptr; + int idx, stk_top; + + if (vlapic->ops.intr_accepted) + return ((*vlapic->ops.intr_accepted)(vlapic, vector)); + + /* + * clear the ready bit for vector being accepted in irr + * and set the vector as in service in isr. + */ + idx = (vector / 32) * 4; + + irrptr = &lapic->irr0; + atomic_clear_int(&irrptr[idx], 1 << (vector % 32)); + VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted"); + + isrptr = &lapic->isr0; + isrptr[idx] |= 1 << (vector % 32); + VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted"); + + /* + * Update the PPR + */ + vlapic->isrvec_stk_top++; + + stk_top = vlapic->isrvec_stk_top; + if (stk_top >= ISRVEC_STK_SIZE) + panic("isrvec_stk_top overflow %d", stk_top); + + vlapic->isrvec_stk[stk_top] = vector; + vlapic_update_ppr(vlapic); +} + +void +vlapic_svr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + uint32_t old, new, changed; + + lapic = vlapic->apic_page; + + new = lapic->svr; + old = vlapic->svr_last; + vlapic->svr_last = new; + + changed = old ^ new; + if ((changed & APIC_SVR_ENABLE) != 0) { + if ((new & APIC_SVR_ENABLE) == 0) { + /* + * The apic is now disabled so stop the apic timer + * and mask all the LVT entries. + */ + VLAPIC_CTR0(vlapic, "vlapic is software-disabled"); + VLAPIC_TIMER_LOCK(vlapic); + callout_stop(&vlapic->callout); + VLAPIC_TIMER_UNLOCK(vlapic); + vlapic_mask_lvts(vlapic); + } else { + /* + * The apic is now enabled so restart the apic timer + * if it is configured in periodic mode. 
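+	 * (A one-shot timer is not rearmed here; the guest has to reload
+	 * ICR_TIMER itself to start it again.)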
+ */ + VLAPIC_CTR0(vlapic, "vlapic is software-enabled"); + if (vlapic_periodic_timer(vlapic)) + vlapic_icrtmr_write_handler(vlapic); + } + } +} + +int +vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t *data, bool *retu) +{ + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *reg; + int i; + + /* Ignore MMIO accesses in x2APIC mode */ + if (x2apic(vlapic) && mmio_access) { + VLAPIC_CTR1(vlapic, "MMIO read from offset %#lx in x2APIC mode", + offset); + *data = 0; + goto done; + } + + if (!x2apic(vlapic) && !mmio_access) { + /* + * XXX Generate GP fault for MSR accesses in xAPIC mode + */ + VLAPIC_CTR1(vlapic, "x2APIC MSR read from offset %#lx in " + "xAPIC mode", offset); + *data = 0; + goto done; + } + + if (offset > sizeof(*lapic)) { + *data = 0; + goto done; + } + + offset &= ~3; + switch(offset) + { + case APIC_OFFSET_ID: + *data = lapic->id; + break; + case APIC_OFFSET_VER: + *data = lapic->version; + break; + case APIC_OFFSET_TPR: + *data = vlapic_get_tpr(vlapic); + break; + case APIC_OFFSET_APR: + *data = lapic->apr; + break; + case APIC_OFFSET_PPR: + *data = lapic->ppr; + break; + case APIC_OFFSET_EOI: + *data = lapic->eoi; + break; + case APIC_OFFSET_LDR: + *data = lapic->ldr; + break; + case APIC_OFFSET_DFR: + *data = lapic->dfr; + break; + case APIC_OFFSET_SVR: + *data = lapic->svr; + break; + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + i = (offset - APIC_OFFSET_ISR0) >> 2; + reg = &lapic->isr0; + *data = *(reg + i); + break; + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + i = (offset - APIC_OFFSET_TMR0) >> 2; + reg = &lapic->tmr0; + *data = *(reg + i); + break; + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + i = (offset - APIC_OFFSET_IRR0) >> 2; + reg = &lapic->irr0; + *data = atomic_load_acq_int(reg + i); + break; + case APIC_OFFSET_ESR: + *data = lapic->esr; + break; + case APIC_OFFSET_ICR_LOW: + *data = lapic->icr_lo; + if (x2apic(vlapic)) + *data |= (uint64_t)lapic->icr_hi << 32; + break; + case APIC_OFFSET_ICR_HI: + *data = lapic->icr_hi; + break; + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_LVT ... 
APIC_OFFSET_ERROR_LVT: + *data = vlapic_get_lvt(vlapic, offset); +#ifdef INVARIANTS + reg = vlapic_get_lvtptr(vlapic, offset); + KASSERT(*data == *reg, ("inconsistent lvt value at " + "offset %#lx: %#lx/%#x", offset, *data, *reg)); +#endif + break; + case APIC_OFFSET_TIMER_ICR: + *data = lapic->icr_timer; + break; + case APIC_OFFSET_TIMER_CCR: + *data = vlapic_get_ccr(vlapic); + break; + case APIC_OFFSET_TIMER_DCR: + *data = lapic->dcr_timer; + break; + case APIC_OFFSET_SELF_IPI: + /* + * XXX generate a GP fault if vlapic is in x2apic mode + */ + *data = 0; + break; + case APIC_OFFSET_RRR: + default: + *data = 0; + break; + } +done: + VLAPIC_CTR2(vlapic, "vlapic read offset %#x, data %#lx", offset, *data); + return 0; +} + +int +vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t data, bool *retu) +{ + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *regptr; + int retval; + +#ifdef __FreeBSD__ + KASSERT((offset & 0xf) == 0 && offset < PAGE_SIZE, + ("vlapic_write: invalid offset %#lx", offset)); +#else + KASSERT((offset & 0xf) == 0 && offset < PAGE_SIZE, + ("vlapic_write: invalid offset %lx", offset)); +#endif + + VLAPIC_CTR2(vlapic, "vlapic write offset %#lx, data %#lx", + offset, data); + + if (offset > sizeof(*lapic)) + return (0); + + /* Ignore MMIO accesses in x2APIC mode */ + if (x2apic(vlapic) && mmio_access) { + VLAPIC_CTR2(vlapic, "MMIO write of %#lx to offset %#lx " + "in x2APIC mode", data, offset); + return (0); + } + + /* + * XXX Generate GP fault for MSR accesses in xAPIC mode + */ + if (!x2apic(vlapic) && !mmio_access) { + VLAPIC_CTR2(vlapic, "x2APIC MSR write of %#lx to offset %#lx " + "in xAPIC mode", data, offset); + return (0); + } + + retval = 0; + switch(offset) + { + case APIC_OFFSET_ID: + lapic->id = data; + vlapic_id_write_handler(vlapic); + break; + case APIC_OFFSET_TPR: + vlapic_set_tpr(vlapic, data & 0xff); + break; + case APIC_OFFSET_EOI: + vlapic_process_eoi(vlapic); + break; + case APIC_OFFSET_LDR: + lapic->ldr = data; + vlapic_ldr_write_handler(vlapic); + break; + case APIC_OFFSET_DFR: + lapic->dfr = data; + vlapic_dfr_write_handler(vlapic); + break; + case APIC_OFFSET_SVR: + lapic->svr = data; + vlapic_svr_write_handler(vlapic); + break; + case APIC_OFFSET_ICR_LOW: + lapic->icr_lo = data; + if (x2apic(vlapic)) + lapic->icr_hi = data >> 32; + retval = vlapic_icrlo_write_handler(vlapic, retu); + break; + case APIC_OFFSET_ICR_HI: + lapic->icr_hi = data; + break; + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: + regptr = vlapic_get_lvtptr(vlapic, offset); + *regptr = data; + vlapic_lvt_write_handler(vlapic, offset); + break; + case APIC_OFFSET_TIMER_ICR: + lapic->icr_timer = data; + vlapic_icrtmr_write_handler(vlapic); + break; + + case APIC_OFFSET_TIMER_DCR: + lapic->dcr_timer = data; + vlapic_dcr_write_handler(vlapic); + break; + + case APIC_OFFSET_ESR: + vlapic_esr_write_handler(vlapic); + break; + + case APIC_OFFSET_SELF_IPI: + if (x2apic(vlapic)) + vlapic_self_ipi_handler(vlapic, data); + break; + + case APIC_OFFSET_VER: + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + case APIC_OFFSET_TIMER_CCR: + default: + // Read only. 
+ break; + } + + return (retval); +} + +static void +vlapic_reset(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + bzero(lapic, sizeof(struct LAPIC)); + + lapic->id = vlapic_get_id(vlapic); + lapic->version = VLAPIC_VERSION; + lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT); + lapic->dfr = 0xffffffff; + lapic->svr = APIC_SVR_VECTOR; + vlapic_mask_lvts(vlapic); + vlapic_tmr_reset(vlapic); + + lapic->dcr_timer = 0; + vlapic_dcr_write_handler(vlapic); + + if (vlapic->vcpuid == 0) + vlapic->boot_state = BS_RUNNING; /* BSP */ + else + vlapic->boot_state = BS_INIT; /* AP */ + + vlapic->svr_last = lapic->svr; +} + +void +vlapic_init(struct vlapic *vlapic) +{ + KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized")); + KASSERT(vlapic->vcpuid >= 0 && + vlapic->vcpuid < vm_get_maxcpus(vlapic->vm), + ("vlapic_init: vcpuid is not initialized")); + KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not " + "initialized")); + + /* + * If the vlapic is configured in x2apic mode then it will be + * accessed in the critical section via the MSR emulation code. + * + * Therefore the timer mutex must be a spinlock because blockable + * mutexes cannot be acquired in a critical section. + */ + mtx_init(&vlapic->timer_mtx, "vlapic timer mtx", NULL, MTX_SPIN); + callout_init(&vlapic->callout, 1); + + vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED; + + if (vlapic->vcpuid == 0) + vlapic->msr_apicbase |= APICBASE_BSP; + + vlapic_reset(vlapic); +} + +void +vlapic_cleanup(struct vlapic *vlapic) +{ + + callout_drain(&vlapic->callout); +} + +uint64_t +vlapic_get_apicbase(struct vlapic *vlapic) +{ + + return (vlapic->msr_apicbase); +} + +int +vlapic_set_apicbase(struct vlapic *vlapic, uint64_t new) +{ + + if (vlapic->msr_apicbase != new) { + VLAPIC_CTR2(vlapic, "Changing APIC_BASE MSR from %#lx to %#lx " + "not supported", vlapic->msr_apicbase, new); + return (-1); + } + + return (0); +} + +void +vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) +{ + struct vlapic *vlapic; + struct LAPIC *lapic; + + vlapic = vm_lapic(vm, vcpuid); + + if (state == X2APIC_DISABLED) + vlapic->msr_apicbase &= ~APICBASE_X2APIC; + else + vlapic->msr_apicbase |= APICBASE_X2APIC; + + /* + * Reset the local APIC registers whose values are mode-dependent. + * + * XXX this works because the APIC mode can be changed only at vcpu + * initialization time. + */ + lapic = vlapic->apic_page; + lapic->id = vlapic_get_id(vlapic); + if (x2apic(vlapic)) { + lapic->ldr = x2apic_ldr(vlapic); + lapic->dfr = 0; + } else { + lapic->ldr = 0; + lapic->dfr = 0xffffffff; + } + + if (state == X2APIC_ENABLED) { + if (vlapic->ops.enable_x2apic_mode) + (*vlapic->ops.enable_x2apic_mode)(vlapic); + } +} + +void +vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, + int delmode, int vec) +{ + bool lowprio; + int vcpuid; + cpuset_t dmask; + + if (delmode != IOART_DELFIXED && + delmode != IOART_DELLOPRI && + delmode != IOART_DELEXINT) { + VM_CTR1(vm, "vlapic intr invalid delmode %#x", delmode); + return; + } + lowprio = (delmode == IOART_DELLOPRI); + + /* + * We don't provide any virtual interrupt redirection hardware so + * all interrupts originating from the ioapic or MSI specify the + * 'dest' in the legacy xAPIC format. 
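A note on the mmio_access/x2apic checks in vlapic_read() and vlapic_write() above: the two register spaces are related by a fixed scaling, with x2APIC MSR 0x800 + n corresponding to the xAPIC MMIO register at offset n * 16 (so the TPR, MMIO offset 0x80, becomes MSR 0x808). A minimal standalone sketch of that mapping follows; the helper names are illustrative and not part of this change.

#include <stdint.h>
#include <stdio.h>

#define X2APIC_MSR_BASE 0x800u

/* xAPIC MMIO offset for a given x2APIC MSR (e.g. 0x808 -> 0x80, the TPR). */
static uint32_t
x2apic_msr_to_mmio(uint32_t msr)
{
    return ((msr - X2APIC_MSR_BASE) << 4);
}

/* x2APIC MSR for a given xAPIC MMIO register offset. */
static uint32_t
mmio_to_x2apic_msr(uint32_t offset)
{
    return (X2APIC_MSR_BASE + (offset >> 4));
}

int
main(void)
{
    printf("MSR 0x808 -> MMIO %#x\n", x2apic_msr_to_mmio(0x808));
    printf("MMIO 0x80 -> MSR %#x\n", mmio_to_x2apic_msr(0x80));
    return (0);
}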
+ */ + vlapic_calcdest(vm, &dmask, dest, phys, lowprio, false); + + while ((vcpuid = CPU_FFS(&dmask)) != 0) { + vcpuid--; + CPU_CLR(vcpuid, &dmask); + if (delmode == IOART_DELEXINT) { + vm_inject_extint(vm, vcpuid); + } else { + lapic_set_intr(vm, vcpuid, vec, level); + } + } +} + +void +vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum) +{ + /* + * Post an interrupt to the vcpu currently running on 'hostcpu'. + * + * This is done by leveraging features like Posted Interrupts (Intel) + * Doorbell MSR (AMD AVIC) that avoid a VM exit. + * + * If neither of these features are available then fallback to + * sending an IPI to 'hostcpu'. + */ + if (vlapic->ops.post_intr) + (*vlapic->ops.post_intr)(vlapic, hostcpu); + else + ipi_cpu(hostcpu, ipinum); +} + +bool +vlapic_enabled(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + + if ((vlapic->msr_apicbase & APICBASE_ENABLED) != 0 && + (lapic->svr & APIC_SVR_ENABLE) != 0) + return (true); + else + return (false); +} + +static void +vlapic_tmr_reset(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + lapic->tmr0 = lapic->tmr1 = lapic->tmr2 = lapic->tmr3 = 0; + lapic->tmr4 = lapic->tmr5 = lapic->tmr6 = lapic->tmr7 = 0; + vlapic->tmr_pending = 1; +} + +/* + * Synchronize TMR designations into the LAPIC state. + * The vCPU must be in the VCPU_RUNNING state. + */ +void +vlapic_tmr_update(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + uint32_t *tmrptr; + uint32_t result[VLAPIC_TMR_CNT]; + u_int i, tmr_idx; + + if (vlapic->tmr_pending == 0) { + return; + } + + lapic = vlapic->apic_page; + tmrptr = &lapic->tmr0; + + VLAPIC_CTR0(vlapic, "synchronizing TMR"); + for (i = 0; i < VLAPIC_TMR_CNT; i++) { + tmr_idx = i * 4; + + tmrptr[tmr_idx] &= ~vlapic->tmr_vec_deassert[i]; + tmrptr[tmr_idx] |= vlapic->tmr_vec_assert[i]; + vlapic->tmr_vec_deassert[i] = 0; + vlapic->tmr_vec_assert[i] = 0; + result[i] = tmrptr[tmr_idx]; + } + vlapic->tmr_pending = 0; + + if (vlapic->ops.set_tmr != NULL) { + (*vlapic->ops.set_tmr)(vlapic, result); + } +} + +/* + * Designate the TMR state for a given interrupt vector. + * The caller must hold the vIOAPIC lock and prevent the vCPU corresponding to + * this vLAPIC instance from being-in or entering the VCPU_RUNNING state. + */ +void +vlapic_tmr_set(struct vlapic *vlapic, uint8_t vector, bool active) +{ + const uint32_t idx = vector / 32; + const uint32_t mask = 1 << (vector % 32); + + VLAPIC_CTR2(vlapic, "TMR for vector %u %sasserted", vector, + active ? "" : "de"); + if (active) { + vlapic->tmr_vec_assert[idx] |= mask; + vlapic->tmr_vec_deassert[idx] &= ~mask; + } else { + vlapic->tmr_vec_deassert[idx] |= mask; + vlapic->tmr_vec_assert[idx] &= ~mask; + } + + /* + * Track the number of TMR changes between calls to vlapic_tmr_update. + * While a simple boolean would suffice, this count may be useful when + * tracing or debugging, and is cheap to calculate. + */ + vlapic->tmr_pending = MIN(UINT32_MAX - 1, vlapic->tmr_pending) + 1; +} + +#ifndef __FreeBSD__ +void +vlapic_localize_resources(struct vlapic *vlapic) +{ + vmm_glue_callout_localize(&vlapic->callout); +} +#endif /* __FreeBSD */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.h b/usr/src/uts/i86pc/io/vmm/io/vlapic.h new file mode 100644 index 0000000000..e1a52551a9 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.h @@ -0,0 +1,114 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _VLAPIC_H_ +#define _VLAPIC_H_ + +struct vm; +enum x2apic_state; + +int vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t data, bool *retu); +int vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t *data, bool *retu); + +/* + * Returns 0 if there is no eligible vector that can be delivered to the + * guest at this time and non-zero otherwise. + * + * If an eligible vector number is found and 'vecptr' is not NULL then it will + * be stored in the location pointed to by 'vecptr'. + * + * Note that the vector does not automatically transition to the ISR as a + * result of calling this function. + */ +int vlapic_pending_intr(struct vlapic *vlapic, int *vecptr); + +/* + * Transition 'vector' from IRR to ISR. This function is called with the + * vector returned by 'vlapic_pending_intr()' when the guest is able to + * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that + * block interrupt delivery). + */ +void vlapic_intr_accepted(struct vlapic *vlapic, int vector); + +/* + * Returns 1 if the vcpu needs to be notified of the interrupt and 0 otherwise. + */ +int vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level); + +/* + * Post an interrupt to the vcpu running on 'hostcpu'. This will use a + * hardware assist if available (e.g. Posted Interrupt) or fall back to + * sending an 'ipinum' to interrupt the 'hostcpu'. 
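The contract described for vlapic_pending_intr() above amounts to scanning the 256-bit IRR from the highest 32-bit chunk downward and delivering a vector only when its priority class (the upper four bits of the vector) exceeds that of the current PPR. A simplified userland sketch of that scan, modelling the IRR as a plain uint32_t[8] rather than the 16-byte-strided register page the kernel uses:

#include <stdint.h>
#include <stdio.h>

#define PRIO(x) ((x) >> 4)  /* APIC priority class: upper nibble of a vector */

/*
 * Find the highest pending vector in a 256-bit IRR that is above the
 * current PPR. Returns the vector, or -1 if nothing can be delivered.
 */
static int
irr_pending(const uint32_t irr[8], uint8_t ppr)
{
    for (int i = 7; i >= 0; i--) {
        if (irr[i] == 0)
            continue;
        /* Highest set bit in this 32-bit chunk. */
        int bit = 31 - __builtin_clz(irr[i]);
        int vector = i * 32 + bit;
        return (PRIO(vector) > PRIO(ppr) ? vector : -1);
    }
    return (-1);
}

int
main(void)
{
    uint32_t irr[8] = { 0 };

    irr[0x45 / 32] |= 1u << (0x45 % 32);    /* mark vector 0x45 pending */
    printf("ppr 0x30: %d\n", irr_pending(irr, 0x30)); /* -> 69 (0x45) */
    printf("ppr 0x50: %d\n", irr_pending(irr, 0x50)); /* -> -1 */
    return (0);
}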
+ */ +void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum); + +void vlapic_fire_cmci(struct vlapic *vlapic); +int vlapic_trigger_lvt(struct vlapic *vlapic, int vector); + +uint64_t vlapic_get_apicbase(struct vlapic *vlapic); +int vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val); +void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state s); +bool vlapic_enabled(struct vlapic *vlapic); + +void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, + int delmode, int vec); + +void vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, + bool lowprio, bool x2apic_dest); + +void vlapic_tmr_update(struct vlapic *vlapic); +void vlapic_tmr_set(struct vlapic *vlapic, uint8_t vector, bool active); + +void vlapic_set_cr8(struct vlapic *vlapic, uint64_t val); +uint64_t vlapic_get_cr8(struct vlapic *vlapic); + +/* APIC write handlers */ +void vlapic_id_write_handler(struct vlapic *vlapic); +void vlapic_ldr_write_handler(struct vlapic *vlapic); +void vlapic_dfr_write_handler(struct vlapic *vlapic); +void vlapic_svr_write_handler(struct vlapic *vlapic); +void vlapic_esr_write_handler(struct vlapic *vlapic); +int vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu); +void vlapic_icrtmr_write_handler(struct vlapic *vlapic); +void vlapic_dcr_write_handler(struct vlapic *vlapic); +void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset); +void vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val); + +#ifndef __FreeBSD__ +void vlapic_localize_resources(struct vlapic *vlapic); +#endif + +#endif /* _VLAPIC_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h b/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h new file mode 100644 index 0000000000..5795d48d52 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h @@ -0,0 +1,207 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VLAPIC_PRIV_H_ +#define _VLAPIC_PRIV_H_ + +#include <x86/apicreg.h> + +/* + * APIC Register: Offset Description + */ +#define APIC_OFFSET_ID 0x20 /* Local APIC ID */ +#define APIC_OFFSET_VER 0x30 /* Local APIC Version */ +#define APIC_OFFSET_TPR 0x80 /* Task Priority Register */ +#define APIC_OFFSET_APR 0x90 /* Arbitration Priority */ +#define APIC_OFFSET_PPR 0xA0 /* Processor Priority Register */ +#define APIC_OFFSET_EOI 0xB0 /* EOI Register */ +#define APIC_OFFSET_RRR 0xC0 /* Remote read */ +#define APIC_OFFSET_LDR 0xD0 /* Logical Destination */ +#define APIC_OFFSET_DFR 0xE0 /* Destination Format Register */ +#define APIC_OFFSET_SVR 0xF0 /* Spurious Vector Register */ +#define APIC_OFFSET_ISR0 0x100 /* In Service Register */ +#define APIC_OFFSET_ISR1 0x110 +#define APIC_OFFSET_ISR2 0x120 +#define APIC_OFFSET_ISR3 0x130 +#define APIC_OFFSET_ISR4 0x140 +#define APIC_OFFSET_ISR5 0x150 +#define APIC_OFFSET_ISR6 0x160 +#define APIC_OFFSET_ISR7 0x170 +#define APIC_OFFSET_TMR0 0x180 /* Trigger Mode Register */ +#define APIC_OFFSET_TMR1 0x190 +#define APIC_OFFSET_TMR2 0x1A0 +#define APIC_OFFSET_TMR3 0x1B0 +#define APIC_OFFSET_TMR4 0x1C0 +#define APIC_OFFSET_TMR5 0x1D0 +#define APIC_OFFSET_TMR6 0x1E0 +#define APIC_OFFSET_TMR7 0x1F0 +#define APIC_OFFSET_IRR0 0x200 /* Interrupt Request Register */ +#define APIC_OFFSET_IRR1 0x210 +#define APIC_OFFSET_IRR2 0x220 +#define APIC_OFFSET_IRR3 0x230 +#define APIC_OFFSET_IRR4 0x240 +#define APIC_OFFSET_IRR5 0x250 +#define APIC_OFFSET_IRR6 0x260 +#define APIC_OFFSET_IRR7 0x270 +#define APIC_OFFSET_ESR 0x280 /* Error Status Register */ +#define APIC_OFFSET_CMCI_LVT 0x2F0 /* Local Vector Table (CMCI) */ +#define APIC_OFFSET_ICR_LOW 0x300 /* Interrupt Command Register */ +#define APIC_OFFSET_ICR_HI 0x310 +#define APIC_OFFSET_TIMER_LVT 0x320 /* Local Vector Table (Timer) */ +#define APIC_OFFSET_THERM_LVT 0x330 /* Local Vector Table (Thermal) */ +#define APIC_OFFSET_PERF_LVT 0x340 /* Local Vector Table (PMC) */ +#define APIC_OFFSET_LINT0_LVT 0x350 /* Local Vector Table (LINT0) */ +#define APIC_OFFSET_LINT1_LVT 0x360 /* Local Vector Table (LINT1) */ +#define APIC_OFFSET_ERROR_LVT 0x370 /* Local Vector Table (ERROR) */ +#define APIC_OFFSET_TIMER_ICR 0x380 /* Timer's Initial Count */ +#define APIC_OFFSET_TIMER_CCR 0x390 /* Timer's Current Count */ +#define APIC_OFFSET_TIMER_DCR 0x3E0 /* Timer's Divide Configuration */ +#define APIC_OFFSET_SELF_IPI 0x3F0 /* Self IPI register */ + +#define VLAPIC_CTR0(vlapic, format) \ + VCPU_CTR0((vlapic)->vm, (vlapic)->vcpuid, format) + +#define VLAPIC_CTR1(vlapic, format, p1) \ + VCPU_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1) + +#define VLAPIC_CTR2(vlapic, format, p1, p2) \ + VCPU_CTR2((vlapic)->vm, (vlapic)->vcpuid, format, p1, p2) + +#define VLAPIC_CTR3(vlapic, format, p1, p2, p3) \ + VCPU_CTR3((vlapic)->vm, (vlapic)->vcpuid, format, p1, p2, p3) + +#define VLAPIC_CTR_IRR(vlapic, msg) \ +do { \ + uint32_t *irrptr = &(vlapic)->apic_page->irr0; \ + irrptr[0] = irrptr[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", irrptr[7 << 2]); \ 
+} while (0) + +#define VLAPIC_CTR_ISR(vlapic, msg) \ +do { \ + uint32_t *isrptr = &(vlapic)->apic_page->isr0; \ + isrptr[0] = isrptr[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \ +} while (0) + +enum boot_state { + BS_INIT, + BS_SIPI, + BS_RUNNING +}; + +/* + * 16 priority levels with at most one vector injected per level. + */ +#define ISRVEC_STK_SIZE (16 + 1) + +#define VLAPIC_MAXLVT_INDEX APIC_LVT_CMCI + +#define VLAPIC_TMR_CNT 8 + +struct vlapic; + +struct vlapic_ops { + int (*set_intr_ready)(struct vlapic *vlapic, int vector, bool level); + int (*pending_intr)(struct vlapic *vlapic, int *vecptr); + void (*intr_accepted)(struct vlapic *vlapic, int vector); + void (*post_intr)(struct vlapic *vlapic, int hostcpu); + void (*set_tmr)(struct vlapic *vlapic, const uint32_t *result); + void (*enable_x2apic_mode)(struct vlapic *vlapic); +}; + +struct vlapic { + struct vm *vm; + int vcpuid; + struct LAPIC *apic_page; + struct vlapic_ops ops; + + uint32_t esr_pending; + uint32_t tmr_pending; + + struct callout callout; /* vlapic timer */ + struct bintime timer_fire_bt; /* callout expiry time */ + struct bintime timer_freq_bt; /* timer frequency */ + struct bintime timer_period_bt; /* timer period */ + struct mtx timer_mtx; + + /* + * The 'isrvec_stk' is a stack of vectors injected by the local apic. + * A vector is popped from the stack when the processor does an EOI. + * The vector on the top of the stack is used to compute the + * Processor Priority in conjunction with the TPR. + */ + uint8_t isrvec_stk[ISRVEC_STK_SIZE]; + int isrvec_stk_top; + + uint64_t msr_apicbase; + enum boot_state boot_state; + + /* + * Copies of some registers in the virtual APIC page. We do this for + * a couple of different reasons: + * - to be able to detect what changed (e.g. svr_last) + * - to maintain a coherent snapshot of the register (e.g. lvt_last) + */ + uint32_t svr_last; + uint32_t lvt_last[VLAPIC_MAXLVT_INDEX + 1]; + + /* + * Store intended modifications to the trigger-mode register state. + * Along with the tmr_pending counter above, these are protected by the + * vIOAPIC lock and can only be modified under specific conditions: + * + * 1. When holding the vIOAPIC lock, and the vCPU to which the vLAPIC + * belongs is prevented from entering the VCPU_RUNNING state. + * 2. When the owning vCPU is in the VCPU_RUNNING state, and is + * applying the TMR modifications prior to interrupt injection. + */ + uint32_t tmr_vec_deassert[VLAPIC_TMR_CNT]; + uint32_t tmr_vec_assert[VLAPIC_TMR_CNT]; +}; + +void vlapic_init(struct vlapic *vlapic); +void vlapic_cleanup(struct vlapic *vlapic); + +#endif /* _VLAPIC_PRIV_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vpmtmr.c b/usr/src/uts/i86pc/io/vmm/io/vpmtmr.c new file mode 100644 index 0000000000..4df909777d --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vpmtmr.c @@ -0,0 +1,105 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014, Neel Natu (neel@freebsd.org) + * All rights reserved. 
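The isrvec_stk comment above refers to the architectural rule for the Processor Priority Register: PPR equals the TPR when the TPR's priority class is at least that of the highest in-service vector, and otherwise equals the in-service vector's class with the sub-class bits cleared (Intel SDM). A small sketch of that rule, independent of this patch:

#include <stdint.h>
#include <stdio.h>

/*
 * Architectural PPR computation:
 *   if TPR[7:4] >= ISRV[7:4]  -> PPR = TPR
 *   otherwise                 -> PPR = ISRV & 0xf0
 * where ISRV is the highest vector currently in service (0 if none).
 */
static uint8_t
apic_ppr(uint8_t tpr, uint8_t isrv)
{
    if ((tpr & 0xf0) >= (isrv & 0xf0))
        return (tpr);
    return (isrv & 0xf0);
}

int
main(void)
{
    printf("%#x\n", apic_ppr(0x20, 0x45)); /* 0x40: in-service class wins */
    printf("%#x\n", apic_ppr(0x62, 0x45)); /* 0x62: TPR wins */
    return (0);
}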
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/queue.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/systm.h> + +#include <machine/vmm.h> + +#include "vpmtmr.h" + +/* + * The ACPI Power Management timer is a free-running 24- or 32-bit + * timer with a frequency of 3.579545MHz + * + * This implementation will be 32-bits + */ + +#define PMTMR_FREQ 3579545 /* 3.579545MHz */ + +struct vpmtmr { + sbintime_t freq_sbt; + sbintime_t baseuptime; + uint32_t baseval; +}; + +static MALLOC_DEFINE(M_VPMTMR, "vpmtmr", "bhyve virtual acpi timer"); + +struct vpmtmr * +vpmtmr_init(struct vm *vm) +{ + struct vpmtmr *vpmtmr; + struct bintime bt; + + vpmtmr = malloc(sizeof(struct vpmtmr), M_VPMTMR, M_WAITOK | M_ZERO); + vpmtmr->baseuptime = sbinuptime(); + vpmtmr->baseval = 0; + + FREQ2BT(PMTMR_FREQ, &bt); + vpmtmr->freq_sbt = bttosbt(bt); + + return (vpmtmr); +} + +void +vpmtmr_cleanup(struct vpmtmr *vpmtmr) +{ + + free(vpmtmr, M_VPMTMR); +} + +int +vpmtmr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val) +{ + struct vpmtmr *vpmtmr; + sbintime_t now, delta; + + if (!in || bytes != 4) + return (-1); + + vpmtmr = vm_pmtmr(vm); + + /* + * No locking needed because 'baseuptime' and 'baseval' are + * written only during initialization. + */ + now = sbinuptime(); + delta = now - vpmtmr->baseuptime; + KASSERT(delta >= 0, ("vpmtmr_handler: uptime went backwards: " + "%#lx to %#lx", vpmtmr->baseuptime, now)); + *val = vpmtmr->baseval + delta / vpmtmr->freq_sbt; + + return (0); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/vpmtmr.h b/usr/src/uts/i86pc/io/vmm/io/vpmtmr.h new file mode 100644 index 0000000000..e6562da5c0 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vpmtmr.h @@ -0,0 +1,44 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VPMTMR_H_ +#define _VPMTMR_H_ + +#define IO_PMTMR 0x408 + +struct vpmtmr; + +struct vpmtmr *vpmtmr_init(struct vm *vm); +void vpmtmr_cleanup(struct vpmtmr *pmtmr); + +int vpmtmr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val); + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/io/vrtc.c b/usr/src/uts/i86pc/io/vmm/io/vrtc.c new file mode 100644 index 0000000000..f12d22fc26 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vrtc.c @@ -0,0 +1,1061 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014, Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright 2018 Joyent, Inc. 
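vpmtmr_handler() above derives the free-running ACPI PM timer value by scaling elapsed uptime by the fixed 3.579545 MHz rate. A userland sketch of the same arithmetic against CLOCK_MONOTONIC; the split multiply avoids 64-bit overflow for long uptimes, and the names are illustrative rather than taken from the patch:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define PMTMR_FREQ  3579545u        /* Hz */
#define NS_PER_SEC  1000000000ull

static uint64_t
now_ns(void)
{
    struct timespec ts;

    (void) clock_gettime(CLOCK_MONOTONIC, &ts);
    return ((uint64_t)ts.tv_sec * NS_PER_SEC + ts.tv_nsec);
}

/* Free-running counter truncated to 32 bits, like the emulated register. */
static uint32_t
pmtmr_read(uint64_t base_ns)
{
    uint64_t delta = now_ns() - base_ns;
    uint64_t ticks;

    /* Split the scaling so the intermediate product cannot overflow. */
    ticks = (delta / NS_PER_SEC) * PMTMR_FREQ +
        (delta % NS_PER_SEC) * PMTMR_FREQ / NS_PER_SEC;

    return ((uint32_t)ticks);
}

int
main(void)
{
    uint64_t base = now_ns();

    printf("PM timer now: %u\n", pmtmr_read(base));
    return (0);
}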
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/queue.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/clock.h> +#include <sys/sysctl.h> + +#include <machine/vmm.h> + +#include <isa/rtc.h> + +#include "vmm_ktr.h" +#include "vatpic.h" +#include "vioapic.h" +#include "vrtc.h" + +/* Register layout of the RTC */ +struct rtcdev { + uint8_t sec; + uint8_t alarm_sec; + uint8_t min; + uint8_t alarm_min; + uint8_t hour; + uint8_t alarm_hour; + uint8_t day_of_week; + uint8_t day_of_month; + uint8_t month; + uint8_t year; + uint8_t reg_a; + uint8_t reg_b; + uint8_t reg_c; + uint8_t reg_d; + uint8_t nvram[36]; + uint8_t century; + uint8_t nvram2[128 - 51]; +} __packed; +CTASSERT(sizeof(struct rtcdev) == 128); +CTASSERT(offsetof(struct rtcdev, century) == RTC_CENTURY); + +struct vrtc { + struct vm *vm; + struct mtx mtx; + struct callout callout; + u_int addr; /* RTC register to read or write */ + sbintime_t base_uptime; + time_t base_rtctime; + struct rtcdev rtcdev; +}; + +#define VRTC_LOCK(vrtc) mtx_lock(&((vrtc)->mtx)) +#define VRTC_UNLOCK(vrtc) mtx_unlock(&((vrtc)->mtx)) +#define VRTC_LOCKED(vrtc) mtx_owned(&((vrtc)->mtx)) + +/* + * RTC time is considered "broken" if: + * - RTC updates are halted by the guest + * - RTC date/time fields have invalid values + */ +#define VRTC_BROKEN_TIME ((time_t)-1) + +#define RTC_IRQ 8 +#define RTCSB_BIN 0x04 +#define RTCSB_ALL_INTRS (RTCSB_UINTR | RTCSB_AINTR | RTCSB_PINTR) +#define rtc_halted(vrtc) ((vrtc->rtcdev.reg_b & RTCSB_HALT) != 0) +#define aintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_AINTR) != 0) +#define pintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_PINTR) != 0) +#define uintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_UINTR) != 0) + +static void vrtc_callout_handler(void *arg); +static void vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval); + +static MALLOC_DEFINE(M_VRTC, "vrtc", "bhyve virtual rtc"); + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, vrtc, CTLFLAG_RW, NULL, NULL); + +static int rtc_flag_broken_time = 1; +SYSCTL_INT(_hw_vmm_vrtc, OID_AUTO, flag_broken_time, CTLFLAG_RDTUN, + &rtc_flag_broken_time, 0, "Stop guest when invalid RTC time is detected"); + +static __inline bool +divider_enabled(int reg_a) +{ + /* + * The RTC is counting only when dividers are not held in reset. 
+ */ + return ((reg_a & 0x70) == 0x20); +} + +static __inline bool +update_enabled(struct vrtc *vrtc) +{ + /* + * RTC date/time can be updated only if: + * - divider is not held in reset + * - guest has not disabled updates + * - the date/time fields have valid contents + */ + if (!divider_enabled(vrtc->rtcdev.reg_a)) + return (false); + + if (rtc_halted(vrtc)) + return (false); + + if (vrtc->base_rtctime == VRTC_BROKEN_TIME) + return (false); + + return (true); +} + +static time_t +vrtc_curtime(struct vrtc *vrtc, sbintime_t *basetime) +{ + sbintime_t now, delta; + time_t t, secs; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + t = vrtc->base_rtctime; + *basetime = vrtc->base_uptime; + if (update_enabled(vrtc)) { + now = sbinuptime(); + delta = now - vrtc->base_uptime; + KASSERT(delta >= 0, ("vrtc_curtime: uptime went backwards: " + "%#lx to %#lx", vrtc->base_uptime, now)); + secs = delta / SBT_1S; + t += secs; + *basetime += secs * SBT_1S; + } + return (t); +} + +static __inline uint8_t +rtcset(struct rtcdev *rtc, int val) +{ + + KASSERT(val >= 0 && val < 100, ("%s: invalid bin2bcd index %d", + __func__, val)); + + return ((rtc->reg_b & RTCSB_BIN) ? val : bin2bcd_data[val]); +} + +static void +secs_to_rtc(time_t rtctime, struct vrtc *vrtc, int force_update) +{ + struct clocktime ct; + struct timespec ts; + struct rtcdev *rtc; + int hour; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + if (rtctime < 0) { + KASSERT(rtctime == VRTC_BROKEN_TIME, + ("%s: invalid vrtc time %#lx", __func__, rtctime)); + return; + } + + /* + * If the RTC is halted then the guest has "ownership" of the + * date/time fields. Don't update the RTC date/time fields in + * this case (unless forced). + */ + if (rtc_halted(vrtc) && !force_update) + return; + + ts.tv_sec = rtctime; + ts.tv_nsec = 0; + clock_ts_to_ct(&ts, &ct); + + KASSERT(ct.sec >= 0 && ct.sec <= 59, ("invalid clocktime sec %d", + ct.sec)); + KASSERT(ct.min >= 0 && ct.min <= 59, ("invalid clocktime min %d", + ct.min)); + KASSERT(ct.hour >= 0 && ct.hour <= 23, ("invalid clocktime hour %d", + ct.hour)); + KASSERT(ct.dow >= 0 && ct.dow <= 6, ("invalid clocktime wday %d", + ct.dow)); + KASSERT(ct.day >= 1 && ct.day <= 31, ("invalid clocktime mday %d", + ct.day)); + KASSERT(ct.mon >= 1 && ct.mon <= 12, ("invalid clocktime month %d", + ct.mon)); + KASSERT(ct.year >= POSIX_BASE_YEAR, ("invalid clocktime year %d", + ct.year)); + + rtc = &vrtc->rtcdev; + rtc->sec = rtcset(rtc, ct.sec); + rtc->min = rtcset(rtc, ct.min); + + if (rtc->reg_b & RTCSB_24HR) { + hour = ct.hour; + } else { + /* + * Convert to the 12-hour format. 
+ */ + switch (ct.hour) { + case 0: /* 12 AM */ + case 12: /* 12 PM */ + hour = 12; + break; + default: + /* + * The remaining 'ct.hour' values are interpreted as: + * [1 - 11] -> 1 - 11 AM + * [13 - 23] -> 1 - 11 PM + */ + hour = ct.hour % 12; + break; + } + } + + rtc->hour = rtcset(rtc, hour); + + if ((rtc->reg_b & RTCSB_24HR) == 0 && ct.hour >= 12) + rtc->hour |= 0x80; /* set MSB to indicate PM */ + + rtc->day_of_week = rtcset(rtc, ct.dow + 1); + rtc->day_of_month = rtcset(rtc, ct.day); + rtc->month = rtcset(rtc, ct.mon); + rtc->year = rtcset(rtc, ct.year % 100); + rtc->century = rtcset(rtc, ct.year / 100); +} + +static int +rtcget(struct rtcdev *rtc, int val, int *retval) +{ + uint8_t upper, lower; + + if (rtc->reg_b & RTCSB_BIN) { + *retval = val; + return (0); + } + + lower = val & 0xf; + upper = (val >> 4) & 0xf; + + if (lower > 9 || upper > 9) + return (-1); + + *retval = upper * 10 + lower; + return (0); +} + +static time_t +rtc_to_secs(struct vrtc *vrtc) +{ + struct clocktime ct; + struct timespec ts; + struct rtcdev *rtc; +#ifdef __FreeBSD__ + struct vm *vm; +#endif + int century, error, hour, pm, year; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + +#ifdef __FreeBSD__ + vm = vrtc->vm; +#endif + rtc = &vrtc->rtcdev; + + bzero(&ct, sizeof(struct clocktime)); + + error = rtcget(rtc, rtc->sec, &ct.sec); + if (error || ct.sec < 0 || ct.sec > 59) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC sec %#x/%d", rtc->sec, ct.sec); +#endif + goto fail; + } + + error = rtcget(rtc, rtc->min, &ct.min); + if (error || ct.min < 0 || ct.min > 59) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC min %#x/%d", rtc->min, ct.min); +#endif + goto fail; + } + + pm = 0; + hour = rtc->hour; + if ((rtc->reg_b & RTCSB_24HR) == 0) { + if (hour & 0x80) { + hour &= ~0x80; + pm = 1; + } + } + error = rtcget(rtc, hour, &ct.hour); + if ((rtc->reg_b & RTCSB_24HR) == 0) { + if (ct.hour >= 1 && ct.hour <= 12) { + /* + * Convert from 12-hour format to internal 24-hour + * representation as follows: + * + * 12-hour format ct.hour + * 12 AM 0 + * 1 - 11 AM 1 - 11 + * 12 PM 12 + * 1 - 11 PM 13 - 23 + */ + if (ct.hour == 12) + ct.hour = 0; + if (pm) + ct.hour += 12; + } else { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC 12-hour format %#x/%d", + rtc->hour, ct.hour); +#endif + goto fail; + } + } + + if (error || ct.hour < 0 || ct.hour > 23) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC hour %#x/%d", rtc->hour, ct.hour); +#endif + goto fail; + } + + /* + * Ignore 'rtc->dow' because some guests like Linux don't bother + * setting it at all while others like OpenBSD/i386 set it incorrectly. + * + * clock_ct_to_ts() does not depend on 'ct.dow' anyways so ignore it. 
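rtcset()/rtcget() above convert between binary and BCD register encodings depending on RTCSB_BIN, and in 12-hour mode the hour byte additionally carries an AM/PM flag in bit 7, with 12 standing in for both midnight and noon. A compact standalone sketch of those encodings, using plain arithmetic instead of the kernel's bin2bcd_data lookup table:

#include <stdint.h>
#include <stdio.h>

/* Binary <-> BCD, valid for values 0..99. */
static uint8_t
bin2bcd(uint8_t v)
{
    return ((uint8_t)(((v / 10) << 4) | (v % 10)));
}

static int
bcd2bin(uint8_t v)
{
    uint8_t lo = v & 0xf, hi = v >> 4;

    if (lo > 9 || hi > 9)
        return (-1);    /* malformed BCD, as rtcget() rejects */
    return (hi * 10 + lo);
}

/*
 * Encode an hour (0-23) the way a 12-hour-mode RTC does: hours 1-12 in BCD,
 * with bit 7 set for PM and 12 used for both midnight and noon.
 */
static uint8_t
hour_to_12hr(int hour)
{
    int pm = (hour >= 12);
    int h12 = hour % 12;

    if (h12 == 0)
        h12 = 12;
    return ((uint8_t)(bin2bcd((uint8_t)h12) | (pm ? 0x80 : 0)));
}

int
main(void)
{
    printf("59 -> BCD %#x -> %d\n", bin2bcd(59), bcd2bin(bin2bcd(59)));
    printf("23:00 -> %#x (11 PM)\n", hour_to_12hr(23)); /* 0x91 */
    printf("00:00 -> %#x (12 AM)\n", hour_to_12hr(0));  /* 0x12 */
    return (0);
}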
+ */ + ct.dow = -1; + + error = rtcget(rtc, rtc->day_of_month, &ct.day); + if (error || ct.day < 1 || ct.day > 31) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC mday %#x/%d", rtc->day_of_month, + ct.day); +#endif + goto fail; + } + + error = rtcget(rtc, rtc->month, &ct.mon); + if (error || ct.mon < 1 || ct.mon > 12) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC month %#x/%d", rtc->month, ct.mon); +#endif + goto fail; + } + + error = rtcget(rtc, rtc->year, &year); + if (error || year < 0 || year > 99) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC year %#x/%d", rtc->year, year); +#endif + goto fail; + } + + error = rtcget(rtc, rtc->century, ¢ury); + ct.year = century * 100 + year; + if (error || ct.year < POSIX_BASE_YEAR) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC century %#x/%d", rtc->century, + ct.year); +#endif + goto fail; + } + + error = clock_ct_to_ts(&ct, &ts); + if (error || ts.tv_sec < 0) { +#ifdef __FreeBSD__ + VM_CTR3(vm, "Invalid RTC clocktime.date %04d-%02d-%02d", + ct.year, ct.mon, ct.day); + VM_CTR3(vm, "Invalid RTC clocktime.time %02d:%02d:%02d", + ct.hour, ct.min, ct.sec); +#endif + goto fail; + } + return (ts.tv_sec); /* success */ +fail: + /* + * Stop updating the RTC if the date/time fields programmed by + * the guest are invalid. + */ +#ifdef __FreeBSD__ + VM_CTR0(vrtc->vm, "Invalid RTC date/time programming detected"); +#endif + return (VRTC_BROKEN_TIME); +} + +static int +vrtc_time_update(struct vrtc *vrtc, time_t newtime, sbintime_t newbase) +{ + struct rtcdev *rtc; +#ifdef __FreeBSD__ + sbintime_t oldbase; +#endif + time_t oldtime; + uint8_t alarm_sec, alarm_min, alarm_hour; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + rtc = &vrtc->rtcdev; + alarm_sec = rtc->alarm_sec; + alarm_min = rtc->alarm_min; + alarm_hour = rtc->alarm_hour; + + oldtime = vrtc->base_rtctime; + VM_CTR2(vrtc->vm, "Updating RTC secs from %#lx to %#lx", + oldtime, newtime); + +#ifdef __FreeBSD__ + oldbase = vrtc->base_uptime; + VM_CTR2(vrtc->vm, "Updating RTC base uptime from %#lx to %#lx", + oldbase, newbase); +#endif + vrtc->base_uptime = newbase; + + if (newtime == oldtime) + return (0); + + /* + * If 'newtime' indicates that RTC updates are disabled then just + * record that and return. There is no need to do alarm interrupt + * processing in this case. + */ + if (newtime == VRTC_BROKEN_TIME) { + vrtc->base_rtctime = VRTC_BROKEN_TIME; + return (0); + } + + /* + * Return an error if RTC updates are halted by the guest. + */ + if (rtc_halted(vrtc)) { + VM_CTR0(vrtc->vm, "RTC update halted by guest"); + return (EBUSY); + } + + do { + /* + * If the alarm interrupt is enabled and 'oldtime' is valid + * then visit all the seconds between 'oldtime' and 'newtime' + * to check for the alarm condition. + * + * Otherwise move the RTC time forward directly to 'newtime'. + */ + if (aintr_enabled(vrtc) && oldtime != VRTC_BROKEN_TIME) + vrtc->base_rtctime++; + else + vrtc->base_rtctime = newtime; + + if (aintr_enabled(vrtc)) { + /* + * Update the RTC date/time fields before checking + * if the alarm conditions are satisfied. 
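The alarm test above relies on the MC146818 convention that an alarm byte with the two top bits set (0xc0 or above) is a wildcard, so the alarm fires when every non-wildcard field equals the corresponding time field. A small sketch of that predicate (field values shown in BCD, as the RTC stores them by default):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* An alarm byte >= 0xc0 matches any value ("don't care"). */
static bool
alarm_field_match(uint8_t alarm, uint8_t cur)
{
    return (alarm >= 0xc0 || alarm == cur);
}

static bool
alarm_fires(uint8_t a_sec, uint8_t a_min, uint8_t a_hour,
    uint8_t sec, uint8_t min, uint8_t hour)
{
    return (alarm_field_match(a_sec, sec) &&
        alarm_field_match(a_min, min) &&
        alarm_field_match(a_hour, hour));
}

int
main(void)
{
    /* Fire at second 30 of every minute of every hour. */
    printf("%d\n", alarm_fires(0x30, 0xff, 0xff, 0x30, 0x15, 0x07)); /* 1 */
    printf("%d\n", alarm_fires(0x30, 0xff, 0xff, 0x31, 0x15, 0x07)); /* 0 */
    return (0);
}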
+ */ + secs_to_rtc(vrtc->base_rtctime, vrtc, 0); + + if ((alarm_sec >= 0xC0 || alarm_sec == rtc->sec) && + (alarm_min >= 0xC0 || alarm_min == rtc->min) && + (alarm_hour >= 0xC0 || alarm_hour == rtc->hour)) { + vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_ALARM); + } + } + } while (vrtc->base_rtctime != newtime); + + if (uintr_enabled(vrtc)) + vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_UPDATE); + + return (0); +} + +static sbintime_t +vrtc_freq(struct vrtc *vrtc) +{ + int ratesel; + + static sbintime_t pf[16] = { + 0, + SBT_1S / 256, + SBT_1S / 128, + SBT_1S / 8192, + SBT_1S / 4096, + SBT_1S / 2048, + SBT_1S / 1024, + SBT_1S / 512, + SBT_1S / 256, + SBT_1S / 128, + SBT_1S / 64, + SBT_1S / 32, + SBT_1S / 16, + SBT_1S / 8, + SBT_1S / 4, + SBT_1S / 2, + }; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + /* + * If both periodic and alarm interrupts are enabled then use the + * periodic frequency to drive the callout. The minimum periodic + * frequency (2 Hz) is higher than the alarm frequency (1 Hz) so + * piggyback the alarm on top of it. The same argument applies to + * the update interrupt. + */ + if (pintr_enabled(vrtc) && divider_enabled(vrtc->rtcdev.reg_a)) { + ratesel = vrtc->rtcdev.reg_a & 0xf; + return (pf[ratesel]); + } else if (aintr_enabled(vrtc) && update_enabled(vrtc)) { + return (SBT_1S); + } else if (uintr_enabled(vrtc) && update_enabled(vrtc)) { + return (SBT_1S); + } else { + return (0); + } +} + +static void +vrtc_callout_reset(struct vrtc *vrtc, sbintime_t freqsbt) +{ + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + if (freqsbt == 0) { + if (callout_active(&vrtc->callout)) { + VM_CTR0(vrtc->vm, "RTC callout stopped"); + callout_stop(&vrtc->callout); + } + return; + } + VM_CTR1(vrtc->vm, "RTC callout frequency %d hz", SBT_1S / freqsbt); + callout_reset_sbt(&vrtc->callout, freqsbt, 0, vrtc_callout_handler, + vrtc, 0); +} + +static void +vrtc_callout_handler(void *arg) +{ + struct vrtc *vrtc = arg; + sbintime_t freqsbt, basetime; + time_t rtctime; + int error; + + VM_CTR0(vrtc->vm, "vrtc callout fired"); + + VRTC_LOCK(vrtc); + if (callout_pending(&vrtc->callout)) /* callout was reset */ + goto done; + + if (!callout_active(&vrtc->callout)) /* callout was stopped */ + goto done; + + callout_deactivate(&vrtc->callout); + + KASSERT((vrtc->rtcdev.reg_b & RTCSB_ALL_INTRS) != 0, + ("gratuitous vrtc callout")); + + if (pintr_enabled(vrtc)) + vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c | RTCIR_PERIOD); + + if (aintr_enabled(vrtc) || uintr_enabled(vrtc)) { + rtctime = vrtc_curtime(vrtc, &basetime); + error = vrtc_time_update(vrtc, rtctime, basetime); + KASSERT(error == 0, ("%s: vrtc_time_update error %d", + __func__, error)); + } + + freqsbt = vrtc_freq(vrtc); + KASSERT(freqsbt != 0, ("%s: vrtc frequency cannot be zero", __func__)); + vrtc_callout_reset(vrtc, freqsbt); +done: + VRTC_UNLOCK(vrtc); +} + +static __inline void +vrtc_callout_check(struct vrtc *vrtc, sbintime_t freq) +{ + int active; + + active = callout_active(&vrtc->callout) ? 1 : 0; + KASSERT((freq == 0 && !active) || (freq != 0 && active), + ("vrtc callout %s with frequency %#lx", + active ? 
"active" : "inactive", freq)); +} + +static void +vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval) +{ + struct rtcdev *rtc; + int oldirqf, newirqf; + uint8_t oldval, changed; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + rtc = &vrtc->rtcdev; + newval &= RTCIR_ALARM | RTCIR_PERIOD | RTCIR_UPDATE; + + oldirqf = rtc->reg_c & RTCIR_INT; + if ((aintr_enabled(vrtc) && (newval & RTCIR_ALARM) != 0) || + (pintr_enabled(vrtc) && (newval & RTCIR_PERIOD) != 0) || + (uintr_enabled(vrtc) && (newval & RTCIR_UPDATE) != 0)) { + newirqf = RTCIR_INT; + } else { + newirqf = 0; + } + + oldval = rtc->reg_c; + rtc->reg_c = newirqf | newval; + changed = oldval ^ rtc->reg_c; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_c changed from %#x to %#x", + oldval, rtc->reg_c); + } + + if (!oldirqf && newirqf) { + VM_CTR1(vrtc->vm, "RTC irq %d asserted", RTC_IRQ); + vatpic_pulse_irq(vrtc->vm, RTC_IRQ); + vioapic_pulse_irq(vrtc->vm, RTC_IRQ); + } else if (oldirqf && !newirqf) { + VM_CTR1(vrtc->vm, "RTC irq %d deasserted", RTC_IRQ); + } +} + +static int +vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval) +{ + struct rtcdev *rtc; + sbintime_t oldfreq, newfreq, basetime; + time_t curtime, rtctime; + int error; + uint8_t oldval, changed; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + rtc = &vrtc->rtcdev; + oldval = rtc->reg_b; + oldfreq = vrtc_freq(vrtc); + + rtc->reg_b = newval; + changed = oldval ^ newval; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_b changed from %#x to %#x", + oldval, newval); + } + + if (changed & RTCSB_HALT) { + if ((newval & RTCSB_HALT) == 0) { + rtctime = rtc_to_secs(vrtc); + basetime = sbinuptime(); + if (rtctime == VRTC_BROKEN_TIME) { + if (rtc_flag_broken_time) + return (-1); + } + } else { + curtime = vrtc_curtime(vrtc, &basetime); + KASSERT(curtime == vrtc->base_rtctime, ("%s: mismatch " + "between vrtc basetime (%#lx) and curtime (%#lx)", + __func__, vrtc->base_rtctime, curtime)); + + /* + * Force a refresh of the RTC date/time fields so + * they reflect the time right before the guest set + * the HALT bit. + */ + secs_to_rtc(curtime, vrtc, 1); + + /* + * Updates are halted so mark 'base_rtctime' to denote + * that the RTC date/time is in flux. + */ + rtctime = VRTC_BROKEN_TIME; + rtc->reg_b &= ~RTCSB_UINTR; + } + error = vrtc_time_update(vrtc, rtctime, basetime); + KASSERT(error == 0, ("vrtc_time_update error %d", error)); + } + + /* + * Side effect of changes to the interrupt enable bits. + */ + if (changed & RTCSB_ALL_INTRS) + vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c); + + /* + * Change the callout frequency if it has changed. + */ + newfreq = vrtc_freq(vrtc); + if (newfreq != oldfreq) + vrtc_callout_reset(vrtc, newfreq); + else + vrtc_callout_check(vrtc, newfreq); + + /* + * The side effect of bits that control the RTC date/time format + * is handled lazily when those fields are actually read. + */ + return (0); +} + +static void +vrtc_set_reg_a(struct vrtc *vrtc, uint8_t newval) +{ + sbintime_t oldfreq, newfreq; + uint8_t oldval, changed; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + newval &= ~RTCSA_TUP; + oldval = vrtc->rtcdev.reg_a; + oldfreq = vrtc_freq(vrtc); + + if (divider_enabled(oldval) && !divider_enabled(newval)) { + VM_CTR2(vrtc->vm, "RTC divider held in reset at %#lx/%#lx", + vrtc->base_rtctime, vrtc->base_uptime); + } else if (!divider_enabled(oldval) && divider_enabled(newval)) { + /* + * If the dividers are coming out of reset then update + * 'base_uptime' before this happens. 
This is done to + * maintain the illusion that the RTC date/time was frozen + * while the dividers were disabled. + */ + vrtc->base_uptime = sbinuptime(); + VM_CTR2(vrtc->vm, "RTC divider out of reset at %#lx/%#lx", + vrtc->base_rtctime, vrtc->base_uptime); + } else { + /* NOTHING */ + } + + vrtc->rtcdev.reg_a = newval; + changed = oldval ^ newval; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_a changed from %#x to %#x", + oldval, newval); + } + + /* + * Side effect of changes to rate select and divider enable bits. + */ + newfreq = vrtc_freq(vrtc); + if (newfreq != oldfreq) + vrtc_callout_reset(vrtc, newfreq); + else + vrtc_callout_check(vrtc, newfreq); +} + +int +vrtc_set_time(struct vm *vm, time_t secs) +{ + struct vrtc *vrtc; + int error; + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + error = vrtc_time_update(vrtc, secs, sbinuptime()); + VRTC_UNLOCK(vrtc); + + if (error) { + VM_CTR2(vrtc->vm, "Error %d setting RTC time to %#lx", error, + secs); + } else { + VM_CTR1(vrtc->vm, "RTC time set to %#lx", secs); + } + + return (error); +} + +time_t +vrtc_get_time(struct vm *vm) +{ + struct vrtc *vrtc; + sbintime_t basetime; + time_t t; + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + t = vrtc_curtime(vrtc, &basetime); + VRTC_UNLOCK(vrtc); + + return (t); +} + +int +vrtc_nvram_write(struct vm *vm, int offset, uint8_t value) +{ + struct vrtc *vrtc; + uint8_t *ptr; + + vrtc = vm_rtc(vm); + + /* + * Don't allow writes to RTC control registers or the date/time fields. + */ + if (offset < offsetof(struct rtcdev, nvram[0]) || + offset == RTC_CENTURY || offset >= sizeof(struct rtcdev)) { + VM_CTR1(vrtc->vm, "RTC nvram write to invalid offset %d", + offset); + return (EINVAL); + } + + VRTC_LOCK(vrtc); + ptr = (uint8_t *)(&vrtc->rtcdev); + ptr[offset] = value; + VM_CTR2(vrtc->vm, "RTC nvram write %#x to offset %#x", value, offset); + VRTC_UNLOCK(vrtc); + + return (0); +} + +int +vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval) +{ + struct vrtc *vrtc; + sbintime_t basetime; + time_t curtime; + uint8_t *ptr; + + /* + * Allow all offsets in the RTC to be read. + */ + if (offset < 0 || offset >= sizeof(struct rtcdev)) + return (EINVAL); + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + + /* + * Update RTC date/time fields if necessary. + */ + if (offset < 10 || offset == RTC_CENTURY) { + curtime = vrtc_curtime(vrtc, &basetime); + secs_to_rtc(curtime, vrtc, 0); + } + + ptr = (uint8_t *)(&vrtc->rtcdev); + *retval = ptr[offset]; + + VRTC_UNLOCK(vrtc); + return (0); +} + +int +vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val) +{ + struct vrtc *vrtc; + + vrtc = vm_rtc(vm); + + if (bytes != 1) + return (-1); + + if (in) { + *val = 0xff; + return (0); + } + + VRTC_LOCK(vrtc); + vrtc->addr = *val & 0x7f; + VRTC_UNLOCK(vrtc); + + return (0); +} + +int +vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val) +{ + struct vrtc *vrtc; + struct rtcdev *rtc; + sbintime_t basetime; + time_t curtime; + int error, offset; + + vrtc = vm_rtc(vm); + rtc = &vrtc->rtcdev; + + if (bytes != 1) + return (-1); + + VRTC_LOCK(vrtc); + offset = vrtc->addr; + if (offset >= sizeof(struct rtcdev)) { + VRTC_UNLOCK(vrtc); + return (-1); + } + + error = 0; + curtime = vrtc_curtime(vrtc, &basetime); + vrtc_time_update(vrtc, curtime, basetime); + + /* + * Update RTC date/time fields if necessary. + * + * This is not just for reads of the RTC. The side-effect of writing + * the century byte requires other RTC date/time fields (e.g. sec) + * to be updated here. 
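vrtc_set_reg_a() above, together with divider_enabled() and vrtc_freq() earlier in this file, treats register A as two fields: bits 6:4 are the divider select (0b010 means the clock is counting) and bits 3:0 are the periodic-interrupt rate select, where rates 1 and 2 give 256 Hz and 128 Hz and rates 3 through 15 halve 8192 Hz per step, matching the pf[] table. A standalone sketch of that decoding:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Divider bits 6:4 must be 0b010 for the RTC to count. */
static bool
rtc_divider_running(uint8_t reg_a)
{
    return ((reg_a & 0x70) == 0x20);
}

/* Periodic interrupt frequency in Hz for rate-select bits 3:0 (0 = off). */
static unsigned
rtc_periodic_hz(uint8_t reg_a)
{
    unsigned rate = reg_a & 0x0f;

    if (rate == 0)
        return (0);
    if (rate == 1 || rate == 2)
        return (rate == 1 ? 256 : 128);
    return (65536u >> rate);    /* rate 3 -> 8192 Hz ... rate 15 -> 2 Hz */
}

int
main(void)
{
    uint8_t reg_a = 0x26;   /* divider on, rate 6 */

    printf("running=%d, %u Hz\n", rtc_divider_running(reg_a),
        rtc_periodic_hz(reg_a));    /* running=1, 1024 Hz */
    return (0);
}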
+ */ + if (offset < 10 || offset == RTC_CENTURY) + secs_to_rtc(curtime, vrtc, 0); + + if (in) { + if (offset == 12) { + /* + * XXX + * reg_c interrupt flags are updated only if the + * corresponding interrupt enable bit in reg_b is set. + */ + *val = vrtc->rtcdev.reg_c; + vrtc_set_reg_c(vrtc, 0); + } else { + *val = *((uint8_t *)rtc + offset); + } + VCPU_CTR2(vm, vcpuid, "Read value %#x from RTC offset %#x", + *val, offset); + } else { + switch (offset) { + case 10: + VCPU_CTR1(vm, vcpuid, "RTC reg_a set to %#x", *val); + vrtc_set_reg_a(vrtc, *val); + break; + case 11: + VCPU_CTR1(vm, vcpuid, "RTC reg_b set to %#x", *val); + error = vrtc_set_reg_b(vrtc, *val); + break; + case 12: + VCPU_CTR1(vm, vcpuid, "RTC reg_c set to %#x (ignored)", + *val); + break; + case 13: + VCPU_CTR1(vm, vcpuid, "RTC reg_d set to %#x (ignored)", + *val); + break; + case 0: + /* + * High order bit of 'seconds' is readonly. + */ + *val &= 0x7f; + /* FALLTHRU */ + default: + VCPU_CTR2(vm, vcpuid, "RTC offset %#x set to %#x", + offset, *val); + *((uint8_t *)rtc + offset) = *val; + break; + } + + /* + * XXX some guests (e.g. OpenBSD) write the century byte + * outside of RTCSB_HALT so re-calculate the RTC date/time. + */ + if (offset == RTC_CENTURY && !rtc_halted(vrtc)) { + curtime = rtc_to_secs(vrtc); + error = vrtc_time_update(vrtc, curtime, sbinuptime()); + KASSERT(!error, ("vrtc_time_update error %d", error)); + if (curtime == VRTC_BROKEN_TIME && rtc_flag_broken_time) + error = -1; + } + } + VRTC_UNLOCK(vrtc); + return (error); +} + +void +vrtc_reset(struct vrtc *vrtc) +{ + struct rtcdev *rtc; + + VRTC_LOCK(vrtc); + + rtc = &vrtc->rtcdev; + vrtc_set_reg_b(vrtc, rtc->reg_b & ~(RTCSB_ALL_INTRS | RTCSB_SQWE)); + vrtc_set_reg_c(vrtc, 0); + KASSERT(!callout_active(&vrtc->callout), ("rtc callout still active")); + + VRTC_UNLOCK(vrtc); +} + +struct vrtc * +vrtc_init(struct vm *vm) +{ + struct vrtc *vrtc; + struct rtcdev *rtc; + time_t curtime; + + vrtc = malloc(sizeof(struct vrtc), M_VRTC, M_WAITOK | M_ZERO); + vrtc->vm = vm; + mtx_init(&vrtc->mtx, "vrtc lock", NULL, MTX_DEF); + callout_init(&vrtc->callout, 1); + + /* Allow dividers to keep time but disable everything else */ + rtc = &vrtc->rtcdev; + rtc->reg_a = 0x20; + rtc->reg_b = RTCSB_24HR; + rtc->reg_c = 0; + rtc->reg_d = RTCSD_PWR; + + /* Reset the index register to a safe value. */ + vrtc->addr = RTC_STATUSD; + + /* + * Initialize RTC time to 00:00:00 Jan 1, 1970. + */ + curtime = 0; + + VRTC_LOCK(vrtc); + vrtc->base_rtctime = VRTC_BROKEN_TIME; + vrtc_time_update(vrtc, curtime, sbinuptime()); + secs_to_rtc(curtime, vrtc, 0); + VRTC_UNLOCK(vrtc); + + return (vrtc); +} + +void +vrtc_cleanup(struct vrtc *vrtc) +{ + + callout_drain(&vrtc->callout); + free(vrtc, M_VRTC); +} + +#ifndef __FreeBSD__ +void +vrtc_localize_resources(struct vrtc *vrtc) +{ + vmm_glue_callout_localize(&vrtc->callout); +} +#endif /* __FreeBSD */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vrtc.h b/usr/src/uts/i86pc/io/vmm/io/vrtc.h new file mode 100644 index 0000000000..13abbedeb9 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vrtc.h @@ -0,0 +1,60 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _VRTC_H_ +#define _VRTC_H_ + +#include <isa/isareg.h> + +struct vrtc; + +struct vrtc *vrtc_init(struct vm *vm); +void vrtc_cleanup(struct vrtc *vrtc); +void vrtc_reset(struct vrtc *vrtc); + +time_t vrtc_get_time(struct vm *vm); +int vrtc_set_time(struct vm *vm, time_t secs); +int vrtc_nvram_write(struct vm *vm, int offset, uint8_t value); +int vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval); + +int vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val); +int vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val); + +#ifndef __FreeBSD__ +void vrtc_localize_resources(struct vrtc *); +#endif + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/vm/pmap.h b/usr/src/uts/i86pc/io/vmm/vm/pmap.h new file mode 100644 index 0000000000..512fc4acee --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/pmap.h @@ -0,0 +1,27 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _PMAP_VM_ +#define _PMAP_VM_ + +#include <machine/pmap.h> +#include "vm_glue.h" + +void pmap_invalidate_cache(void); +void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num); +int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype); +long pmap_wired_count(pmap_t pmap); + +#endif /* _PMAP_VM_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_extern.h b/usr/src/uts/i86pc/io/vmm/vm/vm_extern.h new file mode 100644 index 0000000000..92a959960a --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_extern.h @@ -0,0 +1,35 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. 
+ */ + +#ifndef _VM_EXTERN_H_ +#define _VM_EXTERN_H_ + +#include <sys/types.h> +#include <vm/vm.h> + +struct vmspace; +struct pmap; + +typedef int (*pmap_pinit_t)(struct pmap *pmap); + +struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t, pmap_pinit_t); +void vmspace_free(struct vmspace *); + +int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int); +int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len, + vm_prot_t prot, vm_page_t *ma, int max_count); + + +#endif /* _VM_EXTERN_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_glue.h b/usr/src/uts/i86pc/io/vmm/vm/vm_glue.h new file mode 100644 index 0000000000..600872c321 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_glue.h @@ -0,0 +1,99 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _VM_GLUE_ +#define _VM_GLUE_ + +#include <vm/pmap.h> +#include <vm/vm.h> +#include <sys/cpuvar.h> + +struct vmspace; +struct vm_map; +struct pmap; +struct vm_object; +struct vmm_pt_ops; + +struct vm_map { + struct vmspace *vmm_space; +}; + +struct pmap { + void *pm_pml4; + cpuset_t pm_active; + long pm_eptgen; + + /* Implementation private */ + enum pmap_type pm_type; + struct vmm_pt_ops *pm_ops; + void *pm_impl; +}; + +struct vmspace { + struct vm_map vm_map; + + /* Implementation private */ + kmutex_t vms_lock; + boolean_t vms_map_changing; + struct pmap vms_pmap; + uintptr_t vms_size; /* fixed after creation */ + + list_t vms_maplist; +}; + +typedef pfn_t (*vm_pager_fn_t)(vm_object_t, uintptr_t, pfn_t *, uint_t *); + +struct vm_object { + uint_t vmo_refcnt; /* manipulated with atomic ops */ + + /* This group of fields are fixed at creation time */ + objtype_t vmo_type; + size_t vmo_size; + vm_pager_fn_t vmo_pager; + void *vmo_data; + + kmutex_t vmo_lock; /* protects fields below */ + vm_memattr_t vmo_attr; +}; + +struct vm_page { + kmutex_t vmp_lock; + pfn_t vmp_pfn; + struct vm_object *vmp_obj_held; +}; + +/* Illumos-specific functions for setup and operation */ +int vm_segmap_obj(struct vmspace *, vm_object_t, struct as *, caddr_t *, + uint_t, uint_t, uint_t); +int vm_segmap_space(struct vmspace *, off_t, struct as *, caddr_t *, off_t, + uint_t, uint_t, uint_t); +void *vmspace_find_kva(struct vmspace *, uintptr_t, size_t); +void vmm_arena_init(void); +void vmm_arena_fini(void); + +struct vmm_pt_ops { + void * (*vpo_init)(uint64_t *); + void (*vpo_free)(void *); + uint64_t (*vpo_wired_cnt)(void *); + int (*vpo_is_wired)(void *, uint64_t, uint_t *); + int (*vpo_map)(void *, uint64_t, pfn_t, uint_t, uint_t, uint8_t); + uint64_t (*vpo_unmap)(void *, uint64_t, uint64_t); +}; + +extern struct vmm_pt_ops ept_ops; +extern struct vmm_pt_ops rvi_ops; + + +#endif /* _VM_GLUE_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_map.h b/usr/src/uts/i86pc/io/vmm/vm/vm_map.h new file mode 100644 index 0000000000..20b74d4d36 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_map.h @@ -0,0 +1,63 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _VM_MAP_ +#define _VM_MAP_ + +#include "vm_glue.h" + +/* + * vm_map_wire and vm_map_unwire option flags + */ +#define VM_MAP_WIRE_SYSTEM 0 /* wiring in a kernel map */ +#define VM_MAP_WIRE_USER 1 /* wiring in a user map */ + +#define VM_MAP_WIRE_NOHOLES 0 /* region must not have holes */ +#define VM_MAP_WIRE_HOLESOK 2 /* region may have holes */ + +#define VM_MAP_WIRE_WRITE 4 /* Validate writable. */ + +/* + * The following "find_space" options are supported by vm_map_find(). + * + * For VMFS_ALIGNED_SPACE, the desired alignment is specified to + * the macro argument as log base 2 of the desired alignment. + */ +#define VMFS_NO_SPACE 0 /* don't find; use the given range */ +#define VMFS_ANY_SPACE 1 /* find range with any alignment */ +#define VMFS_OPTIMAL_SPACE 2 /* find range with optimal alignment */ +#define VMFS_SUPER_SPACE 3 /* find superpage-aligned range */ +#define VMFS_ALIGNED_SPACE(x) ((x) << 8) /* find range with fixed alignment */ + +/* + * vm_fault option flags + */ +#define VM_FAULT_NORMAL 0 /* Nothing special */ +#define VM_FAULT_WIRE 1 /* Wire the mapped page */ +#define VM_FAULT_DIRTY 2 /* Dirty the page; use w/VM_PROT_COPY */ + + + +pmap_t vmspace_pmap(struct vmspace *); + +int vm_map_find(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, + vm_offset_t, int, vm_prot_t, vm_prot_t, int); +int vm_map_remove(vm_map_t, vm_offset_t, vm_offset_t); +int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags); + +long vmspace_resident_count(struct vmspace *vmspace); + + +#endif /* _VM_MAP_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_object.h b/usr/src/uts/i86pc/io/vmm/vm/vm_object.h new file mode 100644 index 0000000000..1f16fa9b83 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_object.h @@ -0,0 +1,31 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _VM_OBJECT_ +#define _VM_OBJECT_ + +#include "vm_glue.h" + +vm_object_t vm_object_allocate(objtype_t, vm_pindex_t); +void vm_object_deallocate(vm_object_t); +void vm_object_reference(vm_object_t); +int vm_object_set_memattr(vm_object_t, vm_memattr_t); +void vm_object_clear(vm_object_t); + + +#define VM_OBJECT_WLOCK(vmo) mutex_enter(&(vmo)->vmo_lock) +#define VM_OBJECT_WUNLOCK(vmo) mutex_exit(&(vmo)->vmo_lock) + +#endif /* _VM_OBJECT_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_page.h b/usr/src/uts/i86pc/io/vmm/vm/vm_page.h new file mode 100644 index 0000000000..4559fe6d4c --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_page.h @@ -0,0 +1,28 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + + +#ifndef _VM_PAGE_ +#define _VM_PAGE_ + +#include "vm_glue.h" + +void vm_page_lock(vm_page_t); +void vm_page_unhold(vm_page_t); +void vm_page_unlock(vm_page_t); + +#define VM_PAGE_TO_PHYS(page) (mmu_ptob((uintptr_t)((page)->vmp_pfn))) + +#endif /* _VM_PAGE_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_pager.h b/usr/src/uts/i86pc/io/vmm/vm/vm_pager.h new file mode 100644 index 0000000000..11aa344f61 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_pager.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _VM_PAGER_ +#define _VM_PAGER_ + +vm_object_t vm_pager_allocate(objtype_t, void *, vm_ooffset_t, vm_prot_t, + vm_ooffset_t, void *); + + +#endif /* _VM_PAGER_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c new file mode 100644 index 0000000000..47a5f26cb7 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm.c @@ -0,0 +1,3214 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/pcpu.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/sched.h> +#include <sys/smp.h> +#include <sys/systm.h> + +#include <vm/vm.h> +#include <vm/vm_object.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/pmap.h> +#include <vm/vm_extern.h> +#include <vm/vm_param.h> + +#ifdef __FreeBSD__ +#include <machine/cpu.h> +#endif +#include <machine/pcb.h> +#include <machine/smp.h> +#include <machine/md_var.h> +#include <x86/psl.h> +#include <x86/apicreg.h> + +#include <machine/vmm.h> +#include <machine/vmm_dev.h> +#include <machine/vmm_instruction_emul.h> + +#include "vmm_ioport.h" +#include "vmm_ktr.h" +#include "vmm_host.h" +#include "vmm_mem.h" +#include "vmm_util.h" +#include "vatpic.h" +#include "vatpit.h" +#include "vhpet.h" +#include "vioapic.h" +#include "vlapic.h" +#include "vpmtmr.h" +#include "vrtc.h" +#include "vmm_stat.h" +#include "vmm_lapic.h" + +#include "io/ppt.h" +#include "io/iommu.h" + +struct vlapic; + +/* + * Initialization: + * (a) allocated when vcpu is created + * (i) initialized when vcpu is created and when it is reinitialized + * (o) initialized the first time the vcpu is created + * (x) initialized before use + */ +struct vcpu { + struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */ + enum vcpu_state state; /* (o) vcpu state */ +#ifndef __FreeBSD__ + kcondvar_t vcpu_cv; /* (o) cpu waiter cv */ + kcondvar_t state_cv; /* (o) IDLE-transition cv */ +#endif /* __FreeBSD__ */ + int hostcpu; /* (o) vcpu's current host cpu */ +#ifndef __FreeBSD__ + int lastloccpu; /* (o) last host cpu localized to */ +#endif + u_int runblock; /* (i) block vcpu from run state */ + int reqidle; /* (i) request vcpu to idle */ + struct vlapic *vlapic; /* (i) APIC device model */ + enum x2apic_state x2apic_state; /* (i) APIC mode */ + uint64_t exitintinfo; /* (i) events pending at VM exit */ + int nmi_pending; /* (i) NMI pending */ + int extint_pending; /* (i) INTR pending */ + int exception_pending; /* (i) exception pending */ + int exc_vector; /* (x) exception collateral */ + int exc_errcode_valid; + uint32_t exc_errcode; + struct savefpu *guestfpu; /* (a,i) guest fpu state */ + uint64_t guest_xcr0; /* (i) guest %xcr0 register */ + void *stats; /* (a,i) statistics */ + struct vm_exit exitinfo; /* (x) exit reason and collateral */ + uint64_t nextrip; /* (x) next instruction to execute */ +#ifndef __FreeBSD__ + uint64_t tsc_offset; /* (x) offset from host TSC */ +#endif +}; + +#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) +#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) +#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) +#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) +#define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) + +struct mem_seg { + size_t len; + bool sysmem; + struct vm_object *object; +}; +#ifdef __FreeBSD__ +#define VM_MAX_MEMSEGS 3 +#else +#define VM_MAX_MEMSEGS 4 +#endif + +struct mem_map { + vm_paddr_t gpa; + size_t len; + vm_ooffset_t segoff; + int segid; + int prot; + int flags; +}; +#define VM_MAX_MEMMAPS 4 + +/* + * Initialization: + * (o) initialized the first time the VM is created + * (i) initialized when VM is created and when it is reinitialized + * (x) initialized before use + */ +struct vm { + void *cookie; /* (i) 
cpu-specific data */ + void *iommu; /* (x) iommu-specific data */ + struct vhpet *vhpet; /* (i) virtual HPET */ + struct vioapic *vioapic; /* (i) virtual ioapic */ + struct vatpic *vatpic; /* (i) virtual atpic */ + struct vatpit *vatpit; /* (i) virtual atpit */ + struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ + struct vrtc *vrtc; /* (o) virtual RTC */ + volatile cpuset_t active_cpus; /* (i) active vcpus */ + volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug */ + int suspend; /* (i) stop VM execution */ + volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ + volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ + struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */ + struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */ + struct vmspace *vmspace; /* (o) guest's address space */ + char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ + struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */ + /* The following describe the vm cpu topology */ + uint16_t sockets; /* (o) num of sockets */ + uint16_t cores; /* (o) num of cores/socket */ + uint16_t threads; /* (o) num of threads/core */ + uint16_t maxcpus; /* (o) max pluggable cpus */ +#ifndef __FreeBSD__ + list_t ioport_hooks; +#endif /* __FreeBSD__ */ +}; + +static int vmm_initialized; + +static struct vmm_ops *ops; +#define VMM_INIT(num) (ops != NULL ? (*ops->init)(num) : 0) +#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) +#define VMM_RESUME() (ops != NULL ? (*ops->resume)() : 0) + +#define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL) +#define VMRUN(vmi, vcpu, rip, pmap, evinfo) \ + (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo) : ENXIO) +#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) +#define VMSPACE_ALLOC(min, max) \ + (ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL) +#define VMSPACE_FREE(vmspace) \ + (ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO) +#define VMGETREG(vmi, vcpu, num, retval) \ + (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) +#define VMSETREG(vmi, vcpu, num, val) \ + (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO) +#define VMGETDESC(vmi, vcpu, num, desc) \ + (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO) +#define VMSETDESC(vmi, vcpu, num, desc) \ + (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO) +#define VMGETCAP(vmi, vcpu, num, retval) \ + (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) +#define VMSETCAP(vmi, vcpu, num, val) \ + (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) +#define VLAPIC_INIT(vmi, vcpu) \ + (ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL) +#define VLAPIC_CLEANUP(vmi, vlapic) \ + (ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL) + +#define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) +#define fpu_stop_emulating() clts() + +SDT_PROVIDER_DEFINE(vmm); + +static MALLOC_DEFINE(M_VM, "vm", "vm"); + +/* statistics */ +static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); + +SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); + +/* + * Halt the guest if all vcpus are executing a HLT instruction with + * interrupts disabled. 
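An aside on the dispatch macros above: every backend operation funnels through the global 'ops' pointer, which vmm_init() later points at either the VT-x or the SVM implementation, and each macro degrades to a harmless default when no backend is present. The stand-alone sketch below shows only that dispatch pattern; the struct and function names in it are illustrative and do not reflect the real struct vmm_ops layout.

#include <stdio.h>
#include <stddef.h>

/*
 * Illustrative stand-in for the backend ops table; member names are
 * hypothetical and do not mirror struct vmm_ops.
 */
struct demo_ops {
	int (*init)(int ipinum);
	int (*cleanup)(void);
};

static int demo_intel_init(int ipinum) { printf("vmx init, ipi %d\n", ipinum); return (0); }
static int demo_intel_cleanup(void) { return (0); }

static struct demo_ops demo_ops_intel = { demo_intel_init, demo_intel_cleanup };
static struct demo_ops *ops;

/* Same NULL-guarded dispatch shape as the VMM_INIT()/VMM_CLEANUP() macros. */
#define	DEMO_INIT(num)	(ops != NULL ? (*ops->init)(num) : 0)
#define	DEMO_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)

int
main(void)
{
	ops = &demo_ops_intel;	/* chosen the way vmm_init() picks Intel vs. AMD */
	(void) DEMO_INIT(0);
	return (DEMO_CLEANUP());
}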
+ */ +static int halt_detection_enabled = 1; +SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN, + &halt_detection_enabled, 0, + "Halt VM if all vcpus execute HLT with interrupts disabled"); + +static int vmm_ipinum; +SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, + "IPI vector used for vcpu notifications"); + +static int trace_guest_exceptions; +SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, + &trace_guest_exceptions, 0, + "Trap into hypervisor on all guest exceptions and reflect them back"); + +static void vm_free_memmap(struct vm *vm, int ident); +static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); +static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr); + +#ifndef __FreeBSD__ +static void vm_clear_memseg(struct vm *, int); + +typedef struct vm_ioport_hook { + list_node_t vmih_node; + uint_t vmih_ioport; + void *vmih_arg; + vmm_rmem_cb_t vmih_rmem_cb; + vmm_wmem_cb_t vmih_wmem_cb; +} vm_ioport_hook_t; + +/* Flags for vtc_status */ +#define VTCS_FPU_RESTORED 1 /* guest FPU restored, host FPU saved */ +#define VTCS_FPU_CTX_CRITICAL 2 /* in ctx where FPU restore cannot be lazy */ + +typedef struct vm_thread_ctx { + struct vm *vtc_vm; + int vtc_vcpuid; + uint_t vtc_status; +} vm_thread_ctx_t; +#endif /* __FreeBSD__ */ + +#ifdef KTR +static const char * +vcpu_state2str(enum vcpu_state state) +{ + + switch (state) { + case VCPU_IDLE: + return ("idle"); + case VCPU_FROZEN: + return ("frozen"); + case VCPU_RUNNING: + return ("running"); + case VCPU_SLEEPING: + return ("sleeping"); + default: + return ("unknown"); + } +} +#endif + +static void +vcpu_cleanup(struct vm *vm, int i, bool destroy) +{ + struct vcpu *vcpu = &vm->vcpu[i]; + + VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic); + if (destroy) { + vmm_stat_free(vcpu->stats); + fpu_save_area_free(vcpu->guestfpu); + } +} + +static void +vcpu_init(struct vm *vm, int vcpu_id, bool create) +{ + struct vcpu *vcpu; + + KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, + ("vcpu_init: invalid vcpu %d", vcpu_id)); + + vcpu = &vm->vcpu[vcpu_id]; + + if (create) { +#ifdef __FreeBSD__ + KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already " + "initialized", vcpu_id)); +#endif + vcpu_lock_init(vcpu); + vcpu->state = VCPU_IDLE; + vcpu->hostcpu = NOCPU; +#ifndef __FreeBSD__ + vcpu->lastloccpu = NOCPU; +#endif + vcpu->guestfpu = fpu_save_area_alloc(); + vcpu->stats = vmm_stat_alloc(); + } + + vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); + vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); + vcpu->runblock = 0; + vcpu->reqidle = 0; + vcpu->exitintinfo = 0; + vcpu->nmi_pending = 0; + vcpu->extint_pending = 0; + vcpu->exception_pending = 0; + vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; + fpu_save_area_reset(vcpu->guestfpu); + vmm_stat_init(vcpu->stats); +} + +int +vcpu_trace_exceptions(struct vm *vm, int vcpuid) +{ + + return (trace_guest_exceptions); +} + +struct vm_exit * +vm_exitinfo(struct vm *vm, int cpuid) +{ + struct vcpu *vcpu; + + if (cpuid < 0 || cpuid >= vm->maxcpus) + panic("vm_exitinfo: invalid cpuid %d", cpuid); + + vcpu = &vm->vcpu[cpuid]; + + return (&vcpu->exitinfo); +} + +#ifdef __FreeBSD__ +static void +vmm_resume(void) +{ + VMM_RESUME(); +} +#endif + +static int +vmm_init(void) +{ + int error; + + vmm_host_state_init(); + +#ifdef __FreeBSD__ + vmm_ipinum = lapic_ipi_alloc(pti ? 
&IDTVEC(justreturn1_pti) : + &IDTVEC(justreturn)); + if (vmm_ipinum < 0) + vmm_ipinum = IPI_AST; +#else + /* We use cpu_poke() for IPIs */ + vmm_ipinum = 0; +#endif + + error = vmm_mem_init(); + if (error) + return (error); + + if (vmm_is_intel()) + ops = &vmm_ops_intel; + else if (vmm_is_amd()) + ops = &vmm_ops_amd; + else + return (ENXIO); + +#ifdef __FreeBSD__ + vmm_resume_p = vmm_resume; +#endif + + return (VMM_INIT(vmm_ipinum)); +} + +#ifdef __FreeBSD__ + +static int +vmm_handler(module_t mod, int what, void *arg) +{ + int error; + + switch (what) { + case MOD_LOAD: + vmmdev_init(); + error = vmm_init(); + if (error == 0) + vmm_initialized = 1; + break; + case MOD_UNLOAD: + error = vmmdev_cleanup(); + if (error == 0) { + vmm_resume_p = NULL; + iommu_cleanup(); +#ifdef __FreeBSD__ + if (vmm_ipinum != IPI_AST) + lapic_ipi_free(vmm_ipinum); +#endif + error = VMM_CLEANUP(); + /* + * Something bad happened - prevent new + * VMs from being created + */ + if (error) + vmm_initialized = 0; + } + break; + default: + error = 0; + break; + } + return (error); +} + +static moduledata_t vmm_kmod = { + "vmm", + vmm_handler, + NULL +}; + +/* + * vmm initialization has the following dependencies: + * + * - VT-x initialization requires smp_rendezvous() and therefore must happen + * after SMP is fully functional (after SI_SUB_SMP). + */ +DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); +MODULE_VERSION(vmm, 1); + +#else /* __FreeBSD__ */ + +int +vmm_mod_load() +{ + int error; + + error = vmm_init(); + if (error == 0) + vmm_initialized = 1; + + return (error); +} + +int +vmm_mod_unload() +{ + int error; + + iommu_cleanup(); + error = VMM_CLEANUP(); + if (error) + return (error); + vmm_initialized = 0; + + return (0); +} + +#endif /* __FreeBSD__ */ + +static void +vm_init(struct vm *vm, bool create) +{ + int i; +#ifndef __FreeBSD__ + uint64_t tsc_off; +#endif + + vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace)); + vm->iommu = NULL; + vm->vioapic = vioapic_init(vm); + vm->vhpet = vhpet_init(vm); + vm->vatpic = vatpic_init(vm); + vm->vatpit = vatpit_init(vm); + vm->vpmtmr = vpmtmr_init(vm); + if (create) + vm->vrtc = vrtc_init(vm); +#ifndef __FreeBSD__ + if (create) { + list_create(&vm->ioport_hooks, sizeof (vm_ioport_hook_t), + offsetof (vm_ioport_hook_t, vmih_node)); + } else { + VERIFY(list_is_empty(&vm->ioport_hooks)); + } +#endif /* __FreeBSD__ */ + + CPU_ZERO(&vm->active_cpus); + CPU_ZERO(&vm->debug_cpus); + + vm->suspend = 0; + CPU_ZERO(&vm->suspended_cpus); + + for (i = 0; i < vm->maxcpus; i++) + vcpu_init(vm, i, create); + +#ifndef __FreeBSD__ + tsc_off = (uint64_t)(-(int64_t)rdtsc()); + for (i = 0; i < vm->maxcpus; i++) { + vm->vcpu[i].tsc_offset = tsc_off; + } +#endif /* __FreeBSD__ */ +} + +/* + * The default CPU topology is a single thread per package. + */ +u_int cores_per_package = 1; +u_int threads_per_core = 1; + +int +vm_create(const char *name, struct vm **retvm) +{ + struct vm *vm; + struct vmspace *vmspace; + + /* + * If vmm.ko could not be successfully initialized then don't attempt + * to create the virtual machine. 
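A note on the tsc_offset seeding in vm_init() above: the offset is the two's-complement negation of the host TSC at creation time, so a guest read (host TSC plus offset, modulo 2^64) starts counting from roughly zero. A small user-space sketch of that arithmetic, with a made-up host TSC value:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t host_tsc_at_create = 123456789ULL;	/* illustrative rdtsc() value */
	uint64_t tsc_off = (uint64_t)(-(int64_t)host_tsc_at_create);

	uint64_t host_tsc_later = host_tsc_at_create + 5000;
	uint64_t guest_tsc = host_tsc_later + tsc_off;	/* wraps mod 2^64 */

	printf("guest TSC reads %llu\n", (unsigned long long)guest_tsc);	/* 5000 */
	return (0);
}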
+ */ + if (!vmm_initialized) + return (ENXIO); + + if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) + return (EINVAL); + + vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS); + if (vmspace == NULL) + return (ENOMEM); + + vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); + strcpy(vm->name, name); + vm->vmspace = vmspace; + + vm->sockets = 1; + vm->cores = cores_per_package; /* XXX backwards compatibility */ + vm->threads = threads_per_core; /* XXX backwards compatibility */ + vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ + + vm_init(vm, true); + + *retvm = vm; + return (0); +} + +void +vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, + uint16_t *threads, uint16_t *maxcpus) +{ + *sockets = vm->sockets; + *cores = vm->cores; + *threads = vm->threads; + *maxcpus = vm->maxcpus; +} + +uint16_t +vm_get_maxcpus(struct vm *vm) +{ + return (vm->maxcpus); +} + +int +vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, + uint16_t threads, uint16_t maxcpus) +{ + if (maxcpus != 0) + return (EINVAL); /* XXX remove when supported */ + if ((sockets * cores * threads) > vm->maxcpus) + return (EINVAL); + /* XXX need to check sockets * cores * threads == vCPU, how? */ + vm->sockets = sockets; + vm->cores = cores; + vm->threads = threads; + vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ + return(0); +} + +static void +vm_cleanup(struct vm *vm, bool destroy) +{ + struct mem_map *mm; + int i; + + ppt_unassign_all(vm); + + if (vm->iommu != NULL) + iommu_destroy_domain(vm->iommu); + + if (destroy) + vrtc_cleanup(vm->vrtc); + else + vrtc_reset(vm->vrtc); + vpmtmr_cleanup(vm->vpmtmr); + vatpit_cleanup(vm->vatpit); + vhpet_cleanup(vm->vhpet); + vatpic_cleanup(vm->vatpic); + vioapic_cleanup(vm->vioapic); + + for (i = 0; i < vm->maxcpus; i++) + vcpu_cleanup(vm, i, destroy); + + VMCLEANUP(vm->cookie); + + /* + * System memory is removed from the guest address space only when + * the VM is destroyed. This is because the mapping remains the same + * across VM reset. + * + * Device memory can be relocated by the guest (e.g. using PCI BARs) + * so those mappings are removed on a VM reset. + */ + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (destroy || !sysmem_mapping(vm, mm)) + vm_free_memmap(vm, i); +#ifndef __FreeBSD__ + else { + /* + * We need to reset the IOMMU flag so this mapping can + * be reused when a VM is rebooted. Since the IOMMU + * domain has already been destroyed we can just reset + * the flag here. + */ + mm->flags &= ~VM_MEMMAP_F_IOMMU; + } +#endif + } + + if (destroy) { + for (i = 0; i < VM_MAX_MEMSEGS; i++) + vm_free_memseg(vm, i); + + VMSPACE_FREE(vm->vmspace); + vm->vmspace = NULL; + } +#ifndef __FreeBSD__ + else { + /* + * Clear the first memory segment (low mem), old memory contents + * could confuse the UEFI firmware. + */ + vm_clear_memseg(vm, 0); + } +#endif +} + +void +vm_destroy(struct vm *vm) +{ + vm_cleanup(vm, true); + free(vm, M_VM); +} + +int +vm_reinit(struct vm *vm) +{ + int error; + + /* + * A virtual machine can be reset only if all vcpus are suspended. 
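For reference, the only constraint vm_set_topology() enforces today is that the requested sockets * cores * threads product fits within the fixed vCPU limit. A stand-alone model of that check, using an illustrative limit of 32 in place of VM_MAXCPU:

#include <stdio.h>
#include <stdint.h>
#include <errno.h>

static int
check_topology(uint16_t sockets, uint16_t cores, uint16_t threads,
    uint16_t maxcpus)
{
	if ((uint32_t)sockets * cores * threads > maxcpus)
		return (EINVAL);
	return (0);
}

int
main(void)
{
	printf("2s/4c/2t on 32 vcpus -> %d\n", check_topology(2, 4, 2, 32));	/* 0 */
	printf("4s/8c/2t on 32 vcpus -> %d\n", check_topology(4, 8, 2, 32));	/* EINVAL */
	return (0);
}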
+ */ + if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { + vm_cleanup(vm, false); + vm_init(vm, false); + error = 0; + } else { + error = EBUSY; + } + + return (error); +} + +const char * +vm_name(struct vm *vm) +{ + return (vm->name); +} + +int +vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) +{ + vm_object_t obj; + + if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) + return (ENOMEM); + else + return (0); +} + +int +vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) +{ + + vmm_mmio_free(vm->vmspace, gpa, len); + return (0); +} + +/* + * Return 'true' if 'gpa' is allocated in the guest address space. + * + * This function is called in the context of a running vcpu which acts as + * an implicit lock on 'vm->mem_maps[]'. + */ +bool +vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa) +{ + struct mem_map *mm; + int i; + +#ifdef INVARIANTS + int hostcpu, state; + state = vcpu_get_state(vm, vcpuid, &hostcpu); + KASSERT(state == VCPU_RUNNING && hostcpu == curcpu, + ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu)); +#endif + + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len) + return (true); /* 'gpa' is sysmem or devmem */ + } + + if (ppt_is_mmio(vm, gpa)) + return (true); /* 'gpa' is pci passthru mmio */ + + return (false); +} + +int +vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) +{ + struct mem_seg *seg; + vm_object_t obj; + +#ifndef __FreeBSD__ + extern pgcnt_t get_max_page_get(void); +#endif + + if (ident < 0 || ident >= VM_MAX_MEMSEGS) + return (EINVAL); + + if (len == 0 || (len & PAGE_MASK)) + return (EINVAL); + +#ifndef __FreeBSD__ + if (len > ptob(get_max_page_get())) + return (EINVAL); +#endif + + seg = &vm->mem_segs[ident]; + if (seg->object != NULL) { + if (seg->len == len && seg->sysmem == sysmem) + return (EEXIST); + else + return (EINVAL); + } + + obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT); + if (obj == NULL) + return (ENOMEM); + + seg->len = len; + seg->object = obj; + seg->sysmem = sysmem; + return (0); +} + +int +vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, + vm_object_t *objptr) +{ + struct mem_seg *seg; + + if (ident < 0 || ident >= VM_MAX_MEMSEGS) + return (EINVAL); + + seg = &vm->mem_segs[ident]; + if (len) + *len = seg->len; + if (sysmem) + *sysmem = seg->sysmem; + if (objptr) + *objptr = seg->object; + return (0); +} + +#ifndef __FreeBSD__ +static void +vm_clear_memseg(struct vm *vm, int ident) +{ + struct mem_seg *seg; + + KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, + ("%s: invalid memseg ident %d", __func__, ident)); + + seg = &vm->mem_segs[ident]; + + if (seg->object != NULL) + vm_object_clear(seg->object); +} +#endif + +void +vm_free_memseg(struct vm *vm, int ident) +{ + struct mem_seg *seg; + + KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, + ("%s: invalid memseg ident %d", __func__, ident)); + + seg = &vm->mem_segs[ident]; + if (seg->object != NULL) { + vm_object_deallocate(seg->object); + bzero(seg, sizeof(struct mem_seg)); + } +} + +int +vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, + size_t len, int prot, int flags) +{ + struct mem_seg *seg; + struct mem_map *m, *map; + vm_ooffset_t last; + int i, error; + + if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0) + return (EINVAL); + + if (flags & ~VM_MEMMAP_F_WIRED) + return (EINVAL); + + if (segid < 0 || segid >= VM_MAX_MEMSEGS) + return (EINVAL); + + seg = &vm->mem_segs[segid]; + if 
(seg->object == NULL) + return (EINVAL); + + last = first + len; + if (first < 0 || first >= last || last > seg->len) + return (EINVAL); + + if ((gpa | first | last) & PAGE_MASK) + return (EINVAL); + + map = NULL; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + m = &vm->mem_maps[i]; + if (m->len == 0) { + map = m; + break; + } + } + + if (map == NULL) + return (ENOSPC); + + error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa, + len, 0, VMFS_NO_SPACE, prot, prot, 0); + if (error != KERN_SUCCESS) + return (EFAULT); + + vm_object_reference(seg->object); + + if ((flags & VM_MEMMAP_F_WIRED) != 0) { + error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len, + VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); + if (error != KERN_SUCCESS) { + vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len); + return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM : + EFAULT); + } + } + + map->gpa = gpa; + map->len = len; + map->segoff = first; + map->segid = segid; + map->prot = prot; + map->flags = flags; + return (0); +} + +int +vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) +{ + struct mem_map *mm, *mmnext; + int i; + + mmnext = NULL; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (mm->len == 0 || mm->gpa < *gpa) + continue; + if (mmnext == NULL || mm->gpa < mmnext->gpa) + mmnext = mm; + } + + if (mmnext != NULL) { + *gpa = mmnext->gpa; + if (segid) + *segid = mmnext->segid; + if (segoff) + *segoff = mmnext->segoff; + if (len) + *len = mmnext->len; + if (prot) + *prot = mmnext->prot; + if (flags) + *flags = mmnext->flags; + return (0); + } else { + return (ENOENT); + } +} + +static void +vm_free_memmap(struct vm *vm, int ident) +{ + struct mem_map *mm; + int error; + + mm = &vm->mem_maps[ident]; + if (mm->len) { + error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa, + mm->gpa + mm->len); + KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d", + __func__, error)); + bzero(mm, sizeof(struct mem_map)); + } +} + +static __inline bool +sysmem_mapping(struct vm *vm, struct mem_map *mm) +{ + + if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem) + return (true); + else + return (false); +} + +vm_paddr_t +vmm_sysmem_maxaddr(struct vm *vm) +{ + struct mem_map *mm; + vm_paddr_t maxaddr; + int i; + + maxaddr = 0; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (sysmem_mapping(vm, mm)) { + if (maxaddr < mm->gpa + mm->len) + maxaddr = mm->gpa + mm->len; + } + } + return (maxaddr); +} + +static void +vm_iommu_modify(struct vm *vm, boolean_t map) +{ + int i, sz; + vm_paddr_t gpa, hpa; + struct mem_map *mm; +#ifdef __FreeBSD__ + void *vp, *cookie, *host_domain; +#else + void *vp, *cookie, *host_domain __unused; +#endif + + sz = PAGE_SIZE; + host_domain = iommu_host_domain(); + + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (!sysmem_mapping(vm, mm)) + continue; + + if (map) { + KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0, + ("iommu map found invalid memmap %#lx/%#lx/%#x", + mm->gpa, mm->len, mm->flags)); + if ((mm->flags & VM_MEMMAP_F_WIRED) == 0) + continue; + mm->flags |= VM_MEMMAP_F_IOMMU; + } else { + if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0) + continue; + mm->flags &= ~VM_MEMMAP_F_IOMMU; + KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0, + ("iommu unmap found invalid memmap %#lx/%#lx/%#x", + mm->gpa, mm->len, mm->flags)); + } + + gpa = mm->gpa; + while (gpa < mm->gpa + mm->len) { + vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, VM_PROT_WRITE, + &cookie); + KASSERT(vp != NULL, 
("vm(%s) could not map gpa %#lx", + vm_name(vm), gpa)); + + vm_gpa_release(cookie); + + hpa = DMAP_TO_PHYS((uintptr_t)vp); + if (map) { + iommu_create_mapping(vm->iommu, gpa, hpa, sz); +#ifdef __FreeBSD__ + iommu_remove_mapping(host_domain, hpa, sz); +#endif + } else { + iommu_remove_mapping(vm->iommu, gpa, sz); +#ifdef __FreeBSD__ + iommu_create_mapping(host_domain, hpa, hpa, sz); +#endif + } + + gpa += PAGE_SIZE; + } + } + + /* + * Invalidate the cached translations associated with the domain + * from which pages were removed. + */ +#ifdef __FreeBSD__ + if (map) + iommu_invalidate_tlb(host_domain); + else + iommu_invalidate_tlb(vm->iommu); +#else + iommu_invalidate_tlb(vm->iommu); +#endif +} + +#define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE) +#define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE) + +#ifdef __FreeBSD__ +int +vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) +#else +int +vm_unassign_pptdev(struct vm *vm, int pptfd) +#endif /* __FreeBSD__ */ +{ + int error; + +#ifdef __FreeBSD__ + error = ppt_unassign_device(vm, bus, slot, func); +#else + error = ppt_unassign_device(vm, pptfd); +#endif /* __FreeBSD__ */ + if (error) + return (error); + + if (ppt_assigned_devices(vm) == 0) + vm_iommu_unmap(vm); + + return (0); +} + +#ifdef __FreeBSD__ +int +vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) +#else +int +vm_assign_pptdev(struct vm *vm, int pptfd) +#endif /* __FreeBSD__ */ +{ + int error; + vm_paddr_t maxaddr; + + /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */ + if (ppt_assigned_devices(vm) == 0) { + KASSERT(vm->iommu == NULL, + ("vm_assign_pptdev: iommu must be NULL")); + maxaddr = vmm_sysmem_maxaddr(vm); + vm->iommu = iommu_create_domain(maxaddr); + if (vm->iommu == NULL) + return (ENXIO); + vm_iommu_map(vm); + } + +#ifdef __FreeBSD__ + error = ppt_assign_device(vm, bus, slot, func); +#else + error = ppt_assign_device(vm, pptfd); +#endif /* __FreeBSD__ */ + return (error); +} + +void * +vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot, + void **cookie) +{ + int i, count, pageoff; + struct mem_map *mm; + vm_page_t m; +#ifdef INVARIANTS + /* + * All vcpus are frozen by ioctls that modify the memory map + * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is + * guaranteed if at least one vcpu is in the VCPU_FROZEN state. 
+ */ + int state; + KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d", + __func__, vcpuid)); + for (i = 0; i < vm->maxcpus; i++) { + if (vcpuid != -1 && vcpuid != i) + continue; + state = vcpu_get_state(vm, i, NULL); + KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d", + __func__, state)); + } +#endif + pageoff = gpa & PAGE_MASK; + if (len > PAGE_SIZE - pageoff) + panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); + + count = 0; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (sysmem_mapping(vm, mm) && gpa >= mm->gpa && + gpa < mm->gpa + mm->len) { + count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, + trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); + break; + } + } + + if (count == 1) { + *cookie = m; + return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff)); + } else { + *cookie = NULL; + return (NULL); + } +} + +void +vm_gpa_release(void *cookie) +{ + vm_page_t m = cookie; + + vm_page_lock(m); + vm_page_unhold(m); + vm_page_unlock(m); +} + +int +vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) +{ + + if (vcpu < 0 || vcpu >= vm->maxcpus) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + return (VMGETREG(vm->cookie, vcpu, reg, retval)); +} + +int +vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) +{ + struct vcpu *vcpu; + int error; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + error = VMSETREG(vm->cookie, vcpuid, reg, val); + if (error || reg != VM_REG_GUEST_RIP) + return (error); + + /* Set 'nextrip' to match the value of %rip */ + VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val); + vcpu = &vm->vcpu[vcpuid]; + vcpu->nextrip = val; + return (0); +} + +static boolean_t +is_descriptor_table(int reg) +{ + + switch (reg) { + case VM_REG_GUEST_IDTR: + case VM_REG_GUEST_GDTR: + return (TRUE); + default: + return (FALSE); + } +} + +static boolean_t +is_segment_register(int reg) +{ + + switch (reg) { + case VM_REG_GUEST_ES: + case VM_REG_GUEST_CS: + case VM_REG_GUEST_SS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_TR: + case VM_REG_GUEST_LDTR: + return (TRUE); + default: + return (FALSE); + } +} + +int +vm_get_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc) +{ + + if (vcpu < 0 || vcpu >= vm->maxcpus) + return (EINVAL); + + if (!is_segment_register(reg) && !is_descriptor_table(reg)) + return (EINVAL); + + return (VMGETDESC(vm->cookie, vcpu, reg, desc)); +} + +int +vm_set_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc) +{ + if (vcpu < 0 || vcpu >= vm->maxcpus) + return (EINVAL); + + if (!is_segment_register(reg) && !is_descriptor_table(reg)) + return (EINVAL); + + return (VMSETDESC(vm->cookie, vcpu, reg, desc)); +} + +static void +restore_guest_fpustate(struct vcpu *vcpu) +{ + + /* flush host state to the pcb */ + fpuexit(curthread); + + /* restore guest FPU state */ + fpu_stop_emulating(); + fpurestore(vcpu->guestfpu); + + /* restore guest XCR0 if XSAVE is enabled in the host */ + if (rcr4() & CR4_XSAVE) + load_xcr(0, vcpu->guest_xcr0); + + /* + * The FPU is now "dirty" with the guest's state so turn on emulation + * to trap any access to the FPU by the host. 
+ */ + fpu_start_emulating(); +} + +static void +save_guest_fpustate(struct vcpu *vcpu) +{ + + if ((rcr0() & CR0_TS) == 0) + panic("fpu emulation not enabled in host!"); + + /* save guest XCR0 and restore host XCR0 */ + if (rcr4() & CR4_XSAVE) { + vcpu->guest_xcr0 = rxcr(0); + load_xcr(0, vmm_get_host_xcr0()); + } + + /* save guest FPU state */ + fpu_stop_emulating(); + fpusave(vcpu->guestfpu); +#ifdef __FreeBSD__ + fpu_start_emulating(); +#else + /* + * When the host state has been restored, we should not re-enable + * CR0.TS on illumos for eager FPU. + */ +#endif +} + +static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); + +static int +vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate, + bool from_idle) +{ + struct vcpu *vcpu; + int error; + + vcpu = &vm->vcpu[vcpuid]; + vcpu_assert_locked(vcpu); + + /* + * State transitions from the vmmdev_ioctl() must always begin from + * the VCPU_IDLE state. This guarantees that there is only a single + * ioctl() operating on a vcpu at any point. + */ + if (from_idle) { + while (vcpu->state != VCPU_IDLE) { + vcpu->reqidle = 1; + vcpu_notify_event_locked(vcpu, false); + VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to " + "idle requested", vcpu_state2str(vcpu->state)); +#ifdef __FreeBSD__ + msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); +#else + cv_wait(&vcpu->state_cv, &vcpu->mtx.m); +#endif + } + } else { + KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " + "vcpu idle state")); + } + + if (vcpu->state == VCPU_RUNNING) { + KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " + "mismatch for running vcpu", curcpu, vcpu->hostcpu)); + } else { + KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " + "vcpu that is not running", vcpu->hostcpu)); + } + + /* + * The following state transitions are allowed: + * IDLE -> FROZEN -> IDLE + * FROZEN -> RUNNING -> FROZEN + * FROZEN -> SLEEPING -> FROZEN + */ + switch (vcpu->state) { + case VCPU_IDLE: + case VCPU_RUNNING: + case VCPU_SLEEPING: + error = (newstate != VCPU_FROZEN); + break; + case VCPU_FROZEN: + error = (newstate == VCPU_FROZEN); + break; + default: + error = 1; + break; + } + + if (newstate == VCPU_RUNNING) { + while (vcpu->runblock != 0) { +#ifdef __FreeBSD__ + msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0); +#else + cv_wait(&vcpu->state_cv, &vcpu->mtx.m); +#endif + } + } + + if (error) + return (EBUSY); + + VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s", + vcpu_state2str(vcpu->state), vcpu_state2str(newstate)); + + vcpu->state = newstate; + if (newstate == VCPU_RUNNING) + vcpu->hostcpu = curcpu; + else + vcpu->hostcpu = NOCPU; + + if (newstate == VCPU_IDLE || + (newstate == VCPU_FROZEN && vcpu->runblock != 0)) { +#ifdef __FreeBSD__ + wakeup(&vcpu->state); +#else + cv_broadcast(&vcpu->state_cv); +#endif + } + + return (0); +} + +static void +vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) +{ + int error; + + if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0) + panic("Error %d setting state to %d\n", error, newstate); +} + +static void +vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate) +{ + int error; + + if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0) + panic("Error %d setting state to %d", error, newstate); +} + +/* + * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. 
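The transition diagram enforced by vcpu_set_state_locked() a little further up (every change of state passes through FROZEN, which is what serializes ioctl-driven transitions against a running vCPU) can be restated as a small stand-alone predicate:

#include <stdio.h>
#include <stdbool.h>

/*
 * Allowed transitions, per the comment in vcpu_set_state_locked():
 *   IDLE -> FROZEN -> IDLE
 *   FROZEN -> RUNNING -> FROZEN
 *   FROZEN -> SLEEPING -> FROZEN
 */
enum state { IDLE, FROZEN, RUNNING, SLEEPING };

static bool
transition_ok(enum state cur, enum state next)
{
	if (cur == FROZEN)
		return (next != FROZEN);	/* thaw to any other state */
	return (next == FROZEN);		/* everything else must freeze first */
}

int
main(void)
{
	printf("IDLE -> FROZEN:      %d\n", transition_ok(IDLE, FROZEN));	/* 1 */
	printf("IDLE -> RUNNING:     %d\n", transition_ok(IDLE, RUNNING));	/* 0 */
	printf("FROZEN -> RUNNING:   %d\n", transition_ok(FROZEN, RUNNING));	/* 1 */
	printf("RUNNING -> SLEEPING: %d\n", transition_ok(RUNNING, SLEEPING));	/* 0 */
	return (0);
}

Running the sketch prints 1/0/1/0, matching the diagram above.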
+ */ +static int +vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) +{ + struct vcpu *vcpu; +#ifdef __FreeBSD__ + const char *wmesg; +#else + const char *wmesg __unused; +#endif + int t, vcpu_halted, vm_halted; + + KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); + + vcpu = &vm->vcpu[vcpuid]; + vcpu_halted = 0; + vm_halted = 0; + + vcpu_lock(vcpu); + while (1) { + /* + * Do a final check for pending NMI or interrupts before + * really putting this thread to sleep. Also check for + * software events that would cause this vcpu to wakeup. + * + * These interrupts/events could have happened after the + * vcpu returned from VMRUN() and before it acquired the + * vcpu lock above. + */ + if (vm->suspend || vcpu->reqidle) + break; + if (vm_nmi_pending(vm, vcpuid)) + break; + if (!intr_disabled) { + if (vm_extint_pending(vm, vcpuid) || + vlapic_pending_intr(vcpu->vlapic, NULL)) { + break; + } + } + + /* Don't go to sleep if the vcpu thread needs to yield */ + if (vcpu_should_yield(vm, vcpuid)) + break; + + if (vcpu_debugged(vm, vcpuid)) + break; + + /* + * Some Linux guests implement "halt" by having all vcpus + * execute HLT with interrupts disabled. 'halted_cpus' keeps + * track of the vcpus that have entered this state. When all + * vcpus enter the halted state the virtual machine is halted. + */ + if (intr_disabled) { + wmesg = "vmhalt"; + VCPU_CTR0(vm, vcpuid, "Halted"); + if (!vcpu_halted && halt_detection_enabled) { + vcpu_halted = 1; + CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); + } + if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { + vm_halted = 1; + break; + } + } else { + wmesg = "vmidle"; + } + + t = ticks; + vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); +#ifdef __FreeBSD__ + /* + * XXX msleep_spin() cannot be interrupted by signals so + * wake up periodically to check pending signals. + */ + msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); +#else + /* + * Fortunately, cv_wait_sig can be interrupted by signals, so + * there is no need to periodically wake up. + */ + (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m); +#endif + vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); + vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); + } + + if (vcpu_halted) + CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); + + vcpu_unlock(vcpu); + + if (vm_halted) + vm_suspend(vm, VM_SUSPEND_HALT); + + return (0); +} + +static int +vm_handle_paging(struct vm *vm, int vcpuid, bool *retu) +{ + int rv, ftype; + struct vm_map *map; + struct vcpu *vcpu; + struct vm_exit *vme; + + vcpu = &vm->vcpu[vcpuid]; + vme = &vcpu->exitinfo; + + KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", + __func__, vme->inst_length)); + + ftype = vme->u.paging.fault_type; + KASSERT(ftype == VM_PROT_READ || + ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, + ("vm_handle_paging: invalid fault_type %d", ftype)); + + if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { + rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), + vme->u.paging.gpa, ftype); + if (rv == 0) { + VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx", + ftype == VM_PROT_READ ? 
"accessed" : "dirty", + vme->u.paging.gpa); + goto done; + } + } + + map = &vm->vmspace->vm_map; + rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL); + + VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, " + "ftype = %d", rv, vme->u.paging.gpa, ftype); + + if (rv != KERN_SUCCESS) + return (EFAULT); +done: + return (0); +} + +static int +vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) +{ + struct vie *vie; + struct vcpu *vcpu; + struct vm_exit *vme; + uint64_t gla, gpa, cs_base; + struct vm_guest_paging *paging; + mem_region_read_t mread; + mem_region_write_t mwrite; + enum vm_cpu_mode cpu_mode; + int cs_d, error, fault; + + vcpu = &vm->vcpu[vcpuid]; + vme = &vcpu->exitinfo; + + KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", + __func__, vme->inst_length)); + + gla = vme->u.inst_emul.gla; + gpa = vme->u.inst_emul.gpa; + cs_base = vme->u.inst_emul.cs_base; + cs_d = vme->u.inst_emul.cs_d; + vie = &vme->u.inst_emul.vie; + paging = &vme->u.inst_emul.paging; + cpu_mode = paging->cpu_mode; + + VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa); + + /* Fetch, decode and emulate the faulting instruction */ + if (vie->num_valid == 0) { + error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip + + cs_base, VIE_INST_SIZE, vie, &fault); + } else { + /* + * The instruction bytes have already been copied into 'vie' + */ + error = fault = 0; + } + if (error || fault) + return (error); + + if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) { + VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx", + vme->rip + cs_base); + *retu = true; /* dump instruction bytes in userspace */ + return (0); + } + + /* + * Update 'nextrip' based on the length of the emulated instruction. + */ + vme->inst_length = vie->num_processed; + vcpu->nextrip += vie->num_processed; + VCPU_CTR1(vm, vcpuid, "nextrip updated to %#lx after instruction " + "decoding", vcpu->nextrip); + + /* return to userland unless this is an in-kernel emulated device */ + if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { + mread = lapic_mmio_read; + mwrite = lapic_mmio_write; + } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { + mread = vioapic_mmio_read; + mwrite = vioapic_mmio_write; + } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { + mread = vhpet_mmio_read; + mwrite = vhpet_mmio_write; + } else { + *retu = true; + return (0); + } + + error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging, + mread, mwrite, retu); + + return (error); +} + +static int +vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu) +{ +#ifdef __FreeBSD__ + int i, done; + struct vcpu *vcpu; + + done = 0; +#else + int i; + struct vcpu *vcpu; +#endif + vcpu = &vm->vcpu[vcpuid]; + + CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); + + /* + * Wait until all 'active_cpus' have suspended themselves. + */ + vcpu_lock(vcpu); + while (1) { + if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { + VCPU_CTR0(vm, vcpuid, "All vcpus suspended"); + break; + } + + VCPU_CTR0(vm, vcpuid, "Sleeping during suspend"); + vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); +#ifdef __FreeBSD__ + msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); +#else + /* + * To prevent vm_handle_suspend from becoming stuck in the + * kernel if the bhyve process driving its vCPUs is killed, + * offer a bail-out, even though not all the vCPUs have reached + * the suspended state. 
+ */ + if (cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m, + hz, TR_CLOCK_TICK) <= 0) { + if ((curproc->p_flag & SEXITING) != 0) { + vcpu_require_state_locked(vm, vcpuid, + VCPU_FROZEN); + break; + } + } +#endif + vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); + } + vcpu_unlock(vcpu); + + /* + * Wakeup the other sleeping vcpus and return to userspace. + */ + for (i = 0; i < vm->maxcpus; i++) { + if (CPU_ISSET(i, &vm->suspended_cpus)) { + vcpu_notify_event(vm, i, false); + } + } + + *retu = true; + return (0); +} + +static int +vm_handle_reqidle(struct vm *vm, int vcpuid, bool *retu) +{ + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle)); + vcpu->reqidle = 0; + vcpu_unlock(vcpu); + *retu = true; + return (0); +} + +#ifndef __FreeBSD__ +static int +vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme) +{ + struct vcpu *cpu = &vm->vcpu[vcpuid]; + const uint32_t code = vme->u.msr.code; + const uint64_t val = vme->u.msr.wval; + + switch (code) { + case MSR_TSC: + cpu->tsc_offset = val - rdtsc(); + return (0); + } + + return (-1); +} +#endif /* __FreeBSD__ */ + +int +vm_suspend(struct vm *vm, enum vm_suspend_how how) +{ + int i; + + if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) + return (EINVAL); + + if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) { + VM_CTR2(vm, "virtual machine already suspended %d/%d", + vm->suspend, how); + return (EALREADY); + } + + VM_CTR1(vm, "virtual machine successfully suspended %d", how); + + /* + * Notify all active vcpus that they are now suspended. + */ + for (i = 0; i < vm->maxcpus; i++) { + if (CPU_ISSET(i, &vm->active_cpus)) + vcpu_notify_event(vm, i, false); + } + + return (0); +} + +void +vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, + ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_SUSPENDED; + vmexit->u.suspended.how = vm->suspend; +} + +void +vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_DEBUG; +} + +void +vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_RUNBLOCK; + vmm_stat_incr(vm, vcpuid, VMEXIT_RUNBLOCK, 1); +} + +void +vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_REQIDLE; + vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1); +} + +void +vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_BOGUS; + vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); +} + +#ifndef __FreeBSD__ +/* + * Some vmm resources, such as the lapic, may have CPU-specific resources + * allocated to them which would benefit from migration onto the host CPU which + * is processing the vcpu state. 
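One detail worth calling out from vm_suspend() above: the compare-and-swap on vm->suspend means only the first suspend request takes effect, and later callers get EALREADY. A user-space model of that gate using C11 atomics (a sketch of the idea, not the kernel primitive actually used):

#include <stdio.h>
#include <stdatomic.h>
#include <errno.h>

static _Atomic int suspend_how;		/* 0 == VM_SUSPEND_NONE */

static int
request_suspend(int how)
{
	int expected = 0;

	if (!atomic_compare_exchange_strong(&suspend_how, &expected, how))
		return (EALREADY);
	return (0);
}

int
main(void)
{
	printf("first request:  %d\n", request_suspend(1));	/* 0 */
	printf("second request: %d\n", request_suspend(2));	/* EALREADY */
	return (0);
}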
+ */ +static void +vm_localize_resources(struct vm *vm, struct vcpu *vcpu) +{ + /* + * Localizing cyclic resources requires acquisition of cpu_lock, and + * doing so with kpreempt disabled is a recipe for deadlock disaster. + */ + VERIFY(curthread->t_preempt == 0); + + /* + * Do not bother with localization if this vCPU is about to return to + * the host CPU it was last localized to. + */ + if (vcpu->lastloccpu == curcpu) + return; + + /* + * Localize system-wide resources to the primary boot vCPU. While any + * of the other vCPUs may access them, it keeps the potential interrupt + * footprint constrained to CPUs involved with this instance. + */ + if (vcpu == &vm->vcpu[0]) { + vhpet_localize_resources(vm->vhpet); + vrtc_localize_resources(vm->vrtc); + vatpit_localize_resources(vm->vatpit); + } + + vlapic_localize_resources(vcpu->vlapic); + + vcpu->lastloccpu = curcpu; +} + +static void +vmm_savectx(void *arg) +{ + vm_thread_ctx_t *vtc = arg; + struct vm *vm = vtc->vtc_vm; + const int vcpuid = vtc->vtc_vcpuid; + + if (ops->vmsavectx != NULL) { + ops->vmsavectx(vm->cookie, vcpuid); + } + + /* + * If the CPU holds the restored guest FPU state, save it and restore + * the host FPU state before this thread goes off-cpu. + */ + if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) { + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + save_guest_fpustate(vcpu); + vtc->vtc_status &= ~VTCS_FPU_RESTORED; + } +} + +static void +vmm_restorectx(void *arg) +{ + vm_thread_ctx_t *vtc = arg; + struct vm *vm = vtc->vtc_vm; + const int vcpuid = vtc->vtc_vcpuid; + + /* + * When coming back on-cpu, only restore the guest FPU status if the + * thread is in a context marked as requiring it. This should be rare, + * occurring only when a future logic error results in a voluntary + * sleep during the VMRUN critical section. + * + * The common case will result in elision of the guest FPU state + * restoration, deferring that action until it is clearly necessary + * during vm_run. + */ + VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0); + if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) { + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + restore_guest_fpustate(vcpu); + vtc->vtc_status |= VTCS_FPU_RESTORED; + } + + if (ops->vmrestorectx != NULL) { + ops->vmrestorectx(vm->cookie, vcpuid); + } + +} + +/* + * If we're in removectx(), we might still have state to tidy up. 
+ */ +static void +vmm_freectx(void *arg, int isexec) +{ + vmm_savectx(arg); +} + +#endif /* __FreeBSD */ + +int +vm_run(struct vm *vm, struct vm_run *vmrun) +{ + struct vm_eventinfo evinfo; + int error, vcpuid; + struct vcpu *vcpu; +#ifdef __FreeBSD__ + struct pcb *pcb; +#endif + uint64_t tscval; + struct vm_exit *vme; + bool retu, intr_disabled; + pmap_t pmap; +#ifndef __FreeBSD__ + vm_thread_ctx_t vtc; + int affinity_type = CPU_CURRENT; +#endif + + vcpuid = vmrun->cpuid; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (!CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EINVAL); + + if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) + return (EINVAL); + + pmap = vmspace_pmap(vm->vmspace); + vcpu = &vm->vcpu[vcpuid]; + vme = &vcpu->exitinfo; + evinfo.rptr = &vcpu->runblock; + evinfo.sptr = &vm->suspend; + evinfo.iptr = &vcpu->reqidle; + +#ifndef __FreeBSD__ + vtc.vtc_vm = vm; + vtc.vtc_vcpuid = vcpuid; + vtc.vtc_status = 0; + + installctx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL, + NULL, vmm_freectx); +#endif + +restart: +#ifndef __FreeBSD__ + thread_affinity_set(curthread, affinity_type); + /* + * Resource localization should happen after the CPU affinity for the + * thread has been set to ensure that access from restricted contexts, + * such as VMX-accelerated APIC operations, can occur without inducing + * cyclic cross-calls. + * + * This must be done prior to disabling kpreempt via critical_enter(). + */ + vm_localize_resources(vm, vcpu); + + affinity_type = CPU_CURRENT; +#endif + + critical_enter(); + + KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), + ("vm_run: absurd pm_active")); + + tscval = rdtsc(); + +#ifdef __FreeBSD__ + pcb = PCPU_GET(curpcb); + set_pcb_flags(pcb, PCB_FULL_IRET); +#else + /* Force a trip through update_sregs to reload %fs/%gs and friends */ + PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb); +#endif + +#ifdef __FreeBSD__ + restore_guest_fpustate(vcpu); +#else + if ((vtc.vtc_status & VTCS_FPU_RESTORED) == 0) { + restore_guest_fpustate(vcpu); + vtc.vtc_status |= VTCS_FPU_RESTORED; + } + vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL; +#endif + + vcpu_require_state(vm, vcpuid, VCPU_RUNNING); + error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo); + vcpu_require_state(vm, vcpuid, VCPU_FROZEN); + +#ifdef __FreeBSD__ + save_guest_fpustate(vcpu); +#else + vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL; +#endif + +#ifndef __FreeBSD__ + /* + * Once clear of the delicate contexts comprising the VM_RUN handler, + * thread CPU affinity can be loosened while other processing occurs. 
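vm_run() and the vmm_savectx()/vmm_restorectx() callbacks above cooperate through the VTCS_FPU_RESTORED and VTCS_FPU_CTX_CRITICAL flags: the guest FPU is always saved when the thread goes off-CPU, but it is eagerly re-restored on return only if the switch happened inside the VMRUN critical section; otherwise the restore is deferred until vm_run() needs it. A toy model of that handshake, with print statements standing in for the real save_guest_fpustate()/restore_guest_fpustate() calls:

#include <stdio.h>
#include <stdbool.h>

struct ctx {
	bool restored;	/* models VTCS_FPU_RESTORED */
	bool critical;	/* models VTCS_FPU_CTX_CRITICAL */
};

static void
offcpu(struct ctx *c)			/* vmm_savectx() */
{
	if (c->restored) {
		printf("  save guest FPU, restore host FPU\n");
		c->restored = false;
	}
}

static void
oncpu(struct ctx *c)			/* vmm_restorectx() */
{
	if (c->critical && !c->restored) {
		printf("  eagerly restore guest FPU\n");
		c->restored = true;
	}
}

int
main(void)
{
	struct ctx c = { false, false };

	/* vm_run() entering the VMRUN critical section */
	c.critical = true;
	if (!c.restored) {
		printf("lazy restore before VMRUN\n");
		c.restored = true;
	}

	printf("involuntary switch inside the critical section:\n");
	offcpu(&c);
	oncpu(&c);

	/* after VMRUN the critical flag is dropped; later switches skip the restore */
	c.critical = false;
	printf("switch after the critical section:\n");
	offcpu(&c);
	oncpu(&c);
	return (0);
}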
+ */ + thread_affinity_clear(curthread); +#endif + + vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); + + critical_exit(); + + if (error == 0) { + retu = false; + vcpu->nextrip = vme->rip + vme->inst_length; + switch (vme->exitcode) { + case VM_EXITCODE_REQIDLE: + error = vm_handle_reqidle(vm, vcpuid, &retu); + break; + case VM_EXITCODE_SUSPENDED: + error = vm_handle_suspend(vm, vcpuid, &retu); + break; + case VM_EXITCODE_IOAPIC_EOI: + vioapic_process_eoi(vm, vcpuid, + vme->u.ioapic_eoi.vector); + break; + case VM_EXITCODE_RUNBLOCK: + break; + case VM_EXITCODE_HLT: + intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); + error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu); + break; + case VM_EXITCODE_PAGING: + error = vm_handle_paging(vm, vcpuid, &retu); + break; + case VM_EXITCODE_INST_EMUL: + error = vm_handle_inst_emul(vm, vcpuid, &retu); + break; + case VM_EXITCODE_INOUT: + case VM_EXITCODE_INOUT_STR: + error = vm_handle_inout(vm, vcpuid, vme, &retu); + break; + case VM_EXITCODE_MONITOR: + case VM_EXITCODE_MWAIT: + case VM_EXITCODE_VMINSN: + vm_inject_ud(vm, vcpuid); + break; +#ifndef __FreeBSD__ + case VM_EXITCODE_WRMSR: + if (vm_handle_wrmsr(vm, vcpuid, vme) != 0) { + retu = true; + } + break; + + case VM_EXITCODE_HT: { + affinity_type = CPU_BEST; + break; + } + + case VM_EXITCODE_MTRAP: + vm_suspend_cpu(vm, vcpuid); + retu = true; + break; +#endif + default: + retu = true; /* handled in userland */ + break; + } + } + + if (error == 0 && retu == false) + goto restart; + +#ifndef __FreeBSD__ + removectx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL, + NULL, vmm_freectx); +#endif + + VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode); + + /* copy the exit information */ + bcopy(vme, &vmrun->vm_exit, sizeof (struct vm_exit)); + return (error); +} + +int +vm_restart_instruction(void *arg, int vcpuid) +{ + struct vm *vm; + struct vcpu *vcpu; + enum vcpu_state state; + uint64_t rip; + int error; + + vm = arg; + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + state = vcpu_get_state(vm, vcpuid, NULL); + if (state == VCPU_RUNNING) { + /* + * When a vcpu is "running" the next instruction is determined + * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. + * Thus setting 'inst_length' to zero will cause the current + * instruction to be restarted. + */ + vcpu->exitinfo.inst_length = 0; + VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by " + "setting inst_length to zero", vcpu->exitinfo.rip); + } else if (state == VCPU_FROZEN) { + /* + * When a vcpu is "frozen" it is outside the critical section + * around VMRUN() and 'nextrip' points to the next instruction. + * Thus instruction restart is achieved by setting 'nextrip' + * to the vcpu's %rip. 
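+ * (Illustrative example, not from the original source: if the vcpu
+ * exited at %rip 0x1000 on a 3-byte instruction, 'nextrip' would be
+ * 0x1003; rewinding it to the current %rip of 0x1000 makes the next
+ * VMRUN re-execute that same instruction.)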
+ */ + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip); + KASSERT(!error, ("%s: error %d getting rip", __func__, error)); + VCPU_CTR2(vm, vcpuid, "restarting instruction by updating " + "nextrip from %#lx to %#lx", vcpu->nextrip, rip); + vcpu->nextrip = rip; + } else { + panic("%s: invalid state %d", __func__, state); + } + return (0); +} + +int +vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) +{ + struct vcpu *vcpu; + int type, vector; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + if (info & VM_INTINFO_VALID) { + type = info & VM_INTINFO_TYPE; + vector = info & 0xff; + if (type == VM_INTINFO_NMI && vector != IDT_NMI) + return (EINVAL); + if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) + return (EINVAL); + if (info & VM_INTINFO_RSVD) + return (EINVAL); + } else { + info = 0; + } + VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info); + vcpu->exitintinfo = info; + return (0); +} + +enum exc_class { + EXC_BENIGN, + EXC_CONTRIBUTORY, + EXC_PAGEFAULT +}; + +#define IDT_VE 20 /* Virtualization Exception (Intel specific) */ + +static enum exc_class +exception_class(uint64_t info) +{ + int type, vector; + +#ifdef __FreeBSD__ + KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info)); +#else + KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %lx", info)); +#endif + type = info & VM_INTINFO_TYPE; + vector = info & 0xff; + + /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ + switch (type) { + case VM_INTINFO_HWINTR: + case VM_INTINFO_SWINTR: + case VM_INTINFO_NMI: + return (EXC_BENIGN); + default: + /* + * Hardware exception. + * + * SVM and VT-x use identical type values to represent NMI, + * hardware interrupt and software interrupt. + * + * SVM uses type '3' for all exceptions. VT-x uses type '3' + * for exceptions except #BP and #OF. #BP and #OF use a type + * value of '5' or '6'. Therefore we don't check for explicit + * values of 'type' to classify 'intinfo' into a hardware + * exception. + */ + break; + } + + switch (vector) { + case IDT_PF: + case IDT_VE: + return (EXC_PAGEFAULT); + case IDT_DE: + case IDT_TS: + case IDT_NP: + case IDT_SS: + case IDT_GP: + return (EXC_CONTRIBUTORY); + default: + return (EXC_BENIGN); + } +} + +static int +nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2, + uint64_t *retinfo) +{ + enum exc_class exc1, exc2; + int type1, vector1; + +#ifdef __FreeBSD__ + KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1)); + KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2)); +#else + KASSERT(info1 & VM_INTINFO_VALID, ("info1 %lx is not valid", info1)); + KASSERT(info2 & VM_INTINFO_VALID, ("info2 %lx is not valid", info2)); +#endif + + /* + * If an exception occurs while attempting to call the double-fault + * handler the processor enters shutdown mode (aka triple fault). + */ + type1 = info1 & VM_INTINFO_TYPE; + vector1 = info1 & 0xff; + if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { + VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)", + info1, info2); + vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); + *retinfo = 0; + return (0); + } + + /* + * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 + */ + exc1 = exception_class(info1); + exc2 = exception_class(info2); + if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || + (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { + /* Convert nested fault into a double fault. 
*/ + *retinfo = IDT_DF; + *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; + *retinfo |= VM_INTINFO_DEL_ERRCODE; + } else { + /* Handle exceptions serially */ + *retinfo = info2; + } + return (1); +} + +static uint64_t +vcpu_exception_intinfo(struct vcpu *vcpu) +{ + uint64_t info = 0; + + if (vcpu->exception_pending) { + info = vcpu->exc_vector & 0xff; + info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; + if (vcpu->exc_errcode_valid) { + info |= VM_INTINFO_DEL_ERRCODE; + info |= (uint64_t)vcpu->exc_errcode << 32; + } + } + return (info); +} + +int +vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) +{ + struct vcpu *vcpu; + uint64_t info1, info2; + int valid; + + KASSERT(vcpuid >= 0 && + vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid)); + + vcpu = &vm->vcpu[vcpuid]; + + info1 = vcpu->exitintinfo; + vcpu->exitintinfo = 0; + + info2 = 0; + if (vcpu->exception_pending) { + info2 = vcpu_exception_intinfo(vcpu); + vcpu->exception_pending = 0; + VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx", + vcpu->exc_vector, info2); + } + + if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { + valid = nested_fault(vm, vcpuid, info1, info2, retinfo); + } else if (info1 & VM_INTINFO_VALID) { + *retinfo = info1; + valid = 1; + } else if (info2 & VM_INTINFO_VALID) { + *retinfo = info2; + valid = 1; + } else { + valid = 0; + } + + if (valid) { + VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), " + "retinfo(%#lx)", __func__, info1, info2, *retinfo); + } + + return (valid); +} + +int +vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + *info1 = vcpu->exitintinfo; + *info2 = vcpu_exception_intinfo(vcpu); + return (0); +} + +int +vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid, + uint32_t errcode, int restart_instruction) +{ + struct vcpu *vcpu; + uint64_t regval; + int error; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (vector < 0 || vector >= 32) + return (EINVAL); + + /* + * A double fault exception should never be injected directly into + * the guest. It is a derived exception that results from specific + * combinations of nested faults. + */ + if (vector == IDT_DF) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + if (vcpu->exception_pending) { + VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to " + "pending exception %d", vector, vcpu->exc_vector); + return (EBUSY); + } + + if (errcode_valid) { + /* + * Exceptions don't deliver an error code in real mode. + */ + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, ®val); + KASSERT(!error, ("%s: error %d getting CR0", __func__, error)); + if (!(regval & CR0_PE)) + errcode_valid = 0; + } + + /* + * From section 26.6.1 "Interruptibility State" in Intel SDM: + * + * Event blocking by "STI" or "MOV SS" is cleared after guest executes + * one instruction or incurs an exception. 
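+ * (Supplementary note: the injected exception counts as such an event,
+ * so any recorded interrupt shadow is stale; this is why
+ * VM_REG_GUEST_INTR_SHADOW is cleared immediately below.)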
+ */ + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); + KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", + __func__, error)); + + if (restart_instruction) + vm_restart_instruction(vm, vcpuid); + + vcpu->exception_pending = 1; + vcpu->exc_vector = vector; + vcpu->exc_errcode = errcode; + vcpu->exc_errcode_valid = errcode_valid; + VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector); + return (0); +} + +void +vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid, + int errcode) +{ + struct vm *vm; + int error, restart_instruction; + + vm = vmarg; + restart_instruction = 1; + + error = vm_inject_exception(vm, vcpuid, vector, errcode_valid, + errcode, restart_instruction); + KASSERT(error == 0, ("vm_inject_exception error %d", error)); +} + +void +vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2) +{ + struct vm *vm; + int error; + + vm = vmarg; + VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx", + error_code, cr2); + + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2); + KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); + + vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code); +} + +static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); + +int +vm_inject_nmi(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu->nmi_pending = 1; + vcpu_notify_event(vm, vcpuid, false); + return (0); +} + +int +vm_nmi_pending(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + return (vcpu->nmi_pending); +} + +void +vm_nmi_clear(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + if (vcpu->nmi_pending == 0) + panic("vm_nmi_clear: inconsistent nmi_pending state"); + + vcpu->nmi_pending = 0; + vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); +} + +static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); + +int +vm_inject_extint(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu->extint_pending = 1; + vcpu_notify_event(vm, vcpuid, false); + return (0); +} + +int +vm_extint_pending(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + panic("vm_extint_pending: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + return (vcpu->extint_pending); +} + +void +vm_extint_clear(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + panic("vm_extint_pending: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + if (vcpu->extint_pending == 0) + panic("vm_extint_clear: inconsistent extint_pending state"); + + vcpu->extint_pending = 0; + vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); +} + +int +vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) +{ + if (vcpu < 0 || vcpu >= vm->maxcpus) + return (EINVAL); + + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + return (VMGETCAP(vm->cookie, vcpu, type, retval)); +} + +int +vm_set_capability(struct vm *vm, int vcpu, int type, int val) +{ + if (vcpu < 0 || vcpu >= vm->maxcpus) + return (EINVAL); + + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + 
return (VMSETCAP(vm->cookie, vcpu, type, val)); +} + +struct vlapic * +vm_lapic(struct vm *vm, int cpu) +{ + return (vm->vcpu[cpu].vlapic); +} + +struct vioapic * +vm_ioapic(struct vm *vm) +{ + + return (vm->vioapic); +} + +struct vhpet * +vm_hpet(struct vm *vm) +{ + + return (vm->vhpet); +} + +#ifdef __FreeBSD__ +boolean_t +vmm_is_pptdev(int bus, int slot, int func) +{ + int found, i, n; + int b, s, f; + char *val, *cp, *cp2; + + /* + * XXX + * The length of an environment variable is limited to 128 bytes which + * puts an upper limit on the number of passthru devices that may be + * specified using a single environment variable. + * + * Work around this by scanning multiple environment variable + * names instead of a single one - yuck! + */ + const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; + + /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ + found = 0; + for (i = 0; names[i] != NULL && !found; i++) { + cp = val = kern_getenv(names[i]); + while (cp != NULL && *cp != '\0') { + if ((cp2 = strchr(cp, ' ')) != NULL) + *cp2 = '\0'; + + n = sscanf(cp, "%d/%d/%d", &b, &s, &f); + if (n == 3 && bus == b && slot == s && func == f) { + found = 1; + break; + } + + if (cp2 != NULL) + *cp2++ = ' '; + + cp = cp2; + } + freeenv(val); + } + return (found); +} +#endif + +void * +vm_iommu_domain(struct vm *vm) +{ + + return (vm->iommu); +} + +int +vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, + bool from_idle) +{ + int error; + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + panic("vm_set_run_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle); + vcpu_unlock(vcpu); + + return (error); +} + +enum vcpu_state +vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) +{ + struct vcpu *vcpu; + enum vcpu_state state; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + panic("vm_get_run_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + state = vcpu->state; + if (hostcpu != NULL) + *hostcpu = vcpu->hostcpu; + vcpu_unlock(vcpu); + + return (state); +} + +void +vcpu_block_run(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vcpu_block_run: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + vcpu->runblock++; + if (vcpu->runblock == 1 && vcpu->state == VCPU_RUNNING) { + vcpu_notify_event_locked(vcpu, false); + } + while (vcpu->state == VCPU_RUNNING) { +#ifdef __FreeBSD__ + msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0); +#else + cv_wait(&vcpu->state_cv, &vcpu->mtx.m); +#endif + } + vcpu_unlock(vcpu); +} + +void +vcpu_unblock_run(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vcpu_block_run: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + KASSERT(vcpu->runblock != 0, ("expected non-zero runblock")); + vcpu->runblock--; + if (vcpu->runblock == 0) { +#ifdef __FreeBSD__ + wakeup(&vcpu->state); +#else + cv_broadcast(&vcpu->state_cv); +#endif + } + vcpu_unlock(vcpu); +} + +#ifndef __FreeBSD__ +uint64_t +vcpu_tsc_offset(struct vm *vm, int vcpuid) +{ + return (vm->vcpu[vcpuid].tsc_offset); +} +#endif /* __FreeBSD__ */ + +int +vm_activate_cpu(struct vm *vm, int vcpuid) +{ + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EBUSY); + + VCPU_CTR0(vm, vcpuid, "activated"); + CPU_SET_ATOMIC(vcpuid, 
&vm->active_cpus); + return (0); +} + +int +vm_suspend_cpu(struct vm *vm, int vcpuid) +{ + int i; + + if (vcpuid < -1 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (vcpuid == -1) { + vm->debug_cpus = vm->active_cpus; + for (i = 0; i < vm->maxcpus; i++) { + if (CPU_ISSET(i, &vm->active_cpus)) + vcpu_notify_event(vm, i, false); + } + } else { + if (!CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EINVAL); + + CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus); + vcpu_notify_event(vm, vcpuid, false); + } + return (0); +} + +int +vm_resume_cpu(struct vm *vm, int vcpuid) +{ + + if (vcpuid < -1 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (vcpuid == -1) { + CPU_ZERO(&vm->debug_cpus); + } else { + if (!CPU_ISSET(vcpuid, &vm->debug_cpus)) + return (EINVAL); + + CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus); + } + return (0); +} + +int +vcpu_debugged(struct vm *vm, int vcpuid) +{ + + return (CPU_ISSET(vcpuid, &vm->debug_cpus)); +} + +cpuset_t +vm_active_cpus(struct vm *vm) +{ + + return (vm->active_cpus); +} + +cpuset_t +vm_debug_cpus(struct vm *vm) +{ + + return (vm->debug_cpus); +} + +cpuset_t +vm_suspended_cpus(struct vm *vm) +{ + + return (vm->suspended_cpus); +} + +void * +vcpu_stats(struct vm *vm, int vcpuid) +{ + + return (vm->vcpu[vcpuid].stats); +} + +int +vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) +{ + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + *state = vm->vcpu[vcpuid].x2apic_state; + + return (0); +} + +int +vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) +{ + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (state >= X2APIC_STATE_LAST) + return (EINVAL); + + vm->vcpu[vcpuid].x2apic_state = state; + + vlapic_set_x2apic_state(vm, vcpuid, state); + + return (0); +} + +/* + * This function is called to ensure that a vcpu "sees" a pending event + * as soon as possible: + * - If the vcpu thread is sleeping then it is woken up. + * - If the vcpu is running on a different host_cpu then an IPI will be directed + * to the host_cpu to cause the vcpu to trap into the hypervisor. + */ +static void +vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr) +{ + int hostcpu; + + hostcpu = vcpu->hostcpu; + if (vcpu->state == VCPU_RUNNING) { + KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); + if (hostcpu != curcpu) { + if (lapic_intr) { + vlapic_post_intr(vcpu->vlapic, hostcpu, + vmm_ipinum); + } else { + ipi_cpu(hostcpu, vmm_ipinum); + } + } else { + /* + * If the 'vcpu' is running on 'curcpu' then it must + * be sending a notification to itself (e.g. SELF_IPI). + * The pending event will be picked up when the vcpu + * transitions back to guest context. 
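+ * (Supplementary note: no IPI or wakeup is issued in this case; falling
+ * through with no action is intentional.)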
+ */ + } + } else { + KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " + "with hostcpu %d", vcpu->state, hostcpu)); + if (vcpu->state == VCPU_SLEEPING) { +#ifdef __FreeBSD__ + wakeup_one(vcpu); +#else + cv_signal(&vcpu->vcpu_cv); +#endif + } + } +} + +void +vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) +{ + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + vcpu_notify_event_locked(vcpu, lapic_intr); + vcpu_unlock(vcpu); +} + +struct vmspace * +vm_get_vmspace(struct vm *vm) +{ + + return (vm->vmspace); +} + +int +vm_apicid2vcpuid(struct vm *vm, int apicid) +{ + /* + * XXX apic id is assumed to be numerically identical to vcpu id + */ + return (apicid); +} + +struct vatpic * +vm_atpic(struct vm *vm) +{ + return (vm->vatpic); +} + +struct vatpit * +vm_atpit(struct vm *vm) +{ + return (vm->vatpit); +} + +struct vpmtmr * +vm_pmtmr(struct vm *vm) +{ + + return (vm->vpmtmr); +} + +struct vrtc * +vm_rtc(struct vm *vm) +{ + + return (vm->vrtc); +} + +enum vm_reg_name +vm_segment_name(int seg) +{ + static enum vm_reg_name seg_names[] = { + VM_REG_GUEST_ES, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS + }; + + KASSERT(seg >= 0 && seg < nitems(seg_names), + ("%s: invalid segment encoding %d", __func__, seg)); + return (seg_names[seg]); +} + +void +vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, + int num_copyinfo) +{ + int idx; + + for (idx = 0; idx < num_copyinfo; idx++) { + if (copyinfo[idx].cookie != NULL) + vm_gpa_release(copyinfo[idx].cookie); + } + bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo)); +} + +int +vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, + int num_copyinfo, int *fault) +{ + int error, idx, nused; + size_t n, off, remaining; + void *hva, *cookie; + uint64_t gpa; + + bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo); + + nused = 0; + remaining = len; + while (remaining > 0) { + KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); + error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault); + if (error || *fault) + return (error); + off = gpa & PAGE_MASK; + n = min(remaining, PAGE_SIZE - off); + copyinfo[nused].gpa = gpa; + copyinfo[nused].len = n; + remaining -= n; + gla += n; + nused++; + } + + for (idx = 0; idx < nused; idx++) { + hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa, + copyinfo[idx].len, prot, &cookie); + if (hva == NULL) + break; + copyinfo[idx].hva = hva; + copyinfo[idx].cookie = cookie; + } + + if (idx != nused) { + vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); + return (EFAULT); + } else { + *fault = 0; + return (0); + } +} + +void +vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, + size_t len) +{ + char *dst; + int idx; + + dst = kaddr; + idx = 0; + while (len > 0) { + bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); + len -= copyinfo[idx].len; + dst += copyinfo[idx].len; + idx++; + } +} + +void +vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, + struct vm_copyinfo *copyinfo, size_t len) +{ + const char *src; + int idx; + + src = kaddr; + idx = 0; + while (len > 0) { + bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); + len -= copyinfo[idx].len; + src += copyinfo[idx].len; + idx++; + } +} + +/* + * Return the amount of in-use and wired memory for the VM. 
Since + * these are global stats, only return the values with for vCPU 0 + */ +VMM_STAT_DECLARE(VMM_MEM_RESIDENT); +VMM_STAT_DECLARE(VMM_MEM_WIRED); + +static void +vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) +{ + + if (vcpu == 0) { + vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT, + PAGE_SIZE * vmspace_resident_count(vm->vmspace)); + } +} + +static void +vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) +{ + + if (vcpu == 0) { + vmm_stat_set(vm, vcpu, VMM_MEM_WIRED, + PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace))); + } +} + +VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); +VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt); + +#ifndef __FreeBSD__ +int +vm_ioport_hook(struct vm *vm, uint_t ioport, vmm_rmem_cb_t rfunc, + vmm_wmem_cb_t wfunc, void *arg, void **cookie) +{ + list_t *ih = &vm->ioport_hooks; + vm_ioport_hook_t *hook, *node; + + if (ioport == 0) { + return (EINVAL); + } + + /* + * Find the node position in the list which this region should be + * inserted behind to maintain sorted order. + */ + for (node = list_tail(ih); node != NULL; node = list_prev(ih, node)) { + if (ioport == node->vmih_ioport) { + /* Reject duplicate port hook */ + return (EEXIST); + } else if (ioport > node->vmih_ioport) { + break; + } + } + + hook = kmem_alloc(sizeof (*hook), KM_SLEEP); + hook->vmih_ioport = ioport; + hook->vmih_arg = arg; + hook->vmih_rmem_cb = rfunc; + hook->vmih_wmem_cb = wfunc; + if (node == NULL) { + list_insert_head(ih, hook); + } else { + list_insert_after(ih, node, hook); + } + + *cookie = (void *)hook; + return (0); +} + +void +vm_ioport_unhook(struct vm *vm, void **cookie) +{ + vm_ioport_hook_t *hook; + list_t *ih = &vm->ioport_hooks; + + hook = *cookie; + list_remove(ih, hook); + kmem_free(hook, sizeof (*hook)); + *cookie = NULL; +} + +int +vm_ioport_handle_hook(struct vm *vm, int cpuid, bool in, int port, int bytes, + uint32_t *val) +{ + vm_ioport_hook_t *hook; + list_t *ih = &vm->ioport_hooks; + int err = 0; + + for (hook = list_head(ih); hook != NULL; hook = list_next(ih, hook)) { + if (hook->vmih_ioport == port) { + break; + } + } + if (hook == NULL) { + return (ENOENT); + } + + if (in) { + uint64_t tval; + + if (hook->vmih_rmem_cb == NULL) { + return (ENOENT); + } + err = hook->vmih_rmem_cb(hook->vmih_arg, (uintptr_t)port, + (uint_t)bytes, &tval); + *val = (uint32_t)tval; + } else { + if (hook->vmih_wmem_cb == NULL) { + return (ENOENT); + } + err = hook->vmih_wmem_cb(hook->vmih_arg, (uintptr_t)port, + (uint_t)bytes, (uint64_t)*val); + } + + return (err); +} + + +#endif /* __FreeBSD__ */ diff --git a/usr/src/uts/i86pc/io/vmm/vmm.conf b/usr/src/uts/i86pc/io/vmm/vmm.conf new file mode 100644 index 0000000000..8833076014 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm.conf @@ -0,0 +1 @@ +name="vmm" parent="pseudo"; diff --git a/usr/src/uts/i86pc/io/vmm/vmm.mapfile b/usr/src/uts/i86pc/io/vmm/vmm.mapfile new file mode 100644 index 0000000000..83c14de895 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm.mapfile @@ -0,0 +1,62 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. 
+# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION ILLUMOSprivate { + global: + # DDI Interfaces + _fini; + _init; + _info; + + # bhyve driver API + vmm_drv_hold; + vmm_drv_rele; + vmm_drv_release_reqd; + vmm_drv_lease_sign; + vmm_drv_lease_break; + vmm_drv_lease_expired; + vmm_drv_gpa2kva; + vmm_drv_ioport_hook; + vmm_drv_ioport_unhook; + vmm_drv_msi; + + # IOMMU API for PCI pass-thru + iommu_add_device; + iommu_host_domain; + iommu_remove_device; + lapic_intr_msi; + vm_iommu_domain; + vm_map_mmio; + vm_unmap_mmio; + + local: + *; +}; diff --git a/usr/src/uts/i86pc/io/vmm/vmm_host.c b/usr/src/uts/i86pc/io/vmm/vmm_host.c new file mode 100644 index 0000000000..9e390c93dd --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_host.c @@ -0,0 +1,207 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/pcpu.h> + +#include <machine/cpufunc.h> +#include <machine/segments.h> +#include <machine/specialreg.h> + +#include "vmm_host.h" + +static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4, + vmm_host_xcr0; +static struct xsave_limits vmm_xsave_limits; + +void +vmm_host_state_init(void) +{ + unsigned int regs[4]; + + vmm_host_efer = rdmsr(MSR_EFER); + vmm_host_pat = rdmsr(MSR_PAT); + + /* + * We always want CR0.TS to be set when the processor does a VM exit. + * + * With emulation turned on unconditionally after a VM exit, we are + * able to trap inadvertent use of the FPU until the guest FPU state + * has been safely squirreled away. + */ + vmm_host_cr0 = rcr0() | CR0_TS; + + /* + * On non-PCID or PCID but without INVPCID support machines, + * we flush kernel i.e. global TLB entries, by temporary + * clearing the CR4.PGE bit, see invltlb_glob(). If + * preemption occurs at the wrong time, cached vmm_host_cr4 + * might store the value with CR4.PGE cleared. Since FreeBSD + * requires support for PG_G on amd64, just set it + * unconditionally. + */ + vmm_host_cr4 = rcr4() | CR4_PGE; + + /* + * Only permit a guest to use XSAVE if the host is using + * XSAVE. Only permit a guest to use XSAVE features supported + * by the host. This ensures that the FPU state used by the + * guest is always a subset of the saved guest FPU state. + * + * In addition, only permit known XSAVE features where the + * rules for which features depend on other features is known + * to properly emulate xsetbv. + */ + if (vmm_host_cr4 & CR4_XSAVE) { + vmm_xsave_limits.xsave_enabled = 1; + vmm_host_xcr0 = rxcr(0); + vmm_xsave_limits.xcr0_allowed = vmm_host_xcr0 & + (XFEATURE_AVX | XFEATURE_MPX | XFEATURE_AVX512); + + cpuid_count(0xd, 0x0, regs); + vmm_xsave_limits.xsave_max_size = regs[1]; + } +} + +uint64_t +vmm_get_host_pat(void) +{ + + return (vmm_host_pat); +} + +uint64_t +vmm_get_host_efer(void) +{ + + return (vmm_host_efer); +} + +uint64_t +vmm_get_host_cr0(void) +{ + + return (vmm_host_cr0); +} + +uint64_t +vmm_get_host_cr4(void) +{ + + return (vmm_host_cr4); +} + +uint64_t +vmm_get_host_xcr0(void) +{ + + return (vmm_host_xcr0); +} + +uint64_t +vmm_get_host_datasel(void) +{ + +#ifdef __FreeBSD__ + return (GSEL(GDATA_SEL, SEL_KPL)); +#else + return (SEL_GDT(GDT_KDATA, SEL_KPL)); +#endif + +} + +uint64_t +vmm_get_host_codesel(void) +{ + +#ifdef __FreeBSD__ + return (GSEL(GCODE_SEL, SEL_KPL)); +#else + return (SEL_GDT(GDT_KCODE, SEL_KPL)); +#endif +} + +uint64_t +vmm_get_host_tsssel(void) +{ + +#ifdef __FreeBSD__ + return (GSEL(GPROC0_SEL, SEL_KPL)); +#else + return (SEL_GDT(GDT_KTSS, SEL_KPL)); +#endif +} + +uint64_t +vmm_get_host_fsbase(void) +{ + +#ifdef __FreeBSD__ + return (0); +#else + return (rdmsr(MSR_FSBASE)); +#endif +} + +uint64_t +vmm_get_host_idtrbase(void) +{ + +#ifdef __FreeBSD__ + return (r_idt.rd_base); +#else + desctbr_t idtr; + + rd_idtr(&idtr); + return (idtr.dtr_base); +#endif +} + +const struct xsave_limits * +vmm_get_xsave_limits(void) +{ + + return (&vmm_xsave_limits); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_host.h b/usr/src/uts/i86pc/io/vmm/vmm_host.h new file mode 100644 index 0000000000..f12047819d --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_host.h @@ -0,0 +1,132 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. 
+ */ + +#ifndef _VMM_HOST_H_ +#define _VMM_HOST_H_ + +#ifndef __FreeBSD__ +#include <sys/cpuvar.h> +#endif + +#ifndef _KERNEL +#error "no user-serviceable parts inside" +#endif + +struct xsave_limits { + int xsave_enabled; + uint64_t xcr0_allowed; + uint32_t xsave_max_size; +}; + +void vmm_host_state_init(void); + +uint64_t vmm_get_host_pat(void); +uint64_t vmm_get_host_efer(void); +uint64_t vmm_get_host_cr0(void); +uint64_t vmm_get_host_cr4(void); +uint64_t vmm_get_host_xcr0(void); +uint64_t vmm_get_host_datasel(void); +uint64_t vmm_get_host_codesel(void); +uint64_t vmm_get_host_tsssel(void); +uint64_t vmm_get_host_fsbase(void); +uint64_t vmm_get_host_idtrbase(void); +const struct xsave_limits *vmm_get_xsave_limits(void); + +/* + * Inline access to host state that is used on every VM entry + */ +static __inline uint64_t +vmm_get_host_trbase(void) +{ + +#ifdef __FreeBSD__ + return ((uint64_t)PCPU_GET(tssp)); +#else + return ((u_long)CPU->cpu_tss); +#endif +} + +static __inline uint64_t +vmm_get_host_gdtrbase(void) +{ + +#ifdef __FreeBSD__ + return ((uint64_t)&gdt[NGDT * curcpu]); +#else + desctbr_t gdtr; + + rd_gdtr(&gdtr); + return (gdtr.dtr_base); +#endif +} + +#ifdef __FreeBSD__ +struct pcpu; +extern struct pcpu __pcpu[]; +#endif + +static __inline uint64_t +vmm_get_host_gsbase(void) +{ + +#ifdef __FreeBSD__ + return ((uint64_t)&__pcpu[curcpu]); +#else + return (rdmsr(MSR_GSBASE)); +#endif +} + +#ifndef __FreeBSD__ +static __inline uint64_t +vmm_get_host_fssel(void) +{ + return (KFS_SEL); +} + +static __inline uint64_t +vmm_get_host_gssel(void) +{ + return (KGS_SEL); +} +#endif +#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c b/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c new file mode 100644 index 0000000000..ea96cd8db0 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c @@ -0,0 +1,2684 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 Sandvine, Inc. + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#ifdef _KERNEL +#include <sys/param.h> +#include <sys/pcpu.h> +#include <sys/systm.h> +#include <sys/proc.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/vmparam.h> +#include <machine/vmm.h> +#else /* !_KERNEL */ +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/_iovec.h> + +#include <machine/vmm.h> + +#include <assert.h> +#include <vmmapi.h> +#define KASSERT(exp,msg) assert((exp)) +#endif /* _KERNEL */ + +#include <machine/vmm_instruction_emul.h> +#include <x86/psl.h> +#include <x86/specialreg.h> + +/* struct vie_op.op_type */ +enum { + VIE_OP_TYPE_NONE = 0, + VIE_OP_TYPE_MOV, + VIE_OP_TYPE_MOVSX, + VIE_OP_TYPE_MOVZX, + VIE_OP_TYPE_AND, + VIE_OP_TYPE_OR, + VIE_OP_TYPE_SUB, + VIE_OP_TYPE_TWO_BYTE, + VIE_OP_TYPE_PUSH, + VIE_OP_TYPE_CMP, + VIE_OP_TYPE_POP, + VIE_OP_TYPE_MOVS, + VIE_OP_TYPE_GROUP1, + VIE_OP_TYPE_STOS, + VIE_OP_TYPE_BITTEST, + VIE_OP_TYPE_TWOB_GRP15, + VIE_OP_TYPE_ADD, + VIE_OP_TYPE_LAST +}; + +/* struct vie_op.op_flags */ +#define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */ +#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ +#define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */ +#define VIE_OP_F_NO_MODRM (1 << 3) +#define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4) + +#ifdef _KERNEL +static const struct vie_op two_byte_opcodes[256] = { + [0xAE] = { + .op_byte = 0xAE, + .op_type = VIE_OP_TYPE_TWOB_GRP15, + }, + [0xB6] = { + .op_byte = 0xB6, + .op_type = VIE_OP_TYPE_MOVZX, + }, + [0xB7] = { + .op_byte = 0xB7, + .op_type = VIE_OP_TYPE_MOVZX, + }, + [0xBA] = { + .op_byte = 0xBA, + .op_type = VIE_OP_TYPE_BITTEST, + .op_flags = VIE_OP_F_IMM8, + }, + [0xBE] = { + .op_byte = 0xBE, + .op_type = VIE_OP_TYPE_MOVSX, + }, +}; + +static const struct vie_op one_byte_opcodes[256] = { + [0x03] = { + .op_byte = 0x03, + .op_type = VIE_OP_TYPE_ADD, + }, + [0x0F] = { + .op_byte = 0x0F, + .op_type = VIE_OP_TYPE_TWO_BYTE + }, + [0x0B] = { + .op_byte = 0x0B, + .op_type = VIE_OP_TYPE_OR, + }, + [0x2B] = { + .op_byte = 0x2B, + .op_type = VIE_OP_TYPE_SUB, + }, + [0x39] = { + .op_byte = 0x39, + .op_type = VIE_OP_TYPE_CMP, + }, + [0x3B] = { + .op_byte = 0x3B, + .op_type = VIE_OP_TYPE_CMP, + }, + [0x88] = { + .op_byte = 0x88, + .op_type = VIE_OP_TYPE_MOV, + }, + [0x89] = { + .op_byte = 0x89, + .op_type = VIE_OP_TYPE_MOV, + }, + [0x8A] = { + .op_byte = 0x8A, + .op_type = VIE_OP_TYPE_MOV, + }, + [0x8B] = { + .op_byte = 0x8B, + .op_type = VIE_OP_TYPE_MOV, + }, + [0xA1] = { + .op_byte = 0xA1, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, + }, + [0xA3] = { + .op_byte = 0xA3, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, + }, + [0xA4] = { + .op_byte = 0xA4, + .op_type = VIE_OP_TYPE_MOVS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xA5] = { + .op_byte = 0xA5, + .op_type = VIE_OP_TYPE_MOVS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xAA] = { + .op_byte = 0xAA, + .op_type = VIE_OP_TYPE_STOS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xAB] = { + .op_byte = 0xAB, + .op_type = 
VIE_OP_TYPE_STOS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xC6] = { + /* XXX Group 11 extended opcode - not just MOV */ + .op_byte = 0xC6, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_IMM8, + }, + [0xC7] = { + .op_byte = 0xC7, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_IMM, + }, + [0x23] = { + .op_byte = 0x23, + .op_type = VIE_OP_TYPE_AND, + }, + [0x80] = { + /* Group 1 extended opcode */ + .op_byte = 0x80, + .op_type = VIE_OP_TYPE_GROUP1, + .op_flags = VIE_OP_F_IMM8, + }, + [0x81] = { + /* Group 1 extended opcode */ + .op_byte = 0x81, + .op_type = VIE_OP_TYPE_GROUP1, + .op_flags = VIE_OP_F_IMM, + }, + [0x83] = { + /* Group 1 extended opcode */ + .op_byte = 0x83, + .op_type = VIE_OP_TYPE_GROUP1, + .op_flags = VIE_OP_F_IMM8, + }, + [0x8F] = { + /* XXX Group 1A extended opcode - not just POP */ + .op_byte = 0x8F, + .op_type = VIE_OP_TYPE_POP, + }, + [0xFF] = { + /* XXX Group 5 extended opcode - not just PUSH */ + .op_byte = 0xFF, + .op_type = VIE_OP_TYPE_PUSH, + } +}; +#endif + +/* struct vie.mod */ +#define VIE_MOD_INDIRECT 0 +#define VIE_MOD_INDIRECT_DISP8 1 +#define VIE_MOD_INDIRECT_DISP32 2 +#define VIE_MOD_DIRECT 3 + +/* struct vie.rm */ +#define VIE_RM_SIB 4 +#define VIE_RM_DISP32 5 + +#define GB (1024 * 1024 * 1024) + +static enum vm_reg_name gpr_map[16] = { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RSP, + VM_REG_GUEST_RBP, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15 +}; + +static uint64_t size2mask[] = { + [1] = 0xff, + [2] = 0xffff, + [4] = 0xffffffff, + [8] = 0xffffffffffffffff, +}; + +static int +vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval) +{ + int error; + + error = vm_get_register(vm, vcpuid, reg, rval); + + return (error); +} + +static void +vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr) +{ + *lhbr = 0; + *reg = gpr_map[vie->reg]; + + /* + * 64-bit mode imposes limitations on accessing legacy high byte + * registers (lhbr). + * + * The legacy high-byte registers cannot be addressed if the REX + * prefix is present. In this case the values 4, 5, 6 and 7 of the + * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively. + * + * If the REX prefix is not present then the values 4, 5, 6 and 7 + * of the 'ModRM:reg' field address the legacy high-byte registers, + * %ah, %ch, %dh and %bh respectively. + */ + if (!vie->rex_present) { + if (vie->reg & 0x4) { + *lhbr = 1; + *reg = gpr_map[vie->reg & 0x3]; + } + } +} + +static int +vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval) +{ + uint64_t val; + int error, lhbr; + enum vm_reg_name reg; + + vie_calc_bytereg(vie, ®, &lhbr); + error = vm_get_register(vm, vcpuid, reg, &val); + + /* + * To obtain the value of a legacy high byte register shift the + * base register right by 8 bits (%ah = %rax >> 8). + */ + if (lhbr) + *rval = val >> 8; + else + *rval = val; + return (error); +} + +static int +vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte) +{ + uint64_t origval, val, mask; + int error, lhbr; + enum vm_reg_name reg; + + vie_calc_bytereg(vie, ®, &lhbr); + error = vm_get_register(vm, vcpuid, reg, &origval); + if (error == 0) { + val = byte; + mask = 0xff; + if (lhbr) { + /* + * Shift left by 8 to store 'byte' in a legacy high + * byte register. 
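+ * (Illustrative example, not from the original source: writing 0x7f to
+ * %ah yields val = 0x7f00 and mask = 0xff00, so the merge below replaces
+ * only bits 8..15 of %rax while preserving %al and the upper bits.)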
+ */ + val <<= 8; + mask <<= 8; + } + val |= origval & ~mask; + error = vm_set_register(vm, vcpuid, reg, val); + } + return (error); +} + +int +vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, + uint64_t val, int size) +{ + int error; + uint64_t origval; + + switch (size) { + case 1: + case 2: + error = vie_read_register(vm, vcpuid, reg, &origval); + if (error) + return (error); + val &= size2mask[size]; + val |= origval & ~size2mask[size]; + break; + case 4: + val &= 0xffffffffUL; + break; + case 8: + break; + default: + return (EINVAL); + } + + error = vm_set_register(vm, vcpuid, reg, val); + return (error); +} + +#define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V) + +/* + * Return the status flags that would result from doing (x - y). + */ +#define GETCC(sz) \ +static u_long \ +getcc##sz(uint##sz##_t x, uint##sz##_t y) \ +{ \ + u_long rflags; \ + \ + __asm __volatile("sub %2,%1; pushfq; popq %0" : \ + "=r" (rflags), "+r" (x) : "m" (y)); \ + return (rflags); \ +} struct __hack + +GETCC(8); +GETCC(16); +GETCC(32); +GETCC(64); + +static u_long +getcc(int opsize, uint64_t x, uint64_t y) +{ + KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, + ("getcc: invalid operand size %d", opsize)); + + if (opsize == 1) + return (getcc8(x, y)); + else if (opsize == 2) + return (getcc16(x, y)); + else if (opsize == 4) + return (getcc32(x, y)); + else + return (getcc64(x, y)); +} + +/* + * Macro creation of functions getaddflags{8,16,32,64} + */ +#define GETADDFLAGS(sz) \ +static u_long \ +getaddflags##sz(uint##sz##_t x, uint##sz##_t y) \ +{ \ + u_long rflags; \ + \ + __asm __volatile("add %2,%1; pushfq; popq %0" : \ + "=r" (rflags), "+r" (x) : "m" (y)); \ + return (rflags); \ +} struct __hack + +GETADDFLAGS(8); +GETADDFLAGS(16); +GETADDFLAGS(32); +GETADDFLAGS(64); + +static u_long +getaddflags(int opsize, uint64_t x, uint64_t y) +{ + KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, + ("getaddflags: invalid operand size %d", opsize)); + + if (opsize == 1) + return (getaddflags8(x, y)); + else if (opsize == 2) + return (getaddflags16(x, y)); + else if (opsize == 4) + return (getaddflags32(x, y)); + else + return (getaddflags64(x, y)); +} + +static int +emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint8_t byte; + uint64_t val; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x88: + /* + * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m) + * 88/r: mov r/m8, r8 + * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available) + */ + size = 1; /* override for byte operation */ + error = vie_read_bytereg(vm, vcpuid, vie, &byte); + if (error == 0) + error = memwrite(vm, vcpuid, gpa, byte, size, arg); + break; + case 0x89: + /* + * MOV from reg (ModRM:reg) to mem (ModRM:r/m) + * 89/r: mov r/m16, r16 + * 89/r: mov r/m32, r32 + * REX.W + 89/r mov r/m64, r64 + */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val); + if (error == 0) { + val &= size2mask[size]; + error = memwrite(vm, vcpuid, gpa, val, size, arg); + } + break; + case 0x8A: + /* + * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg) + * 8A/r: mov r8, r/m8 + * REX + 8A/r: mov r8, r/m8 + */ + size = 1; /* override for byte operation */ + error = memread(vm, vcpuid, gpa, &val, size, arg); + if (error == 0) + error = vie_write_bytereg(vm, vcpuid, vie, val); + break; + case 0x8B: + /* + * MOV from 
mem (ModRM:r/m) to reg (ModRM:reg) + * 8B/r: mov r16, r/m16 + * 8B/r: mov r32, r/m32 + * REX.W 8B/r: mov r64, r/m64 + */ + error = memread(vm, vcpuid, gpa, &val, size, arg); + if (error == 0) { + reg = gpr_map[vie->reg]; + error = vie_update_register(vm, vcpuid, reg, val, size); + } + break; + case 0xA1: + /* + * MOV from seg:moffset to AX/EAX/RAX + * A1: mov AX, moffs16 + * A1: mov EAX, moffs32 + * REX.W + A1: mov RAX, moffs64 + */ + error = memread(vm, vcpuid, gpa, &val, size, arg); + if (error == 0) { + reg = VM_REG_GUEST_RAX; + error = vie_update_register(vm, vcpuid, reg, val, size); + } + break; + case 0xA3: + /* + * MOV from AX/EAX/RAX to seg:moffset + * A3: mov moffs16, AX + * A3: mov moffs32, EAX + * REX.W + A3: mov moffs64, RAX + */ + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); + if (error == 0) { + val &= size2mask[size]; + error = memwrite(vm, vcpuid, gpa, val, size, arg); + } + break; + case 0xC6: + /* + * MOV from imm8 to mem (ModRM:r/m) + * C6/0 mov r/m8, imm8 + * REX + C6/0 mov r/m8, imm8 + */ + size = 1; /* override for byte operation */ + error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg); + break; + case 0xC7: + /* + * MOV from imm16/imm32 to mem (ModRM:r/m) + * C7/0 mov r/m16, imm16 + * C7/0 mov r/m32, imm32 + * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits) + */ + val = vie->immediate & size2mask[size]; + error = memwrite(vm, vcpuid, gpa, val, size, arg); + break; + default: + break; + } + + return (error); +} + +static int +emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, + void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint64_t val; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0xB6: + /* + * MOV and zero extend byte from mem (ModRM:r/m) to + * reg (ModRM:reg). + * + * 0F B6/r movzx r16, r/m8 + * 0F B6/r movzx r32, r/m8 + * REX.W + 0F B6/r movzx r64, r/m8 + */ + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &val, 1, arg); + if (error) + break; + + /* get the second operand */ + reg = gpr_map[vie->reg]; + + /* zero-extend byte */ + val = (uint8_t)val; + + /* write the result */ + error = vie_update_register(vm, vcpuid, reg, val, size); + break; + case 0xB7: + /* + * MOV and zero extend word from mem (ModRM:r/m) to + * reg (ModRM:reg). + * + * 0F B7/r movzx r32, r/m16 + * REX.W + 0F B7/r movzx r64, r/m16 + */ + error = memread(vm, vcpuid, gpa, &val, 2, arg); + if (error) + return (error); + + reg = gpr_map[vie->reg]; + + /* zero-extend word */ + val = (uint16_t)val; + + error = vie_update_register(vm, vcpuid, reg, val, size); + break; + case 0xBE: + /* + * MOV and sign extend byte from mem (ModRM:r/m) to + * reg (ModRM:reg). + * + * 0F BE/r movsx r16, r/m8 + * 0F BE/r movsx r32, r/m8 + * REX.W + 0F BE/r movsx r64, r/m8 + */ + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &val, 1, arg); + if (error) + break; + + /* get the second operand */ + reg = gpr_map[vie->reg]; + + /* sign extend byte */ + val = (int8_t)val; + + /* write the result */ + error = vie_update_register(vm, vcpuid, reg, val, size); + break; + default: + break; + } + return (error); +} + +/* + * Helper function to calculate and validate a linear address. 
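+ * (Supplementary summary: the address is formed via vie_calculate_gla(),
+ * then checked for canonical form and alignment; failures inject #SS,
+ * #GP or #AC into the guest and are reported through '*fault' rather
+ * than as an error return.)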
+ */ +static int +get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging, + int opsize, int addrsize, int prot, enum vm_reg_name seg, + enum vm_reg_name gpr, uint64_t *gla, int *fault) +{ + struct seg_desc desc; + uint64_t cr0, val, rflags; + int error; + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); + KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = vm_get_seg_desc(vm, vcpuid, seg, &desc); + KASSERT(error == 0, ("%s: error %d getting segment descriptor %d", + __func__, error, seg)); + + error = vie_read_register(vm, vcpuid, gpr, &val); + KASSERT(error == 0, ("%s: error %d getting register %d", __func__, + error, gpr)); + + if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize, + addrsize, prot, gla)) { + if (seg == VM_REG_GUEST_SS) + vm_inject_ss(vm, vcpuid, 0); + else + vm_inject_gp(vm, vcpuid); + goto guest_fault; + } + + if (vie_canonical_check(paging->cpu_mode, *gla)) { + if (seg == VM_REG_GUEST_SS) + vm_inject_ss(vm, vcpuid, 0); + else + vm_inject_gp(vm, vcpuid); + goto guest_fault; + } + + if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) { + vm_inject_ac(vm, vcpuid, 0); + goto guest_fault; + } + + *fault = 0; + return (0); + +guest_fault: + *fault = 1; + return (0); +} + +static int +emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ +#ifdef _KERNEL + struct vm_copyinfo copyinfo[2]; +#else + struct iovec copyinfo[2]; +#endif + uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val; + uint64_t rcx, rdi, rsi, rflags; + int error, fault, opsize, seg, repeat; + + opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize; + val = 0; + error = 0; + + /* + * XXX although the MOVS instruction is only supposed to be used with + * the "rep" prefix some guests like FreeBSD will use "repnz" instead. + * + * Empirically the "repnz" prefix has identical behavior to "rep" + * and the zero flag does not make a difference. + */ + repeat = vie->repz_present | vie->repnz_present; + + if (repeat) { + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); + KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); + + /* + * The count register is %rcx, %ecx or %cx depending on the + * address size of the instruction. + */ + if ((rcx & vie_size2mask(vie->addrsize)) == 0) { + error = 0; + goto done; + } + } + + /* + * Source Destination Comments + * -------------------------------------------- + * (1) memory memory n/a + * (2) memory mmio emulated + * (3) mmio memory emulated + * (4) mmio mmio emulated + * + * At this point we don't have sufficient information to distinguish + * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this + * out because it will succeed only when operating on regular memory. + * + * XXX the emulation doesn't properly handle the case where 'gpa' + * is straddling the boundary between the normal memory and MMIO. + */ + + seg = vie->segment_override ? 
vie->segment_register : VM_REG_GUEST_DS; + error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, + PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault); + if (error || fault) + goto done; + + error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ, + copyinfo, nitems(copyinfo), &fault); + if (error == 0) { + if (fault) + goto done; /* Resume guest to handle fault */ + + /* + * case (2): read from system memory and write to mmio. + */ + vm_copyin(vm, vcpuid, copyinfo, &val, opsize); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + error = memwrite(vm, vcpuid, gpa, val, opsize, arg); + if (error) + goto done; + } else { + /* + * 'vm_copy_setup()' is expected to fail for cases (3) and (4) + * if 'srcaddr' is in the mmio space. + */ + + error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, + PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr, + &fault); + if (error || fault) + goto done; + + error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize, + PROT_WRITE, copyinfo, nitems(copyinfo), &fault); + if (error == 0) { + if (fault) + goto done; /* Resume guest to handle fault */ + + /* + * case (3): read from MMIO and write to system memory. + * + * A MMIO read can have side-effects so we + * commit to it only after vm_copy_setup() is + * successful. If a page-fault needs to be + * injected into the guest then it will happen + * before the MMIO read is attempted. + */ + error = memread(vm, vcpuid, gpa, &val, opsize, arg); + if (error) + goto done; + + vm_copyout(vm, vcpuid, &val, copyinfo, opsize); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + } else { + /* + * Case (4): read from and write to mmio. + * + * Commit to the MMIO read/write (with potential + * side-effects) only after we are sure that the + * instruction is not going to be restarted due + * to address translation faults. + */ + error = vm_gla2gpa(vm, vcpuid, paging, srcaddr, + PROT_READ, &srcgpa, &fault); + if (error || fault) + goto done; + + error = vm_gla2gpa(vm, vcpuid, paging, dstaddr, + PROT_WRITE, &dstgpa, &fault); + if (error || fault) + goto done; + + error = memread(vm, vcpuid, srcgpa, &val, opsize, arg); + if (error) + goto done; + + error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg); + if (error) + goto done; + } + } + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi); + KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); + KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + if (rflags & PSL_D) { + rsi -= opsize; + rdi -= opsize; + } else { + rsi += opsize; + rdi += opsize; + } + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error)); + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); + + if (repeat) { + rcx = rcx - 1; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, + rcx, vie->addrsize); + KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); + + /* + * Repeat the instruction if the count register is not zero. 
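+ * (Supplementary note: the restart re-executes the instruction with the
+ * already-updated %rsi, %rdi and %rcx, so a 'rep movs' touching MMIO is
+ * effectively emulated one element per iteration.)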
+ */ + if ((rcx & vie_size2mask(vie->addrsize)) != 0) + vm_restart_instruction(vm, vcpuid); + } +done: + KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d", + __func__, error)); + return (error); +} + +static int +emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ + int error, opsize, repeat; + uint64_t val; + uint64_t rcx, rdi, rflags; + + opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize; + repeat = vie->repz_present | vie->repnz_present; + + if (repeat) { + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); + KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); + + /* + * The count register is %rcx, %ecx or %cx depending on the + * address size of the instruction. + */ + if ((rcx & vie_size2mask(vie->addrsize)) == 0) + return (0); + } + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); + KASSERT(!error, ("%s: error %d getting rax", __func__, error)); + + error = memwrite(vm, vcpuid, gpa, val, opsize, arg); + if (error) + return (error); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); + KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + if (rflags & PSL_D) + rdi -= opsize; + else + rdi += opsize; + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); + + if (repeat) { + rcx = rcx - 1; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, + rcx, vie->addrsize); + KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); + + /* + * Repeat the instruction if the count register is not zero. + */ + if ((rcx & vie_size2mask(vie->addrsize)) != 0) + vm_restart_instruction(vm, vcpuid); + } + + return (0); +} + +static int +emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint64_t result, rflags, rflags2, val1, val2; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x23: + /* + * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the + * result in reg. + * + * 23/r and r16, r/m16 + * 23/r and r32, r/m32 + * REX.W + 23/r and r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = memread(vm, vcpuid, gpa, &val2, size, arg); + if (error) + break; + + /* perform the operation and write the result */ + result = val1 & val2; + error = vie_update_register(vm, vcpuid, reg, result, size); + break; + case 0x81: + case 0x83: + /* + * AND mem (ModRM:r/m) with immediate and store the + * result in mem. 
+ * + * 81 /4 and r/m16, imm16 + * 81 /4 and r/m32, imm32 + * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64 + * + * 83 /4 and r/m16, imm8 sign-extended to 16 + * 83 /4 and r/m32, imm8 sign-extended to 32 + * REX.W + 83/4 and r/m64, imm8 sign-extended to 64 + */ + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &val1, size, arg); + if (error) + break; + + /* + * perform the operation with the pre-fetched immediate + * operand and write the result + */ + result = val1 & vie->immediate; + error = memwrite(vm, vcpuid, gpa, result, size, arg); + break; + default: + break; + } + if (error) + return (error); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + + /* + * OF and CF are cleared; the SF, ZF and PF flags are set according + * to the result; AF is undefined. + * + * The updated status flags are obtained by subtracting 0 from 'result'. + */ + rflags2 = getcc(size, result, 0); + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + return (error); +} + +static int +emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint64_t result, rflags, rflags2, val1, val2; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x0B: + /* + * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the + * result in reg. + * + * 0b/r or r16, r/m16 + * 0b/r or r32, r/m32 + * REX.W + 0b/r or r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = memread(vm, vcpuid, gpa, &val2, size, arg); + if (error) + break; + + /* perform the operation and write the result */ + result = val1 | val2; + error = vie_update_register(vm, vcpuid, reg, result, size); + break; + case 0x81: + case 0x83: + /* + * OR mem (ModRM:r/m) with immediate and store the + * result in mem. + * + * 81 /1 or r/m16, imm16 + * 81 /1 or r/m32, imm32 + * REX.W + 81 /1 or r/m64, imm32 sign-extended to 64 + * + * 83 /1 or r/m16, imm8 sign-extended to 16 + * 83 /1 or r/m32, imm8 sign-extended to 32 + * REX.W + 83/1 or r/m64, imm8 sign-extended to 64 + */ + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &val1, size, arg); + if (error) + break; + + /* + * perform the operation with the pre-fetched immediate + * operand and write the result + */ + result = val1 | vie->immediate; + error = memwrite(vm, vcpuid, gpa, result, size, arg); + break; + default: + break; + } + if (error) + return (error); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + + /* + * OF and CF are cleared; the SF, ZF and PF flags are set according + * to the result; AF is undefined. + * + * The updated status flags are obtained by subtracting 0 from 'result'. 
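+ * For example, a result of 0 makes getcc(size, result, 0) report ZF
+ * (and PF), and since only PSL_PF, PSL_Z and PSL_N are copied back,
+ * CF and OF remain clear as the instruction requires.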
+ */
+ rflags2 = getcc(size, result, 0);
+ rflags &= ~RFLAGS_STATUS_BITS;
+ rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
+
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
+ return (error);
+}
+
+static int
+emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ uint64_t regop, memop, op1, op2, rflags, rflags2;
+ enum vm_reg_name reg;
+
+ size = vie->opsize;
+ switch (vie->op.op_byte) {
+ case 0x39:
+ case 0x3B:
+ /*
+ * 39/r CMP r/m16, r16
+ * 39/r CMP r/m32, r32
+ * REX.W 39/r CMP r/m64, r64
+ *
+ * 3B/r CMP r16, r/m16
+ * 3B/r CMP r32, r/m32
+ * REX.W + 3B/r CMP r64, r/m64
+ *
+ * Compare the first operand with the second operand and
+ * set status flags in EFLAGS register. The comparison is
+ * performed by subtracting the second operand from the first
+ * operand and then setting the status flags.
+ */
+
+ /* Get the register operand */
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &regop);
+ if (error)
+ return (error);
+
+ /* Get the memory operand */
+ error = memread(vm, vcpuid, gpa, &memop, size, arg);
+ if (error)
+ return (error);
+
+ if (vie->op.op_byte == 0x3B) {
+ op1 = regop;
+ op2 = memop;
+ } else {
+ op1 = memop;
+ op2 = regop;
+ }
+ rflags2 = getcc(size, op1, op2);
+ break;
+ case 0x80:
+ case 0x81:
+ case 0x83:
+ /*
+ * 80 /7 cmp r/m8, imm8
+ * REX + 80 /7 cmp r/m8, imm8
+ *
+ * 81 /7 cmp r/m16, imm16
+ * 81 /7 cmp r/m32, imm32
+ * REX.W + 81 /7 cmp r/m64, imm32 sign-extended to 64
+ *
+ * 83 /7 cmp r/m16, imm8 sign-extended to 16
+ * 83 /7 cmp r/m32, imm8 sign-extended to 32
+ * REX.W + 83 /7 cmp r/m64, imm8 sign-extended to 64
+ *
+ * Compare mem (ModRM:r/m) with immediate and set
+ * status flags according to the results. The
+ * comparison is performed by subtracting the
+ * immediate from the first operand and then setting
+ * the status flags.
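+ * For example, comparing a memory operand against an equal immediate
+ * gives op1 - immediate == 0, so rflags2 carries ZF set and CF clear
+ * before being merged into the guest's %rflags below.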
+ * + */ + if (vie->op.op_byte == 0x80) + size = 1; + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &op1, size, arg); + if (error) + return (error); + + rflags2 = getcc(size, op1, vie->immediate); + break; + default: + return (EINVAL); + } + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & RFLAGS_STATUS_BITS; + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + return (error); +} + +static int +emulate_add(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + uint64_t nval, rflags, rflags2, val1, val2; + enum vm_reg_name reg; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x03: + /* + * ADD r/m to r and store the result in r + * + * 03/r ADD r16, r/m16 + * 03/r ADD r32, r/m32 + * REX.W + 03/r ADD r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = memread(vm, vcpuid, gpa, &val2, size, arg); + if (error) + break; + + /* perform the operation and write the result */ + nval = val1 + val2; + error = vie_update_register(vm, vcpuid, reg, nval, size); + break; + default: + break; + } + + if (!error) { + rflags2 = getaddflags(size, val1, val2); + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, + &rflags); + if (error) + return (error); + + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & RFLAGS_STATUS_BITS; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, + rflags, 8); + } + + return (error); +} + +static int +emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + uint64_t nval, rflags, rflags2, val1, val2; + enum vm_reg_name reg; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x2B: + /* + * SUB r/m from r and store the result in r + * + * 2B/r SUB r16, r/m16 + * 2B/r SUB r32, r/m32 + * REX.W + 2B/r SUB r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = memread(vm, vcpuid, gpa, &val2, size, arg); + if (error) + break; + + /* perform the operation and write the result */ + nval = val1 - val2; + error = vie_update_register(vm, vcpuid, reg, nval, size); + break; + default: + break; + } + + if (!error) { + rflags2 = getcc(size, val1, val2); + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, + &rflags); + if (error) + return (error); + + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & RFLAGS_STATUS_BITS; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, + rflags, 8); + } + + return (error); +} + +static int +emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ +#ifdef _KERNEL + struct vm_copyinfo copyinfo[2]; +#else + struct iovec copyinfo[2]; +#endif + struct seg_desc ss_desc; + uint64_t cr0, rflags, rsp, stack_gla, val; + int error, fault, size, stackaddrsize, pushop; + + val = 0; + size = vie->opsize; + pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 
1 : 0; + + /* + * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1 + */ + if (paging->cpu_mode == CPU_MODE_REAL) { + stackaddrsize = 2; + } else if (paging->cpu_mode == CPU_MODE_64BIT) { + /* + * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3 + * - Stack pointer size is always 64-bits. + * - PUSH/POP of 32-bit values is not possible in 64-bit mode. + * - 16-bit PUSH/POP is supported by using the operand size + * override prefix (66H). + */ + stackaddrsize = 8; + size = vie->opsize_override ? 2 : 8; + } else { + /* + * In protected or compatibility mode the 'B' flag in the + * stack-segment descriptor determines the size of the + * stack pointer. + */ + error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc); + KASSERT(error == 0, ("%s: error %d getting SS descriptor", + __func__, error)); + if (SEG_DESC_DEF32(ss_desc.access)) + stackaddrsize = 4; + else + stackaddrsize = 2; + } + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); + KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp); + KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error)); + if (pushop) { + rsp -= size; + } + + if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc, + rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ, + &stack_gla)) { + vm_inject_ss(vm, vcpuid, 0); + return (0); + } + + if (vie_canonical_check(paging->cpu_mode, stack_gla)) { + vm_inject_ss(vm, vcpuid, 0); + return (0); + } + + if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) { + vm_inject_ac(vm, vcpuid, 0); + return (0); + } + + error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, + pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo), + &fault); + if (error || fault) + return (error); + + if (pushop) { + error = memread(vm, vcpuid, mmio_gpa, &val, size, arg); + if (error == 0) + vm_copyout(vm, vcpuid, &val, copyinfo, size); + } else { + vm_copyin(vm, vcpuid, copyinfo, &val, size); + error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg); + rsp += size; + } + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + + if (error == 0) { + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp, + stackaddrsize); + KASSERT(error == 0, ("error %d updating rsp", error)); + } + return (error); +} + +static int +emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ + int error; + + /* + * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. + * + * PUSH is part of the group 5 extended opcodes and is identified + * by ModRM:reg = b110. + */ + if ((vie->reg & 7) != 6) + return (EINVAL); + + error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread, + memwrite, arg); + return (error); +} + +static int +emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ + int error; + + /* + * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. + * + * POP is part of the group 1A extended opcodes and is identified + * by ModRM:reg = b000. 
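+ * (As a sketch: opcode 8F with a ModRM byte of 0x07 decodes to
+ * reg = 0 and rm = 7, which passes the check below; any other reg
+ * value is rejected with EINVAL.)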
+ */ + if ((vie->reg & 7) != 0) + return (EINVAL); + + error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread, + memwrite, arg); + return (error); +} + +static int +emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *memarg) +{ + int error; + + switch (vie->reg & 7) { + case 0x1: /* OR */ + error = emulate_or(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case 0x4: /* AND */ + error = emulate_and(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case 0x7: /* CMP */ + error = emulate_cmp(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +static int +emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *memarg) +{ + uint64_t val, rflags; + int error, bitmask, bitoff; + + /* + * 0F BA is a Group 8 extended opcode. + * + * Currently we only emulate the 'Bit Test' instruction which is + * identified by a ModR/M:reg encoding of 100b. + */ + if ((vie->reg & 7) != 4) + return (EINVAL); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg); + if (error) + return (error); + + /* + * Intel SDM, Vol 2, Table 3-2: + * "Range of Bit Positions Specified by Bit Offset Operands" + */ + bitmask = vie->opsize * 8 - 1; + bitoff = vie->immediate & bitmask; + + /* Copy the bit into the Carry flag in %rflags */ + if (val & (1UL << bitoff)) + rflags |= PSL_C; + else + rflags &= ~PSL_C; + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error)); + + return (0); +} + +static int +emulate_twob_group15(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *memarg) +{ + int error; + uint64_t buf; + + switch (vie->reg & 7) { + case 0x7: /* CLFLUSH, CLFLUSHOPT, and SFENCE */ + if (vie->mod == 0x3) { + /* + * SFENCE. Ignore it, VM exit provides enough + * barriers on its own. + */ + error = 0; + } else { + /* + * CLFLUSH, CLFLUSHOPT. Only check for access + * rights. 
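+ * The 1-byte read below is issued only so the usual access checks
+ * run; the value placed in 'buf' is ignored.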
+ */ + error = memread(vm, vcpuid, gpa, &buf, 1, memarg); + } + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +int +vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *memarg) +{ + int error; + + if (!vie->decoded) + return (EINVAL); + + switch (vie->op.op_type) { + case VIE_OP_TYPE_GROUP1: + error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_POP: + error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_PUSH: + error = emulate_push(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_CMP: + error = emulate_cmp(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_MOV: + error = emulate_mov(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_MOVSX: + case VIE_OP_TYPE_MOVZX: + error = emulate_movx(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_MOVS: + error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_STOS: + error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_AND: + error = emulate_and(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_OR: + error = emulate_or(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_SUB: + error = emulate_sub(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_BITTEST: + error = emulate_bittest(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_TWOB_GRP15: + error = emulate_twob_group15(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_ADD: + error = emulate_add(vm, vcpuid, gpa, vie, memread, + memwrite, memarg); + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +int +vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla) +{ + KASSERT(size == 1 || size == 2 || size == 4 || size == 8, + ("%s: invalid size %d", __func__, size)); + KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl)); + + if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0) + return (0); + + return ((gla & (size - 1)) ? 1 : 0); +} + +int +vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla) +{ + uint64_t mask; + + if (cpu_mode != CPU_MODE_64BIT) + return (0); + + /* + * The value of the bit 47 in the 'gla' should be replicated in the + * most significant 16 bits. 
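+ * For example, 0xffff800000000000 and 0x00007fffffffffff are
+ * canonical, while 0x0000800000000000 is not: bit 47 is set but bits
+ * 63:48 are clear, so the masked comparison below reports a failure.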
+ */ + mask = ~((1UL << 48) - 1); + if (gla & (1UL << 47)) + return ((gla & mask) != mask); + else + return ((gla & mask) != 0); +} + +uint64_t +vie_size2mask(int size) +{ + KASSERT(size == 1 || size == 2 || size == 4 || size == 8, + ("vie_size2mask: invalid size %d", size)); + return (size2mask[size]); +} + +int +vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, + struct seg_desc *desc, uint64_t offset, int length, int addrsize, + int prot, uint64_t *gla) +{ + uint64_t firstoff, low_limit, high_limit, segbase; + int glasize, type; + + KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS, + ("%s: invalid segment %d", __func__, seg)); + KASSERT(length == 1 || length == 2 || length == 4 || length == 8, + ("%s: invalid operand size %d", __func__, length)); +#ifdef __FreeBSD__ + KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0, + ("%s: invalid prot %#x", __func__, prot)); +#else + KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0, + ("%s: invalid prot %x", __func__, prot)); +#endif + + firstoff = offset; + if (cpu_mode == CPU_MODE_64BIT) { + KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address " + "size %d for cpu_mode %d", __func__, addrsize, cpu_mode)); + glasize = 8; + } else { + KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address " + "size %d for cpu mode %d", __func__, addrsize, cpu_mode)); + glasize = 4; + /* + * If the segment selector is loaded with a NULL selector + * then the descriptor is unusable and attempting to use + * it results in a #GP(0). + */ + if (SEG_DESC_UNUSABLE(desc->access)) + return (-1); + + /* + * The processor generates a #NP exception when a segment + * register is loaded with a selector that points to a + * descriptor that is not present. If this was the case then + * it would have been checked before the VM-exit. + */ +#ifdef __FreeBSD__ + KASSERT(SEG_DESC_PRESENT(desc->access), + ("segment %d not present: %#x", seg, desc->access)); +#else + KASSERT(SEG_DESC_PRESENT(desc->access), + ("segment %d not present: %x", seg, desc->access)); +#endif + + /* + * The descriptor type must indicate a code/data segment. + */ + type = SEG_DESC_TYPE(desc->access); +#ifdef __FreeBSD__ + KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " + "descriptor type %#x", seg, type)); +#else + KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " + "descriptor type %x", seg, type)); +#endif + + if (prot & PROT_READ) { + /* #GP on a read access to a exec-only code segment */ + if ((type & 0xA) == 0x8) + return (-1); + } + + if (prot & PROT_WRITE) { + /* + * #GP on a write access to a code segment or a + * read-only data segment. + */ + if (type & 0x8) /* code segment */ + return (-1); + + if ((type & 0xA) == 0) /* read-only data seg */ + return (-1); + } + + /* + * 'desc->limit' is fully expanded taking granularity into + * account. + */ + if ((type & 0xC) == 0x4) { + /* expand-down data segment */ + low_limit = desc->limit + 1; + high_limit = SEG_DESC_DEF32(desc->access) ? + 0xffffffff : 0xffff; + } else { + /* code segment or expand-up data segment */ + low_limit = 0; + high_limit = desc->limit; + } + + while (length > 0) { + offset &= vie_size2mask(addrsize); + if (offset < low_limit || offset > high_limit) + return (-1); + offset++; + length--; + } + } + + /* + * In 64-bit mode all segments except %fs and %gs have a segment + * base address of 0. 
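+ * As an illustrative example (values hypothetical): with a 16-bit
+ * address size, a segment base of 0x10000 and an offset of 0x1ffff,
+ * the offset is truncated to 0xffff first, so the resulting gla is
+ * 0x1ffff rather than 0x2ffff.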
+ */ + if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && + seg != VM_REG_GUEST_GS) { + segbase = 0; + } else { + segbase = desc->base; + } + + /* + * Truncate 'firstoff' to the effective address size before adding + * it to the segment base. + */ + firstoff &= vie_size2mask(addrsize); + *gla = (segbase + firstoff) & vie_size2mask(glasize); + return (0); +} + +#ifdef _KERNEL +void +vie_init(struct vie *vie, const char *inst_bytes, int inst_length) +{ + KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE, + ("%s: invalid instruction length (%d)", __func__, inst_length)); + + bzero(vie, sizeof(struct vie)); + + vie->base_register = VM_REG_LAST; + vie->index_register = VM_REG_LAST; + vie->segment_register = VM_REG_LAST; + + if (inst_length) { + bcopy(inst_bytes, vie->inst, inst_length); + vie->num_valid = inst_length; + } +} + +static int +pf_error_code(int usermode, int prot, int rsvd, uint64_t pte) +{ + int error_code = 0; + + if (pte & PG_V) + error_code |= PGEX_P; + if (prot & VM_PROT_WRITE) + error_code |= PGEX_W; + if (usermode) + error_code |= PGEX_U; + if (rsvd) + error_code |= PGEX_RSV; + if (prot & VM_PROT_EXECUTE) + error_code |= PGEX_I; + + return (error_code); +} + +static void +ptp_release(void **cookie) +{ + if (*cookie != NULL) { + vm_gpa_release(*cookie); + *cookie = NULL; + } +} + +static void * +ptp_hold(struct vm *vm, int vcpu, vm_paddr_t ptpphys, size_t len, void **cookie) +{ + void *ptr; + + ptp_release(cookie); + ptr = vm_gpa_hold(vm, vcpu, ptpphys, len, VM_PROT_RW, cookie); + return (ptr); +} + +static int +_vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only) +{ + int nlevels, pfcode, retval, usermode, writable; + int ptpshift = 0, ptpindex = 0; + uint64_t ptpphys; + uint64_t *ptpbase = NULL, pte = 0, pgsize = 0; +#ifdef __FreeBSD__ + u_int retries; +#endif + uint32_t *ptpbase32, pte32; + void *cookie; + + *guest_fault = 0; + + usermode = (paging->cpl == 3 ? 1 : 0); + writable = prot & VM_PROT_WRITE; + cookie = NULL; + retval = 0; +#ifdef __FreeBSD__ + retries = 0; +#endif +restart: + ptpphys = paging->cr3; /* root of the page tables */ + ptp_release(&cookie); +#ifdef __FreeBSD__ + if (retries++ > 0) + maybe_yield(); +#endif + + if (vie_canonical_check(paging->cpu_mode, gla)) { + /* + * XXX assuming a non-stack reference otherwise a stack fault + * should be generated. + */ + if (!check_only) + vm_inject_gp(vm, vcpuid); + goto fault; + } + + if (paging->paging_mode == PAGING_MODE_FLAT) { + *gpa = gla; + goto done; + } + + if (paging->paging_mode == PAGING_MODE_32) { + nlevels = 2; + while (--nlevels >= 0) { + /* Zero out the lower 12 bits. */ + ptpphys &= ~0xfff; + + ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, + &cookie); + + if (ptpbase32 == NULL) + goto error; + + ptpshift = PAGE_SHIFT + nlevels * 10; + ptpindex = (gla >> ptpshift) & 0x3FF; + pgsize = 1UL << ptpshift; + + pte32 = ptpbase32[ptpindex]; + + if ((pte32 & PG_V) == 0 || + (usermode && (pte32 & PG_U) == 0) || + (writable && (pte32 & PG_RW) == 0)) { + if (!check_only) { + pfcode = pf_error_code(usermode, prot, 0, + pte32); + vm_inject_pf(vm, vcpuid, pfcode, gla); + } + goto fault; + } + + /* + * Emulate the x86 MMU's management of the accessed + * and dirty flags. While the accessed flag is set + * at every level of the page table, the dirty flag + * is only set at the last level providing the guest + * physical address. 
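+ * (Both updates below use an atomic compare-and-set; if the PTE
+ * changed underneath us, the walk jumps back to 'restart' rather
+ * than losing the competing update.)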
+ */ + if (!check_only && (pte32 & PG_A) == 0) { + if (atomic_cmpset_32(&ptpbase32[ptpindex], + pte32, pte32 | PG_A) == 0) { + goto restart; + } + } + + /* XXX must be ignored if CR4.PSE=0 */ + if (nlevels > 0 && (pte32 & PG_PS) != 0) + break; + + ptpphys = pte32; + } + + /* Set the dirty bit in the page table entry if necessary */ + if (!check_only && writable && (pte32 & PG_M) == 0) { + if (atomic_cmpset_32(&ptpbase32[ptpindex], + pte32, pte32 | PG_M) == 0) { + goto restart; + } + } + + /* Zero out the lower 'ptpshift' bits */ + pte32 >>= ptpshift; pte32 <<= ptpshift; + *gpa = pte32 | (gla & (pgsize - 1)); + goto done; + } + + if (paging->paging_mode == PAGING_MODE_PAE) { + /* Zero out the lower 5 bits and the upper 32 bits */ + ptpphys &= 0xffffffe0UL; + + ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof(*ptpbase) * 4, + &cookie); + if (ptpbase == NULL) + goto error; + + ptpindex = (gla >> 30) & 0x3; + + pte = ptpbase[ptpindex]; + + if ((pte & PG_V) == 0) { + if (!check_only) { + pfcode = pf_error_code(usermode, prot, 0, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + } + goto fault; + } + + ptpphys = pte; + + nlevels = 2; + } else + nlevels = 4; + while (--nlevels >= 0) { + /* Zero out the lower 12 bits and the upper 12 bits */ + ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12; + + ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie); + if (ptpbase == NULL) + goto error; + + ptpshift = PAGE_SHIFT + nlevels * 9; + ptpindex = (gla >> ptpshift) & 0x1FF; + pgsize = 1UL << ptpshift; + + pte = ptpbase[ptpindex]; + + if ((pte & PG_V) == 0 || + (usermode && (pte & PG_U) == 0) || + (writable && (pte & PG_RW) == 0)) { + if (!check_only) { + pfcode = pf_error_code(usermode, prot, 0, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + } + goto fault; + } + + /* Set the accessed bit in the page table entry */ + if (!check_only && (pte & PG_A) == 0) { + if (atomic_cmpset_64(&ptpbase[ptpindex], + pte, pte | PG_A) == 0) { + goto restart; + } + } + + if (nlevels > 0 && (pte & PG_PS) != 0) { + if (pgsize > 1 * GB) { + if (!check_only) { + pfcode = pf_error_code(usermode, prot, 1, + pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + } + goto fault; + } + break; + } + + ptpphys = pte; + } + + /* Set the dirty bit in the page table entry if necessary */ + if (!check_only && writable && (pte & PG_M) == 0) { + if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0) + goto restart; + } + + /* Zero out the lower 'ptpshift' bits and the upper 12 bits */ + pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12; + *gpa = pte | (gla & (pgsize - 1)); +done: + ptp_release(&cookie); + KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d", + __func__, retval)); + return (retval); +error: + retval = EFAULT; + goto done; +fault: + *guest_fault = 1; + goto done; +} + +int +vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) +{ + + return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault, + false)); +} + +int +vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) +{ + + return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault, + true)); +} + +int +vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t rip, int inst_length, struct vie *vie, int *faultptr) +{ + struct vm_copyinfo copyinfo[2]; + int error, prot; + + if (inst_length > VIE_INST_SIZE) + panic("vmm_fetch_instruction: invalid 
length %d", inst_length); + + prot = PROT_READ | PROT_EXEC; + error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot, + copyinfo, nitems(copyinfo), faultptr); + if (error || *faultptr) + return (error); + + vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + vie->num_valid = inst_length; + return (0); +} + +static int +vie_peek(struct vie *vie, uint8_t *x) +{ + + if (vie->num_processed < vie->num_valid) { + *x = vie->inst[vie->num_processed]; + return (0); + } else + return (-1); +} + +static void +vie_advance(struct vie *vie) +{ + + vie->num_processed++; +} + +static bool +segment_override(uint8_t x, int *seg) +{ + + switch (x) { + case 0x2E: + *seg = VM_REG_GUEST_CS; + break; + case 0x36: + *seg = VM_REG_GUEST_SS; + break; + case 0x3E: + *seg = VM_REG_GUEST_DS; + break; + case 0x26: + *seg = VM_REG_GUEST_ES; + break; + case 0x64: + *seg = VM_REG_GUEST_FS; + break; + case 0x65: + *seg = VM_REG_GUEST_GS; + break; + default: + return (false); + } + return (true); +} + +static int +decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d) +{ + uint8_t x; + + while (1) { + if (vie_peek(vie, &x)) + return (-1); + + if (x == 0x66) + vie->opsize_override = 1; + else if (x == 0x67) + vie->addrsize_override = 1; + else if (x == 0xF3) + vie->repz_present = 1; + else if (x == 0xF2) + vie->repnz_present = 1; + else if (segment_override(x, &vie->segment_register)) + vie->segment_override = 1; + else + break; + + vie_advance(vie); + } + + /* + * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2: + * - Only one REX prefix is allowed per instruction. + * - The REX prefix must immediately precede the opcode byte or the + * escape opcode byte. + * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3) + * the mandatory prefix must come before the REX prefix. + */ + if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) { + vie->rex_present = 1; + vie->rex_w = x & 0x8 ? 1 : 0; + vie->rex_r = x & 0x4 ? 1 : 0; + vie->rex_x = x & 0x2 ? 1 : 0; + vie->rex_b = x & 0x1 ? 1 : 0; + vie_advance(vie); + } + + /* + * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1 + */ + if (cpu_mode == CPU_MODE_64BIT) { + /* + * Default address size is 64-bits and default operand size + * is 32-bits. + */ + vie->addrsize = vie->addrsize_override ? 4 : 8; + if (vie->rex_w) + vie->opsize = 8; + else if (vie->opsize_override) + vie->opsize = 2; + else + vie->opsize = 4; + } else if (cs_d) { + /* Default address and operand sizes are 32-bits */ + vie->addrsize = vie->addrsize_override ? 2 : 4; + vie->opsize = vie->opsize_override ? 2 : 4; + } else { + /* Default address and operand sizes are 16-bits */ + vie->addrsize = vie->addrsize_override ? 4 : 2; + vie->opsize = vie->opsize_override ? 
4 : 2; + } + return (0); +} + +static int +decode_two_byte_opcode(struct vie *vie) +{ + uint8_t x; + + if (vie_peek(vie, &x)) + return (-1); + + vie->op = two_byte_opcodes[x]; + + if (vie->op.op_type == VIE_OP_TYPE_NONE) + return (-1); + + vie_advance(vie); + return (0); +} + +static int +decode_opcode(struct vie *vie) +{ + uint8_t x; + + if (vie_peek(vie, &x)) + return (-1); + + vie->op = one_byte_opcodes[x]; + + if (vie->op.op_type == VIE_OP_TYPE_NONE) + return (-1); + + vie_advance(vie); + + if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE) + return (decode_two_byte_opcode(vie)); + + return (0); +} + +static int +decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode) +{ + uint8_t x; + + if (vie->op.op_flags & VIE_OP_F_NO_MODRM) + return (0); + + if (cpu_mode == CPU_MODE_REAL) + return (-1); + + if (vie_peek(vie, &x)) + return (-1); + + vie->mod = (x >> 6) & 0x3; + vie->rm = (x >> 0) & 0x7; + vie->reg = (x >> 3) & 0x7; + + /* + * A direct addressing mode makes no sense in the context of an EPT + * fault. There has to be a memory access involved to cause the + * EPT fault. + */ + if (vie->mod == VIE_MOD_DIRECT) + return (-1); + + if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || + (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { + /* + * Table 2-5: Special Cases of REX Encodings + * + * mod=0, r/m=5 is used in the compatibility mode to + * indicate a disp32 without a base register. + * + * mod!=3, r/m=4 is used in the compatibility mode to + * indicate that the SIB byte is present. + * + * The 'b' bit in the REX prefix is don't care in + * this case. + */ + } else { + vie->rm |= (vie->rex_b << 3); + } + + vie->reg |= (vie->rex_r << 3); + + /* SIB */ + if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB) + goto done; + + vie->base_register = gpr_map[vie->rm]; + + switch (vie->mod) { + case VIE_MOD_INDIRECT_DISP8: + vie->disp_bytes = 1; + break; + case VIE_MOD_INDIRECT_DISP32: + vie->disp_bytes = 4; + break; + case VIE_MOD_INDIRECT: + if (vie->rm == VIE_RM_DISP32) { + vie->disp_bytes = 4; + /* + * Table 2-7. RIP-Relative Addressing + * + * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32 + * whereas in compatibility mode it just implies disp32. + */ + + if (cpu_mode == CPU_MODE_64BIT) + vie->base_register = VM_REG_GUEST_RIP; + else + vie->base_register = VM_REG_LAST; + } + break; + } + +done: + vie_advance(vie); + + return (0); +} + +static int +decode_sib(struct vie *vie) +{ + uint8_t x; + + /* Proceed only if SIB byte is present */ + if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB) + return (0); + + if (vie_peek(vie, &x)) + return (-1); + + /* De-construct the SIB byte */ + vie->ss = (x >> 6) & 0x3; + vie->index = (x >> 3) & 0x7; + vie->base = (x >> 0) & 0x7; + + /* Apply the REX prefix modifiers */ + vie->index |= vie->rex_x << 3; + vie->base |= vie->rex_b << 3; + + switch (vie->mod) { + case VIE_MOD_INDIRECT_DISP8: + vie->disp_bytes = 1; + break; + case VIE_MOD_INDIRECT_DISP32: + vie->disp_bytes = 4; + break; + } + + if (vie->mod == VIE_MOD_INDIRECT && + (vie->base == 5 || vie->base == 13)) { + /* + * Special case when base register is unused if mod = 0 + * and base = %rbp or %r13. + * + * Documented in: + * Table 2-3: 32-bit Addressing Forms with the SIB Byte + * Table 2-5: Special Cases of REX Encodings + */ + vie->disp_bytes = 4; + } else { + vie->base_register = gpr_map[vie->base]; + } + + /* + * All encodings of 'index' are valid except for %rsp (4). 
+ * + * Documented in: + * Table 2-3: 32-bit Addressing Forms with the SIB Byte + * Table 2-5: Special Cases of REX Encodings + */ + if (vie->index != 4) + vie->index_register = gpr_map[vie->index]; + + /* 'scale' makes sense only in the context of an index register */ + if (vie->index_register < VM_REG_LAST) + vie->scale = 1 << vie->ss; + + vie_advance(vie); + + return (0); +} + +static int +decode_displacement(struct vie *vie) +{ + int n, i; + uint8_t x; + + union { + char buf[4]; + int8_t signed8; + int32_t signed32; + } u; + + if ((n = vie->disp_bytes) == 0) + return (0); + + if (n != 1 && n != 4) + panic("decode_displacement: invalid disp_bytes %d", n); + + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + + u.buf[i] = x; + vie_advance(vie); + } + + if (n == 1) + vie->displacement = u.signed8; /* sign-extended */ + else + vie->displacement = u.signed32; /* sign-extended */ + + return (0); +} + +static int +decode_immediate(struct vie *vie) +{ + int i, n; + uint8_t x; + union { + char buf[4]; + int8_t signed8; + int16_t signed16; + int32_t signed32; + } u; + + /* Figure out immediate operand size (if any) */ + if (vie->op.op_flags & VIE_OP_F_IMM) { + /* + * Section 2.2.1.5 "Immediates", Intel SDM: + * In 64-bit mode the typical size of immediate operands + * remains 32-bits. When the operand size if 64-bits, the + * processor sign-extends all immediates to 64-bits prior + * to their use. + */ + if (vie->opsize == 4 || vie->opsize == 8) + vie->imm_bytes = 4; + else + vie->imm_bytes = 2; + } else if (vie->op.op_flags & VIE_OP_F_IMM8) { + vie->imm_bytes = 1; + } + + if ((n = vie->imm_bytes) == 0) + return (0); + + KASSERT(n == 1 || n == 2 || n == 4, + ("%s: invalid number of immediate bytes: %d", __func__, n)); + + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + + u.buf[i] = x; + vie_advance(vie); + } + + /* sign-extend the immediate value before use */ + if (n == 1) + vie->immediate = u.signed8; + else if (n == 2) + vie->immediate = u.signed16; + else + vie->immediate = u.signed32; + + return (0); +} + +static int +decode_moffset(struct vie *vie) +{ + int i, n; + uint8_t x; + union { + char buf[8]; + uint64_t u64; + } u; + + if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0) + return (0); + + /* + * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM: + * The memory offset size follows the address-size of the instruction. + */ + n = vie->addrsize; + KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n)); + + u.u64 = 0; + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + + u.buf[i] = x; + vie_advance(vie); + } + vie->displacement = u.u64; + return (0); +} + +/* + * Verify that the 'guest linear address' provided as collateral of the nested + * page table fault matches with our instruction decoding. 
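+ * For example, a decoded "mov %eax, 0x10(%rdi,%rsi,4)" should yield
+ * base = %rdi, index = %rsi, scale = 4 and displacement = 0x10, and
+ * segbase + base + scale * idx + disp must match the gla reported by
+ * the hardware.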
+ */ +static int +verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie, + enum vm_cpu_mode cpu_mode) +{ + int error; + uint64_t base, segbase, idx, gla2; + enum vm_reg_name seg; + struct seg_desc desc; + + /* Skip 'gla' verification */ + if (gla == VIE_INVALID_GLA) + return (0); + + base = 0; + if (vie->base_register != VM_REG_LAST) { + error = vm_get_register(vm, cpuid, vie->base_register, &base); + if (error) { + printf("verify_gla: error %d getting base reg %d\n", + error, vie->base_register); + return (-1); + } + + /* + * RIP-relative addressing starts from the following + * instruction + */ + if (vie->base_register == VM_REG_GUEST_RIP) + base += vie->num_processed; + } + + idx = 0; + if (vie->index_register != VM_REG_LAST) { + error = vm_get_register(vm, cpuid, vie->index_register, &idx); + if (error) { + printf("verify_gla: error %d getting index reg %d\n", + error, vie->index_register); + return (-1); + } + } + + /* + * From "Specifying a Segment Selector", Intel SDM, Vol 1 + * + * In 64-bit mode, segmentation is generally (but not + * completely) disabled. The exceptions are the FS and GS + * segments. + * + * In legacy IA-32 mode, when the ESP or EBP register is used + * as the base, the SS segment is the default segment. For + * other data references, except when relative to stack or + * string destination the DS segment is the default. These + * can be overridden to allow other segments to be accessed. + */ + if (vie->segment_override) + seg = vie->segment_register; + else if (vie->base_register == VM_REG_GUEST_RSP || + vie->base_register == VM_REG_GUEST_RBP) + seg = VM_REG_GUEST_SS; + else + seg = VM_REG_GUEST_DS; + if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && + seg != VM_REG_GUEST_GS) { + segbase = 0; + } else { + error = vm_get_seg_desc(vm, cpuid, seg, &desc); + if (error) { + printf("verify_gla: error %d getting segment" + " descriptor %d", error, + vie->segment_register); + return (-1); + } + segbase = desc.base; + } + + gla2 = segbase + base + vie->scale * idx + vie->displacement; + gla2 &= size2mask[vie->addrsize]; + if (gla != gla2) { + printf("verify_gla mismatch: segbase(0x%0lx)" + "base(0x%0lx), scale(%d), index(0x%0lx), " + "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", + segbase, base, vie->scale, idx, vie->displacement, + gla, gla2); + return (-1); + } + + return (0); +} + +int +vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, + enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie) +{ + + if (decode_prefixes(vie, cpu_mode, cs_d)) + return (-1); + + if (decode_opcode(vie)) + return (-1); + + if (decode_modrm(vie, cpu_mode)) + return (-1); + + if (decode_sib(vie)) + return (-1); + + if (decode_displacement(vie)) + return (-1); + + if (decode_immediate(vie)) + return (-1); + + if (decode_moffset(vie)) + return (-1); + + if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) { + if (verify_gla(vm, cpuid, gla, vie, cpu_mode)) + return (-1); + } + + vie->decoded = 1; /* success */ + + return (0); +} +#endif /* _KERNEL */ diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ioport.c b/usr/src/uts/i86pc/io/vmm/vmm_ioport.c new file mode 100644 index 0000000000..3d08fd5e85 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_ioport.c @@ -0,0 +1,204 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> + +#include <machine/vmm.h> +#include <machine/vmm_instruction_emul.h> + +#include "vatpic.h" +#include "vatpit.h" +#include "vpmtmr.h" +#include "vrtc.h" +#include "vmm_ioport.h" +#include "vmm_ktr.h" + +#define MAX_IOPORTS 1280 + +ioport_handler_func_t ioport_handler[MAX_IOPORTS] = { + [TIMER_MODE] = vatpit_handler, + [TIMER_CNTR0] = vatpit_handler, + [TIMER_CNTR1] = vatpit_handler, + [TIMER_CNTR2] = vatpit_handler, + [NMISC_PORT] = vatpit_nmisc_handler, + [IO_ICU1] = vatpic_master_handler, + [IO_ICU1 + ICU_IMR_OFFSET] = vatpic_master_handler, + [IO_ICU2] = vatpic_slave_handler, + [IO_ICU2 + ICU_IMR_OFFSET] = vatpic_slave_handler, + [IO_ELCR1] = vatpic_elc_handler, + [IO_ELCR2] = vatpic_elc_handler, + [IO_PMTMR] = vpmtmr_handler, + [IO_RTC] = vrtc_addr_handler, + [IO_RTC + 1] = vrtc_data_handler, +}; + +#ifdef KTR +static const char * +inout_instruction(struct vm_exit *vmexit) +{ + int index; + + static const char *iodesc[] = { + "outb", "outw", "outl", + "inb", "inw", "inl", + "outsb", "outsw", "outsd", + "insb", "insw", "insd", + }; + + switch (vmexit->u.inout.bytes) { + case 1: + index = 0; + break; + case 2: + index = 1; + break; + default: + index = 2; + break; + } + + if (vmexit->u.inout.in) + index += 3; + + if (vmexit->u.inout.string) + index += 6; + + KASSERT(index < nitems(iodesc), ("%s: invalid index %d", + __func__, index)); + + return (iodesc[index]); +} +#endif /* KTR */ + +static int +emulate_inout_port(struct vm *vm, int vcpuid, struct vm_exit *vmexit, + bool *retu) +{ + ioport_handler_func_t handler; + uint32_t mask, val; + int error; + +#ifdef __FreeBSD__ + /* + * If there is no handler for the I/O port then punt to userspace. 
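+ * For example, accesses to IO_RTC and IO_RTC + 1 are served in-kernel
+ * by vrtc_addr_handler and vrtc_data_handler via the table above,
+ * while a port with no entry (for instance a UART's register range)
+ * takes this punt-to-userspace path.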
+ */ + if (vmexit->u.inout.port >= MAX_IOPORTS || + (handler = ioport_handler[vmexit->u.inout.port]) == NULL) { + *retu = true; + return (0); + } +#else /* __FreeBSD__ */ + handler = NULL; + if (vmexit->u.inout.port < MAX_IOPORTS) { + handler = ioport_handler[vmexit->u.inout.port]; + } + /* Look for hooks, if a standard handler is not present */ + if (handler == NULL) { + mask = vie_size2mask(vmexit->u.inout.bytes); + if (!vmexit->u.inout.in) { + val = vmexit->u.inout.eax & mask; + } + error = vm_ioport_handle_hook(vm, vcpuid, vmexit->u.inout.in, + vmexit->u.inout.port, vmexit->u.inout.bytes, &val); + if (error == 0) { + goto finish; + } + + *retu = true; + return (0); + } + +#endif /* __FreeBSD__ */ + + mask = vie_size2mask(vmexit->u.inout.bytes); + + if (!vmexit->u.inout.in) { + val = vmexit->u.inout.eax & mask; + } + + error = (*handler)(vm, vcpuid, vmexit->u.inout.in, + vmexit->u.inout.port, vmexit->u.inout.bytes, &val); + if (error) { + /* + * The value returned by this function is also the return value + * of vm_run(). This needs to be a positive number otherwise it + * can be interpreted as a "pseudo-error" like ERESTART. + * + * Enforce this by mapping all errors to EIO. + */ + return (EIO); + } + +#ifndef __FreeBSD__ +finish: +#endif /* __FreeBSD__ */ + if (vmexit->u.inout.in) { + vmexit->u.inout.eax &= ~mask; + vmexit->u.inout.eax |= val & mask; + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX, + vmexit->u.inout.eax); + KASSERT(error == 0, ("emulate_ioport: error %d setting guest " + "rax register", error)); + } + *retu = false; + return (0); +} + +static int +emulate_inout_str(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu) +{ + *retu = true; + return (0); /* Return to userspace to finish emulation */ +} + +int +vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu) +{ + int bytes, error; + + bytes = vmexit->u.inout.bytes; + KASSERT(bytes == 1 || bytes == 2 || bytes == 4, + ("vm_handle_inout: invalid operand size %d", bytes)); + + if (vmexit->u.inout.string) + error = emulate_inout_str(vm, vcpuid, vmexit, retu); + else + error = emulate_inout_port(vm, vcpuid, vmexit, retu); + + VCPU_CTR4(vm, vcpuid, "%s%s 0x%04x: %s", + vmexit->u.inout.rep ? "rep " : "", + inout_instruction(vmexit), + vmexit->u.inout.port, + error ? "error" : (*retu ? "userspace" : "handled")); + + return (error); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ioport.h b/usr/src/uts/i86pc/io/vmm/vmm_ioport.h new file mode 100644 index 0000000000..14e315f400 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_ioport.h @@ -0,0 +1,39 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_IOPORT_H_ +#define _VMM_IOPORT_H_ + +typedef int (*ioport_handler_func_t)(struct vm *vm, int vcpuid, + bool in, int port, int bytes, uint32_t *val); + +int vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme, bool *retu); + +#endif /* _VMM_IOPORT_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ktr.h b/usr/src/uts/i86pc/io/vmm/vmm_ktr.h new file mode 100644 index 0000000000..414d0341cc --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_ktr.h @@ -0,0 +1,71 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMM_KTR_H_ +#define _VMM_KTR_H_ + +#include <sys/ktr.h> +#include <sys/pcpu.h> + +#ifndef KTR_VMM +#define KTR_VMM KTR_GEN +#endif + +#define VCPU_CTR0(vm, vcpuid, format) \ +CTR2(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid)) + +#define VCPU_CTR1(vm, vcpuid, format, p1) \ +CTR3(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1)) + +#define VCPU_CTR2(vm, vcpuid, format, p1, p2) \ +CTR4(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1), (p2)) + +#define VCPU_CTR3(vm, vcpuid, format, p1, p2, p3) \ +CTR5(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1), (p2), (p3)) + +#define VCPU_CTR4(vm, vcpuid, format, p1, p2, p3, p4) \ +CTR6(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), \ + (p1), (p2), (p3), (p4)) + +#define VM_CTR0(vm, format) \ +CTR1(KTR_VMM, "vm %s: " format, vm_name((vm))) + +#define VM_CTR1(vm, format, p1) \ +CTR2(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1)) + +#define VM_CTR2(vm, format, p1, p2) \ +CTR3(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2)) + +#define VM_CTR3(vm, format, p1, p2, p3) \ +CTR4(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2), (p3)) + +#define VM_CTR4(vm, format, p1, p2, p3, p4) \ +CTR5(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2), (p3), (p4)) +#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_lapic.c b/usr/src/uts/i86pc/io/vmm/vmm_lapic.c new file mode 100644 index 0000000000..43b2bebe97 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_lapic.c @@ -0,0 +1,261 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/smp.h> + +#include <x86/specialreg.h> +#include <x86/apicreg.h> + +#include <machine/vmm.h> +#include "vmm_ktr.h" +#include "vmm_lapic.h" +#include "vlapic.h" + +/* + * Some MSI message definitions + */ +#define MSI_X86_ADDR_MASK 0xfff00000 +#define MSI_X86_ADDR_BASE 0xfee00000 +#define MSI_X86_ADDR_RH 0x00000008 /* Redirection Hint */ +#define MSI_X86_ADDR_LOG 0x00000004 /* Destination Mode */ + +int +lapic_set_intr(struct vm *vm, int cpu, int vector, bool level) +{ + struct vlapic *vlapic; + + if (cpu < 0 || cpu >= vm_get_maxcpus(vm)) + return (EINVAL); + + /* + * According to section "Maskable Hardware Interrupts" in Intel SDM + * vectors 16 through 255 can be delivered through the local APIC. + */ + if (vector < 16 || vector > 255) + return (EINVAL); + + vlapic = vm_lapic(vm, cpu); + if (vlapic_set_intr_ready(vlapic, vector, level)) + vcpu_notify_event(vm, cpu, true); + return (0); +} + +int +lapic_set_local_intr(struct vm *vm, int cpu, int vector) +{ + struct vlapic *vlapic; + cpuset_t dmask; + int error; + + if (cpu < -1 || cpu >= vm_get_maxcpus(vm)) + return (EINVAL); + + if (cpu == -1) + dmask = vm_active_cpus(vm); + else + CPU_SETOF(cpu, &dmask); + error = 0; + while ((cpu = CPU_FFS(&dmask)) != 0) { + cpu--; + CPU_CLR(cpu, &dmask); + vlapic = vm_lapic(vm, cpu); + error = vlapic_trigger_lvt(vlapic, vector); + if (error) + break; + } + + return (error); +} + +int +lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg) +{ + int delmode, vec; + uint32_t dest; + bool phys; + + VM_CTR2(vm, "lapic MSI addr: %#lx msg: %#lx", addr, msg); + + if ((addr & MSI_X86_ADDR_MASK) != MSI_X86_ADDR_BASE) { + VM_CTR1(vm, "lapic MSI invalid addr %#lx", addr); + return (-1); + } + + /* + * Extract the x86-specific fields from the MSI addr/msg + * params according to the Intel Arch spec, Vol3 Ch 10. + * + * The PCI specification does not support level triggered + * MSI/MSI-X so ignore trigger level in 'msg'. + * + * The 'dest' is interpreted as a logical APIC ID if both + * the Redirection Hint and Destination Mode are '1' and + * physical otherwise. + */ + dest = (addr >> 12) & 0xff; + phys = ((addr & (MSI_X86_ADDR_RH | MSI_X86_ADDR_LOG)) != + (MSI_X86_ADDR_RH | MSI_X86_ADDR_LOG)); + delmode = msg & APIC_DELMODE_MASK; + vec = msg & 0xff; + + VM_CTR3(vm, "lapic MSI %s dest %#x, vec %d", + phys ? 
"physical" : "logical", dest, vec); + + vlapic_deliver_intr(vm, LAPIC_TRIG_EDGE, dest, phys, delmode, vec); + return (0); +} + +static boolean_t +x2apic_msr(u_int msr) +{ + if (msr >= 0x800 && msr <= 0xBFF) + return (TRUE); + else + return (FALSE); +} + +static u_int +x2apic_msr_to_regoff(u_int msr) +{ + + return ((msr - 0x800) << 4); +} + +boolean_t +lapic_msr(u_int msr) +{ + + if (x2apic_msr(msr) || (msr == MSR_APICBASE)) + return (TRUE); + else + return (FALSE); +} + +int +lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval, bool *retu) +{ + int error; + u_int offset; + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + if (msr == MSR_APICBASE) { + *rval = vlapic_get_apicbase(vlapic); + error = 0; + } else { + offset = x2apic_msr_to_regoff(msr); + error = vlapic_read(vlapic, 0, offset, rval, retu); + } + + return (error); +} + +int +lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val, bool *retu) +{ + int error; + u_int offset; + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + if (msr == MSR_APICBASE) { + error = vlapic_set_apicbase(vlapic, val); + } else { + offset = x2apic_msr_to_regoff(msr); + error = vlapic_write(vlapic, 0, offset, val, retu); + } + + return (error); +} + +int +lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size, + void *arg) +{ + int error; + uint64_t off; + struct vlapic *vlapic; + + off = gpa - DEFAULT_APIC_BASE; + + /* + * Memory mapped local apic accesses must be 4 bytes wide and + * aligned on a 16-byte boundary. + */ + if (size != 4 || off & 0xf) + return (EINVAL); + + vlapic = vm_lapic(vm, cpu); + error = vlapic_write(vlapic, 1, off, wval, arg); + return (error); +} + +int +lapic_mmio_read(void *vm, int cpu, uint64_t gpa, uint64_t *rval, int size, + void *arg) +{ + int error; + uint64_t off; + struct vlapic *vlapic; + + off = gpa - DEFAULT_APIC_BASE; + + /* + * Memory mapped local apic accesses should be aligned on a + * 16-byte boundary. They are also suggested to be 4 bytes + * wide, alas not all OSes follow suggestions. + */ + off &= ~3; + if (off & 0xf) + return (EINVAL); + + vlapic = vm_lapic(vm, cpu); + error = vlapic_read(vlapic, 1, off, rval, arg); + return (error); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_lapic.h b/usr/src/uts/i86pc/io/vmm/vmm_lapic.h new file mode 100644 index 0000000000..da3b0ff660 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_lapic.h @@ -0,0 +1,89 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _VMM_LAPIC_H_ +#define _VMM_LAPIC_H_ + +struct vm; + +boolean_t lapic_msr(u_int num); +int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval, + bool *retu); +int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval, + bool *retu); + +int lapic_mmio_read(void *vm, int cpu, uint64_t gpa, + uint64_t *rval, int size, void *arg); +int lapic_mmio_write(void *vm, int cpu, uint64_t gpa, + uint64_t wval, int size, void *arg); + +/* + * Signals to the LAPIC that an interrupt at 'vector' needs to be generated + * to the 'cpu', the state is recorded in IRR. + */ +int lapic_set_intr(struct vm *vm, int cpu, int vector, bool trig); + +#define LAPIC_TRIG_LEVEL true +#define LAPIC_TRIG_EDGE false +static __inline int +lapic_intr_level(struct vm *vm, int cpu, int vector) +{ + + return (lapic_set_intr(vm, cpu, vector, LAPIC_TRIG_LEVEL)); +} + +static __inline int +lapic_intr_edge(struct vm *vm, int cpu, int vector) +{ + + return (lapic_set_intr(vm, cpu, vector, LAPIC_TRIG_EDGE)); +} + +/* + * Triggers the LAPIC local interrupt (LVT) 'vector' on 'cpu'. 'cpu' can + * be set to -1 to trigger the interrupt on all CPUs. + */ +int lapic_set_local_intr(struct vm *vm, int cpu, int vector); + +int lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg); + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_mem.c b/usr/src/uts/i86pc/io/vmm/vmm_mem.c new file mode 100644 index 0000000000..a736d94bba --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_mem.c @@ -0,0 +1,124 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/sglist.h> +#include <sys/lock.h> +#include <sys/rwlock.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> + +#include <machine/md_var.h> + +#include "vmm_mem.h" + +int +vmm_mem_init(void) +{ + + return (0); +} + +vm_object_t +vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa) +{ + int error; + vm_object_t obj; + struct sglist *sg; + + sg = sglist_alloc(1, M_WAITOK); + error = sglist_append_phys(sg, hpa, len); + KASSERT(error == 0, ("error %d appending physaddr to sglist", error)); + + obj = vm_pager_allocate(OBJT_SG, sg, len, VM_PROT_RW, 0, NULL); + if (obj != NULL) { + /* + * VT-x ignores the MTRR settings when figuring out the + * memory type for translations obtained through EPT. + * + * Therefore we explicitly force the pages provided by + * this object to be mapped as uncacheable. + */ + VM_OBJECT_WLOCK(obj); + error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE); + VM_OBJECT_WUNLOCK(obj); + if (error != KERN_SUCCESS) { + panic("vmm_mmio_alloc: vm_object_set_memattr error %d", + error); + } + error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0, + VMFS_NO_SPACE, VM_PROT_RW, VM_PROT_RW, 0); + if (error != KERN_SUCCESS) { + vm_object_deallocate(obj); + obj = NULL; + } + } + + /* + * Drop the reference on the sglist. + * + * If the scatter/gather object was successfully allocated then it + * has incremented the reference count on the sglist. Dropping the + * initial reference count ensures that the sglist will be freed + * when the object is deallocated. + * + * If the object could not be allocated then we end up freeing the + * sglist. + */ + sglist_free(sg); + + return (obj); +} + +void +vmm_mmio_free(struct vmspace *vmspace, vm_paddr_t gpa, size_t len) +{ + + vm_map_remove(&vmspace->vm_map, gpa, gpa + len); +} + +vm_paddr_t +vmm_mem_maxaddr(void) +{ + + return (ptoa(Maxmem)); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_mem.h b/usr/src/uts/i86pc/io/vmm/vmm_mem.h new file mode 100644 index 0000000000..e6f88fb222 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_mem.h @@ -0,0 +1,55 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _VMM_MEM_H_ +#define _VMM_MEM_H_ + +struct vmspace; +struct vm_object; + +int vmm_mem_init(void); +struct vm_object *vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa); +void vmm_mmio_free(struct vmspace *, vm_paddr_t gpa, size_t size); +vm_paddr_t vmm_mem_maxaddr(void); + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c new file mode 100644 index 0000000000..634575427e --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c @@ -0,0 +1,2261 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/cpuvar.h> +#include <sys/ioccom.h> +#include <sys/stat.h> +#include <sys/vmsystm.h> +#include <sys/ddi.h> +#include <sys/mkdev.h> +#include <sys/sunddi.h> +#include <sys/fs/dv_node.h> +#include <sys/cpuset.h> +#include <sys/id_space.h> +#include <sys/fs/sdev_plugin.h> +#include <sys/smt.h> + +#include <sys/kernel.h> +#include <sys/hma.h> +#include <sys/x86_archext.h> + +#include <sys/vmm.h> +#include <sys/vmm_instruction_emul.h> +#include <sys/vmm_dev.h> +#include <sys/vmm_impl.h> +#include <sys/vmm_drv.h> + +#include <vm/vm.h> +#include <vm/seg_dev.h> + +#include "io/ppt.h" +#include "io/vatpic.h" +#include "io/vioapic.h" +#include "io/vrtc.h" +#include "io/vhpet.h" +#include "vmm_lapic.h" +#include "vmm_stat.h" +#include "vmm_util.h" +#include "vm/vm_glue.h" + +/* + * Locking details: + * + * Driver-wide data (vmmdev_*) , including HMA and sdev registration, is + * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data + * (vmm_*) are protected by vmm_mtx. Actions requiring both locks must acquire + * vmmdev_mtx before vmm_mtx. 
The sdev plugin functions must not attempt to + * acquire vmmdev_mtx, as they could deadlock with plugin unregistration. + */ + +static kmutex_t vmmdev_mtx; +static dev_info_t *vmmdev_dip; +static hma_reg_t *vmmdev_hma_reg; +static sdev_plugin_hdl_t vmmdev_sdev_hdl; + +static kmutex_t vmm_mtx; +static list_t vmm_list; +static list_t vmm_destroy_list; +static id_space_t *vmm_minors; +static void *vmm_statep; + +static const char *vmmdev_hvm_name = "bhyve"; + +/* For sdev plugin (/dev) */ +#define VMM_SDEV_ROOT "/dev/vmm" + +/* From uts/i86pc/io/vmm/intel/vmx.c */ +extern int vmx_x86_supported(const char **); + +/* Holds and hooks from drivers external to vmm */ +struct vmm_hold { + list_node_t vmh_node; + vmm_softc_t *vmh_sc; + boolean_t vmh_release_req; + uint_t vmh_ioport_hook_cnt; +}; + +struct vmm_lease { + list_node_t vml_node; + struct vm *vml_vm; + boolean_t vml_expired; + boolean_t (*vml_expire_func)(void *); + void *vml_expire_arg; + list_node_t vml_expire_node; + struct vmm_hold *vml_hold; +}; + +static int vmm_drv_block_hook(vmm_softc_t *, boolean_t); +static void vmm_lease_break_locked(vmm_softc_t *, vmm_lease_t *); + +static int +vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) +{ + int error; + bool sysmem; + + error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem, + NULL); + if (error || mseg->len == 0) + return (error); + + if (!sysmem) { + vmm_devmem_entry_t *de; + list_t *dl = &sc->vmm_devmem_list; + + for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { + if (de->vde_segid == mseg->segid) { + break; + } + } + if (de != NULL) { + (void) strlcpy(mseg->name, de->vde_name, + sizeof (mseg->name)); + } + } else { + bzero(mseg->name, sizeof (mseg->name)); + } + + return (error); +} + +/* + * The 'devmem' hack: + * + * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments + * in the vm which appear with their own name related to the vm under /dev. + * Since this would be a hassle from an sdev perspective and would require a + * new cdev interface (or complicate the existing one), we choose to implement + * this in a different manner. When 'devmem' mappings are created, an + * identifying off_t is communicated back out to userspace. That off_t, + * residing above the normal guest memory space, can be used to mmap the + * 'devmem' mapping from the already-open vm device. + */ + +static int +vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name) +{ + off_t map_offset; + vmm_devmem_entry_t *entry; + + if (list_is_empty(&sc->vmm_devmem_list)) { + map_offset = VM_DEVMEM_START; + } else { + entry = list_tail(&sc->vmm_devmem_list); + map_offset = entry->vde_off + entry->vde_len; + if (map_offset < entry->vde_off) { + /* Do not tolerate overflow */ + return (ERANGE); + } + /* + * XXXJOY: We could choose to search the list for duplicate + * names and toss an error. Since we're using the offset + * method for now, it does not make much of a difference. 
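+ *
+ * For illustration, the consumer-side flow for the offset scheme
+ * described above would look roughly like the sketch below, where
+ * 'vmfd' is the already-open vm device and 'seglen' is the full
+ * length of the segment (both assumed to be known to the caller):
+ *
+ *	struct vm_devmem_offset vdo;
+ *
+ *	vdo.segid = segid;
+ *	if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) == 0) {
+ *		base = mmap(NULL, seglen, PROT_READ | PROT_WRITE,
+ *		    MAP_SHARED, vmfd, vdo.offset);
+ *	}
+ *
+ * Since vmm_segmap() matches devmem mappings on exact offset and
+ * length, such a mapping must cover the entire segment.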
+ */ + } + + entry = kmem_zalloc(sizeof (*entry), KM_SLEEP); + entry->vde_segid = mseg->segid; + entry->vde_len = mseg->len; + entry->vde_off = map_offset; + (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name)); + list_insert_tail(&sc->vmm_devmem_list, entry); + + return (0); +} + +static boolean_t +vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp) +{ + list_t *dl = &sc->vmm_devmem_list; + vmm_devmem_entry_t *de = NULL; + + VERIFY(off >= VM_DEVMEM_START); + + for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { + /* XXX: Only hit on direct offset/length matches for now */ + if (de->vde_off == off && de->vde_len == len) { + break; + } + } + if (de == NULL) { + return (B_FALSE); + } + + *segidp = de->vde_segid; + return (B_TRUE); +} + +static void +vmmdev_devmem_purge(vmm_softc_t *sc) +{ + vmm_devmem_entry_t *entry; + + while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) { + kmem_free(entry, sizeof (*entry)); + } +} + +static int +vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) +{ + int error; + bool sysmem = true; + + if (VM_MEMSEG_NAME(mseg)) { + sysmem = false; + } + error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem); + + if (error == 0 && VM_MEMSEG_NAME(mseg)) { + /* + * Rather than create a whole fresh device from which userspace + * can mmap this segment, instead make it available at an + * offset above where the main guest memory resides. + */ + error = vmmdev_devmem_create(sc, mseg, mseg->name); + if (error != 0) { + vm_free_memseg(sc->vmm_vm, mseg->segid); + } + } + return (error); +} + +/* + * Resource Locking and Exclusion + * + * Much of bhyve depends on key portions of VM state, such as the guest memory + * map, to remain unchanged while the guest is running. As ported from + * FreeBSD, the initial strategy for this resource exclusion hinged on gating + * access to the instance vCPUs. Threads acting on a single vCPU, like those + * performing the work of actually running the guest in VMX/SVM, would lock + * only that vCPU during ioctl() entry. For ioctls which would change VM-wide + * state, all of the vCPUs would be first locked, ensuring that the + * operation(s) could complete without any other threads stumbling into + * intermediate states. + * + * This approach is largely effective for bhyve. Common operations, such as + * running the vCPUs, steer clear of lock contention. The model begins to + * break down for operations which do not occur in the context of a specific + * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker + * thread in the bhyve process. In order to properly protect those vCPU-less + * operations from encountering invalid states, additional locking is required. + * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU. + * It does mean that class of operations will be serialized on locking the + * specific vCPU and that instances sized at VM_MAXCPU will potentially see + * undue contention on the VM_MAXCPU-1 vCPU. + * + * In order to address the shortcomings of this model, the concept of a + * read/write lock has been added to bhyve. Operations which change + * fundamental aspects of a VM (such as the memory map) must acquire the write + * lock, which also implies locking all of the vCPUs and waiting for all read + * lock holders to release. 
While it increases the cost and waiting time for + * those few operations, it allows most hot-path operations on the VM (which + * depend on its configuration remaining stable) to occur with minimal locking. + * + * Consumers of the Driver API (see below) are a special case when it comes to + * this locking, since they may hold a read lock via the drv_lease mechanism + * for an extended period of time. Rather than forcing those consumers to + * continuously poll for a write lock attempt, the lease system forces them to + * provide a release callback to trigger their clean-up (and potential later + * reacquisition) of the read lock. + */ + +static void +vcpu_lock_one(vmm_softc_t *sc, int vcpu) +{ + ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); + + /* + * Since this state transition is utilizing from_idle=true, it should + * not fail, but rather block until it can be successful. + */ + VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true)); +} + +static void +vcpu_unlock_one(vmm_softc_t *sc, int vcpu) +{ + ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); + + VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN); + vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false); +} + +static void +vmm_read_lock(vmm_softc_t *sc) +{ + rw_enter(&sc->vmm_rwlock, RW_READER); +} + +static void +vmm_read_unlock(vmm_softc_t *sc) +{ + rw_exit(&sc->vmm_rwlock); +} + +static void +vmm_write_lock(vmm_softc_t *sc) +{ + int maxcpus; + + /* First lock all the vCPUs */ + maxcpus = vm_get_maxcpus(sc->vmm_vm); + for (int vcpu = 0; vcpu < maxcpus; vcpu++) { + vcpu_lock_one(sc, vcpu); + } + + mutex_enter(&sc->vmm_lease_lock); + VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX); + sc->vmm_lease_blocker++; + if (sc->vmm_lease_blocker == 1) { + list_t *list = &sc->vmm_lease_list; + vmm_lease_t *lease = list_head(list); + + while (lease != NULL) { + boolean_t sync_break = B_FALSE; + + if (!lease->vml_expired) { + void *arg = lease->vml_expire_arg; + lease->vml_expired = B_TRUE; + sync_break = lease->vml_expire_func(arg); + } + + if (sync_break) { + vmm_lease_t *next; + + /* + * These leases which are synchronously broken + * result in vmm_read_unlock() calls from a + * different thread than the corresponding + * vmm_read_lock(). This is acceptable, given + * that the rwlock underpinning the whole + * mechanism tolerates the behavior. This + * flexibility is _only_ afforded to VM read + * lock (RW_READER) holders. + */ + next = list_next(list, lease); + vmm_lease_break_locked(sc, lease); + lease = next; + } else { + lease = list_next(list, lease); + } + } + } + mutex_exit(&sc->vmm_lease_lock); + + rw_enter(&sc->vmm_rwlock, RW_WRITER); + /* + * For now, the 'maxcpus' value for an instance is fixed at the + * compile-time constant of VM_MAXCPU at creation. If this changes in + * the future, allowing for dynamic vCPU resource sizing, acquisition + * of the write lock will need to be wary of such changes. + */ + VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm)); +} + +static void +vmm_write_unlock(vmm_softc_t *sc) +{ + int maxcpus; + + mutex_enter(&sc->vmm_lease_lock); + VERIFY3U(sc->vmm_lease_blocker, !=, 0); + sc->vmm_lease_blocker--; + if (sc->vmm_lease_blocker == 0) { + cv_broadcast(&sc->vmm_lease_cv); + } + mutex_exit(&sc->vmm_lease_lock); + + /* + * The VM write lock _must_ be released from the same thread it was + * acquired in, unlike the read lock. 
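+ *
+ * A sketch of the expected usage, mirroring how vmmdev_do_ioctl()
+ * brackets VM-wide operations such as VM_ALLOC_MEMSEG:
+ *
+ *	vmm_write_lock(sc);
+ *	error = vmmdev_alloc_memseg(sc, &mseg);
+ *	vmm_write_unlock(sc);
+ *
+ * with both lock calls made from the same thread.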
+ */ + VERIFY(rw_write_held(&sc->vmm_rwlock)); + rw_exit(&sc->vmm_rwlock); + + /* Unlock all the vCPUs */ + maxcpus = vm_get_maxcpus(sc->vmm_vm); + for (int vcpu = 0; vcpu < maxcpus; vcpu++) { + vcpu_unlock_one(sc, vcpu); + } +} + +static int +vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, + cred_t *credp, int *rvalp) +{ + int error = 0, vcpu = -1; + void *datap = (void *)arg; + enum vm_lock_type { + LOCK_NONE = 0, + LOCK_VCPU, + LOCK_READ_HOLD, + LOCK_WRITE_HOLD + } lock_type = LOCK_NONE; + + /* Acquire any exclusion resources needed for the operation. */ + switch (cmd) { + case VM_RUN: + case VM_GET_REGISTER: + case VM_SET_REGISTER: + case VM_GET_SEGMENT_DESCRIPTOR: + case VM_SET_SEGMENT_DESCRIPTOR: + case VM_GET_REGISTER_SET: + case VM_SET_REGISTER_SET: + case VM_INJECT_EXCEPTION: + case VM_GET_CAPABILITY: + case VM_SET_CAPABILITY: + case VM_PPTDEV_MSI: + case VM_PPTDEV_MSIX: + case VM_SET_X2APIC_STATE: + case VM_GLA2GPA: + case VM_GLA2GPA_NOFAULT: + case VM_ACTIVATE_CPU: + case VM_SET_INTINFO: + case VM_GET_INTINFO: + case VM_RESTART_INSTRUCTION: + /* + * Copy in the ID of the vCPU chosen for this operation. + * Since a nefarious caller could update their struct between + * this locking and when the rest of the ioctl data is copied + * in, it is _critical_ that this local 'vcpu' variable be used + * rather than the in-struct one when performing the ioctl. + */ + if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { + return (EFAULT); + } + if (vcpu < 0 || vcpu > vm_get_maxcpus(sc->vmm_vm)) { + return (EINVAL); + } + vcpu_lock_one(sc, vcpu); + lock_type = LOCK_VCPU; + break; + + case VM_REINIT: + case VM_BIND_PPTDEV: + case VM_UNBIND_PPTDEV: + case VM_MAP_PPTDEV_MMIO: + case VM_ALLOC_MEMSEG: + case VM_MMAP_MEMSEG: + case VM_WRLOCK_CYCLE: + vmm_write_lock(sc); + lock_type = LOCK_WRITE_HOLD; + break; + + case VM_GET_GPA_PMAP: + case VM_GET_MEMSEG: + case VM_MMAP_GETNEXT: + case VM_LAPIC_IRQ: + case VM_INJECT_NMI: + case VM_IOAPIC_ASSERT_IRQ: + case VM_IOAPIC_DEASSERT_IRQ: + case VM_IOAPIC_PULSE_IRQ: + case VM_LAPIC_MSI: + case VM_LAPIC_LOCAL_IRQ: + case VM_GET_X2APIC_STATE: + case VM_RTC_READ: + case VM_RTC_WRITE: + case VM_RTC_SETTIME: + case VM_RTC_GETTIME: +#ifndef __FreeBSD__ + case VM_DEVMEM_GETOFFSET: +#endif + vmm_read_lock(sc); + lock_type = LOCK_READ_HOLD; + break; + + case VM_IOAPIC_PINCOUNT: + default: + break; + } + + /* Execute the primary logic for the ioctl. */ + switch (cmd) { + case VM_RUN: { + struct vm_run vmrun; + + if (ddi_copyin(datap, &vmrun, sizeof (vmrun), md)) { + error = EFAULT; + break; + } + vmrun.cpuid = vcpu; + + if (!(curthread->t_schedflag & TS_VCPU)) + smt_mark_as_vcpu(); + + error = vm_run(sc->vmm_vm, &vmrun); + /* + * XXXJOY: I think it's necessary to do copyout, even in the + * face of errors, since the exit state is communicated out. + */ + if (ddi_copyout(&vmrun, datap, sizeof (vmrun), md)) { + error = EFAULT; + break; + } + break; + } + case VM_SUSPEND: { + struct vm_suspend vmsuspend; + + if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) { + error = EFAULT; + break; + } + error = vm_suspend(sc->vmm_vm, vmsuspend.how); + break; + } + case VM_REINIT: + if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) { + /* + * The VM instance should be free of driver-attached + * hooks during the reinitialization process. 
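+ *
+ * In practice this means that while any vmm_drv consumer still has
+ * an ioport hook installed, vmm_drv_block_hook() refuses the block
+ * and the VM_REINIT ioctl fails with EBUSY until the hook is
+ * removed.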
+ */ + break; + } + error = vm_reinit(sc->vmm_vm); + (void) vmm_drv_block_hook(sc, B_FALSE); + break; + case VM_STAT_DESC: { + struct vm_stat_desc statdesc; + + if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) { + error = EFAULT; + break; + } + error = vmm_stat_desc_copy(statdesc.index, statdesc.desc, + sizeof (statdesc.desc)); + if (error == 0 && + ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) { + error = EFAULT; + break; + } + break; + } + case VM_STATS_IOC: { + struct vm_stats vmstats; + + CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS); + if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) { + error = EFAULT; + break; + } + hrt2tv(gethrtime(), &vmstats.tv); + error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, + &vmstats.num_entries, vmstats.statbuf); + if (error == 0 && + ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) { + error = EFAULT; + break; + } + break; + } + + case VM_PPTDEV_MSI: { + struct vm_pptdev_msi pptmsi; + + if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) { + error = EFAULT; + break; + } + error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd, + pptmsi.addr, pptmsi.msg, pptmsi.numvec); + break; + } + case VM_PPTDEV_MSIX: { + struct vm_pptdev_msix pptmsix; + + if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) { + error = EFAULT; + break; + } + error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd, + pptmsix.idx, pptmsix.addr, pptmsix.msg, + pptmsix.vector_control); + break; + } + case VM_MAP_PPTDEV_MMIO: { + struct vm_pptdev_mmio pptmmio; + + if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { + error = EFAULT; + break; + } + error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, + pptmmio.len, pptmmio.hpa); + break; + } + case VM_BIND_PPTDEV: { + struct vm_pptdev pptdev; + + if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { + error = EFAULT; + break; + } + error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd); + break; + } + case VM_UNBIND_PPTDEV: { + struct vm_pptdev pptdev; + + if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { + error = EFAULT; + break; + } + error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd); + break; + } + case VM_GET_PPTDEV_LIMITS: { + struct vm_pptdev_limits pptlimits; + + if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) { + error = EFAULT; + break; + } + error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd, + &pptlimits.msi_limit, &pptlimits.msix_limit); + if (error == 0 && + ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) { + error = EFAULT; + break; + } + break; + } + case VM_INJECT_EXCEPTION: { + struct vm_exception vmexc; + if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) { + error = EFAULT; + break; + } + error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector, + vmexc.error_code_valid, vmexc.error_code, + vmexc.restart_instruction); + break; + } + case VM_INJECT_NMI: { + struct vm_nmi vmnmi; + + if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) { + error = EFAULT; + break; + } + error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid); + break; + } + case VM_LAPIC_IRQ: { + struct vm_lapic_irq vmirq; + + if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { + error = EFAULT; + break; + } + error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector); + break; + } + case VM_LAPIC_LOCAL_IRQ: { + struct vm_lapic_irq vmirq; + + if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { + error = EFAULT; + break; + } + error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid, + vmirq.vector); + break; + } + case VM_LAPIC_MSI: { + struct vm_lapic_msi vmmsi; + + if (ddi_copyin(datap, 
&vmmsi, sizeof (vmmsi), md)) { + error = EFAULT; + break; + } + error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg); + break; + } + + case VM_IOAPIC_ASSERT_IRQ: { + struct vm_ioapic_irq ioapic_irq; + + if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { + error = EFAULT; + break; + } + error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq); + break; + } + case VM_IOAPIC_DEASSERT_IRQ: { + struct vm_ioapic_irq ioapic_irq; + + if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { + error = EFAULT; + break; + } + error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq); + break; + } + case VM_IOAPIC_PULSE_IRQ: { + struct vm_ioapic_irq ioapic_irq; + + if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { + error = EFAULT; + break; + } + error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq); + break; + } + case VM_IOAPIC_PINCOUNT: { + int pincount; + + pincount = vioapic_pincount(sc->vmm_vm); + if (ddi_copyout(&pincount, datap, sizeof (int), md)) { + error = EFAULT; + break; + } + break; + } + + case VM_ISA_ASSERT_IRQ: { + struct vm_isa_irq isa_irq; + + if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { + error = EFAULT; + break; + } + error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq); + if (error == 0 && isa_irq.ioapic_irq != -1) { + error = vioapic_assert_irq(sc->vmm_vm, + isa_irq.ioapic_irq); + } + break; + } + case VM_ISA_DEASSERT_IRQ: { + struct vm_isa_irq isa_irq; + + if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { + error = EFAULT; + break; + } + error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq); + if (error == 0 && isa_irq.ioapic_irq != -1) { + error = vioapic_deassert_irq(sc->vmm_vm, + isa_irq.ioapic_irq); + } + break; + } + case VM_ISA_PULSE_IRQ: { + struct vm_isa_irq isa_irq; + + if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { + error = EFAULT; + break; + } + error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq); + if (error == 0 && isa_irq.ioapic_irq != -1) { + error = vioapic_pulse_irq(sc->vmm_vm, + isa_irq.ioapic_irq); + } + break; + } + case VM_ISA_SET_IRQ_TRIGGER: { + struct vm_isa_irq_trigger isa_irq_trigger; + + if (ddi_copyin(datap, &isa_irq_trigger, + sizeof (isa_irq_trigger), md)) { + error = EFAULT; + break; + } + error = vatpic_set_irq_trigger(sc->vmm_vm, + isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger); + break; + } + + case VM_MMAP_GETNEXT: { + struct vm_memmap mm; + + if (ddi_copyin(datap, &mm, sizeof (mm), md)) { + error = EFAULT; + break; + } + error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid, + &mm.segoff, &mm.len, &mm.prot, &mm.flags); + if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) { + error = EFAULT; + break; + } + break; + } + case VM_MMAP_MEMSEG: { + struct vm_memmap mm; + + if (ddi_copyin(datap, &mm, sizeof (mm), md)) { + error = EFAULT; + break; + } + error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff, + mm.len, mm.prot, mm.flags); + break; + } + case VM_ALLOC_MEMSEG: { + struct vm_memseg vmseg; + + if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { + error = EFAULT; + break; + } + error = vmmdev_alloc_memseg(sc, &vmseg); + break; + } + case VM_GET_MEMSEG: { + struct vm_memseg vmseg; + + if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { + error = EFAULT; + break; + } + error = vmmdev_get_memseg(sc, &vmseg); + if (error == 0 && + ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) { + error = EFAULT; + break; + } + break; + } + case VM_GET_REGISTER: { + struct vm_register vmreg; + + if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) { + error = 
EFAULT; + break; + } + error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum, + &vmreg.regval); + if (error == 0 && + ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) { + error = EFAULT; + break; + } + break; + } + case VM_SET_REGISTER: { + struct vm_register vmreg; + + if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) { + error = EFAULT; + break; + } + error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum, + vmreg.regval); + break; + } + case VM_SET_SEGMENT_DESCRIPTOR: { + struct vm_seg_desc vmsegd; + + if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) { + error = EFAULT; + break; + } + error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum, + &vmsegd.desc); + break; + } + case VM_GET_SEGMENT_DESCRIPTOR: { + struct vm_seg_desc vmsegd; + + if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) { + error = EFAULT; + break; + } + error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum, + &vmsegd.desc); + if (error == 0 && + ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) { + error = EFAULT; + break; + } + break; + } + case VM_GET_REGISTER_SET: { + struct vm_register_set vrs; + int regnums[VM_REG_LAST]; + uint64_t regvals[VM_REG_LAST]; + + if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { + error = EFAULT; + break; + } + if (vrs.count > VM_REG_LAST || vrs.count == 0) { + error = EINVAL; + break; + } + if (ddi_copyin(vrs.regnums, regnums, + sizeof (int) * vrs.count, md)) { + error = EFAULT; + break; + } + + error = 0; + for (uint_t i = 0; i < vrs.count && error == 0; i++) { + if (regnums[i] < 0) { + error = EINVAL; + break; + } + error = vm_get_register(sc->vmm_vm, vcpu, regnums[i], + ®vals[i]); + } + if (error == 0 && ddi_copyout(regvals, vrs.regvals, + sizeof (uint64_t) * vrs.count, md)) { + error = EFAULT; + } + break; + } + case VM_SET_REGISTER_SET: { + struct vm_register_set vrs; + int regnums[VM_REG_LAST]; + uint64_t regvals[VM_REG_LAST]; + + if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { + error = EFAULT; + break; + } + if (vrs.count > VM_REG_LAST || vrs.count == 0) { + error = EINVAL; + break; + } + if (ddi_copyin(vrs.regnums, regnums, + sizeof (int) * vrs.count, md)) { + error = EFAULT; + break; + } + if (ddi_copyin(vrs.regvals, regvals, + sizeof (uint64_t) * vrs.count, md)) { + error = EFAULT; + break; + } + + error = 0; + for (uint_t i = 0; i < vrs.count && error == 0; i++) { + /* + * Setting registers in a set is not atomic, since a + * failure in the middle of the set will cause a + * bail-out and inconsistent register state. Callers + * should be wary of this. 
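+ *
+ * A caller needing all-or-nothing behaviour can approximate it
+ * along these lines (a sketch only, with error handling elided;
+ * 'vmfd', 'saved' and 'vrs' are assumed to describe the same vcpu
+ * and register list):
+ *
+ *	(void) ioctl(vmfd, VM_GET_REGISTER_SET, &saved);
+ *	if (ioctl(vmfd, VM_SET_REGISTER_SET, &vrs) != 0)
+ *		(void) ioctl(vmfd, VM_SET_REGISTER_SET, &saved);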
+ */ + if (regnums[i] < 0) { + error = EINVAL; + break; + } + error = vm_set_register(sc->vmm_vm, vcpu, regnums[i], + regvals[i]); + } + break; + } + + case VM_GET_CAPABILITY: { + struct vm_capability vmcap; + + if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { + error = EFAULT; + break; + } + error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype, + &vmcap.capval); + if (error == 0 && + ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) { + error = EFAULT; + break; + } + break; + } + case VM_SET_CAPABILITY: { + struct vm_capability vmcap; + + if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { + error = EFAULT; + break; + } + error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype, + vmcap.capval); + break; + } + case VM_SET_X2APIC_STATE: { + struct vm_x2apic x2apic; + + if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { + error = EFAULT; + break; + } + error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state); + break; + } + case VM_GET_X2APIC_STATE: { + struct vm_x2apic x2apic; + + if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { + error = EFAULT; + break; + } + error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid, + &x2apic.state); + if (error == 0 && + ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) { + error = EFAULT; + break; + } + break; + } + case VM_GET_GPA_PMAP: { + struct vm_gpa_pte gpapte; + + if (ddi_copyin(datap, &gpapte, sizeof (gpapte), md)) { + error = EFAULT; + break; + } +#ifdef __FreeBSD__ + /* XXXJOY: add function? */ + pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vmm_vm)), + gpapte.gpa, gpapte.pte, &gpapte.ptenum); +#endif + error = 0; + break; + } + case VM_GET_HPET_CAPABILITIES: { + struct vm_hpet_cap hpetcap; + + error = vhpet_getcap(&hpetcap); + if (error == 0 && + ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) { + error = EFAULT; + break; + } + break; + } + case VM_GLA2GPA: { + struct vm_gla2gpa gg; + + CTASSERT(PROT_READ == VM_PROT_READ); + CTASSERT(PROT_WRITE == VM_PROT_WRITE); + CTASSERT(PROT_EXEC == VM_PROT_EXECUTE); + + if (ddi_copyin(datap, &gg, sizeof (gg), md)) { + error = EFAULT; + break; + } + gg.vcpuid = vcpu; + error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla, + gg.prot, &gg.gpa, &gg.fault); + if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { + error = EFAULT; + break; + } + break; + } + case VM_GLA2GPA_NOFAULT: { + struct vm_gla2gpa gg; + + CTASSERT(PROT_READ == VM_PROT_READ); + CTASSERT(PROT_WRITE == VM_PROT_WRITE); + CTASSERT(PROT_EXEC == VM_PROT_EXECUTE); + + if (ddi_copyin(datap, &gg, sizeof (gg), md)) { + error = EFAULT; + break; + } + gg.vcpuid = vcpu; + error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging, + gg.gla, gg.prot, &gg.gpa, &gg.fault); + if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { + error = EFAULT; + break; + } + break; + } + + case VM_ACTIVATE_CPU: + error = vm_activate_cpu(sc->vmm_vm, vcpu); + break; + + case VM_SUSPEND_CPU: + if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { + error = EFAULT; + } else { + error = vm_suspend_cpu(sc->vmm_vm, vcpu); + } + break; + + case VM_RESUME_CPU: + if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { + error = EFAULT; + } else { + error = vm_resume_cpu(sc->vmm_vm, vcpu); + } + break; + + case VM_GET_CPUS: { + struct vm_cpuset vm_cpuset; + cpuset_t tempset; + void *srcp = &tempset; + int size; + + if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) { + error = EFAULT; + break; + } + + /* Be more generous about sizing since our cpuset_t is large. 
*/ + size = vm_cpuset.cpusetsize; + if (size <= 0 || size > sizeof (cpuset_t)) { + error = ERANGE; + } + /* + * If they want a ulong_t or less, make sure they receive the + * low bits with all the useful information. + */ + if (size <= sizeof (tempset.cpub[0])) { + srcp = &tempset.cpub[0]; + } + + if (vm_cpuset.which == VM_ACTIVE_CPUS) { + tempset = vm_active_cpus(sc->vmm_vm); + } else if (vm_cpuset.which == VM_SUSPENDED_CPUS) { + tempset = vm_suspended_cpus(sc->vmm_vm); + } else if (vm_cpuset.which == VM_DEBUG_CPUS) { + tempset = vm_debug_cpus(sc->vmm_vm); + } else { + error = EINVAL; + } + + ASSERT(size > 0 && size <= sizeof (tempset)); + if (error == 0 && + ddi_copyout(srcp, vm_cpuset.cpus, size, md)) { + error = EFAULT; + break; + } + break; + } + case VM_SET_INTINFO: { + struct vm_intinfo vmii; + + if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) { + error = EFAULT; + break; + } + error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1); + break; + } + case VM_GET_INTINFO: { + struct vm_intinfo vmii; + + vmii.vcpuid = vcpu; + error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1, + &vmii.info2); + if (error == 0 && + ddi_copyout(&vmii, datap, sizeof (vmii), md)) { + error = EFAULT; + break; + } + break; + } + case VM_RTC_WRITE: { + struct vm_rtc_data rtcdata; + + if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { + error = EFAULT; + break; + } + error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset, + rtcdata.value); + break; + } + case VM_RTC_READ: { + struct vm_rtc_data rtcdata; + + if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { + error = EFAULT; + break; + } + error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset, + &rtcdata.value); + if (error == 0 && + ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) { + error = EFAULT; + break; + } + break; + } + case VM_RTC_SETTIME: { + struct vm_rtc_time rtctime; + + if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) { + error = EFAULT; + break; + } + error = vrtc_set_time(sc->vmm_vm, rtctime.secs); + break; + } + case VM_RTC_GETTIME: { + struct vm_rtc_time rtctime; + + rtctime.secs = vrtc_get_time(sc->vmm_vm); + if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) { + error = EFAULT; + break; + } + break; + } + + case VM_RESTART_INSTRUCTION: + error = vm_restart_instruction(sc->vmm_vm, vcpu); + break; + + case VM_SET_TOPOLOGY: { + struct vm_cpu_topology topo; + + if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) { + error = EFAULT; + break; + } + error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores, + topo.threads, topo.maxcpus); + break; + } + case VM_GET_TOPOLOGY: { + struct vm_cpu_topology topo; + + vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores, + &topo.threads, &topo.maxcpus); + if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) { + error = EFAULT; + break; + } + break; + } + +#ifndef __FreeBSD__ + case VM_DEVMEM_GETOFFSET: { + struct vm_devmem_offset vdo; + list_t *dl = &sc->vmm_devmem_list; + vmm_devmem_entry_t *de = NULL; + + if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) { + error = EFAULT; + break; + } + + for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { + if (de->vde_segid == vdo.segid) { + break; + } + } + if (de != NULL) { + vdo.offset = de->vde_off; + if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) { + error = EFAULT; + } + } else { + error = ENOENT; + } + break; + } + case VM_WRLOCK_CYCLE: { + /* + * Present a test mechanism to acquire/release the write lock + * on the VM without any other effects. 
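+ *
+ * For illustration, a test can force a full write-lock cycle, and
+ * with it the expiration of any outstanding leases, with nothing
+ * more than the bare ioctl ('vmfd' being an open handle on the
+ * VM's minor node):
+ *
+ *	(void) ioctl(vmfd, VM_WRLOCK_CYCLE, 0);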
+ */ + break; + } +#endif + default: + error = ENOTTY; + break; + } + + /* Release exclusion resources */ + switch (lock_type) { + case LOCK_NONE: + break; + case LOCK_VCPU: + vcpu_unlock_one(sc, vcpu); + break; + case LOCK_READ_HOLD: + vmm_read_unlock(sc); + break; + case LOCK_WRITE_HOLD: + vmm_write_unlock(sc); + break; + default: + panic("unexpected lock type"); + break; + } + + return (error); +} + +static vmm_softc_t * +vmm_lookup(const char *name) +{ + list_t *vml = &vmm_list; + vmm_softc_t *sc; + + ASSERT(MUTEX_HELD(&vmm_mtx)); + + for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) { + if (strcmp(sc->vmm_name, name) == 0) { + break; + } + } + + return (sc); +} + +static int +vmmdev_do_vm_create(char *name, cred_t *cr) +{ + vmm_softc_t *sc = NULL; + minor_t minor; + int error = ENOMEM; + + if (strnlen(name, VM_MAX_NAMELEN) >= VM_MAX_NAMELEN) { + return (EINVAL); + } + + mutex_enter(&vmm_mtx); + + /* Look for duplicates names */ + if (vmm_lookup(name) != NULL) { + mutex_exit(&vmm_mtx); + return (EEXIST); + } + + /* Allow only one instance per non-global zone. */ + if (!INGLOBALZONE(curproc)) { + for (sc = list_head(&vmm_list); sc != NULL; + sc = list_next(&vmm_list, sc)) { + if (sc->vmm_zone == curzone) { + mutex_exit(&vmm_mtx); + return (EINVAL); + } + } + } + + minor = id_alloc(vmm_minors); + if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) { + goto fail; + } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { + ddi_soft_state_free(vmm_statep, minor); + goto fail; + } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor, + DDI_PSEUDO, 0) != DDI_SUCCESS) { + goto fail; + } + + error = vm_create(name, &sc->vmm_vm); + if (error == 0) { + /* Complete VM intialization and report success. */ + (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name)); + sc->vmm_minor = minor; + list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t), + offsetof(vmm_devmem_entry_t, vde_node)); + + list_create(&sc->vmm_holds, sizeof (vmm_hold_t), + offsetof(vmm_hold_t, vmh_node)); + cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL); + + mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t), + offsetof(vmm_lease_t, vml_node)); + cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL); + rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL); + + sc->vmm_zone = crgetzone(cr); + zone_hold(sc->vmm_zone); + vmm_zsd_add_vm(sc); + + list_insert_tail(&vmm_list, sc); + mutex_exit(&vmm_mtx); + return (0); + } + + ddi_remove_minor_node(vmmdev_dip, name); +fail: + id_free(vmm_minors, minor); + if (sc != NULL) { + ddi_soft_state_free(vmm_statep, minor); + } + mutex_exit(&vmm_mtx); + + return (error); +} + +/* + * Bhyve 'Driver' Interface + * + * While many devices are emulated in the bhyve userspace process, there are + * others with performance constraints which require that they run mostly or + * entirely in-kernel. For those not integrated directly into bhyve, an API is + * needed so they can query/manipulate the portions of VM state needed to + * fulfill their purpose. + * + * This includes: + * - Translating guest-physical addresses to host-virtual pointers + * - Injecting MSIs + * - Hooking IO port addresses + * + * The vmm_drv interface exists to provide that functionality to its consumers. 
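+ *
+ * A rough sketch of the consumer-side flow, with error paths elided
+ * ('fp', 'cr', 'expire_cb', 'arg', 'gpa', 'len' and 'buf' are
+ * assumed to come from the consumer's own context):
+ *
+ *	vmm_hold_t *hold;
+ *	vmm_lease_t *lease;
+ *
+ *	if (vmm_drv_hold(fp, cr, &hold) != 0)
+ *		return;
+ *	lease = vmm_drv_lease_sign(hold, expire_cb, arg);
+ *	buf = vmm_drv_gpa2kva(lease, gpa, len);
+ *	...
+ *	vmm_drv_lease_break(hold, lease);
+ *	vmm_drv_rele(hold);
+ *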
+ * (At this time, 'viona' is the only user) + */ +int +vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp) +{ + vnode_t *vp = fp->f_vnode; + const dev_t dev = vp->v_rdev; + vmm_softc_t *sc; + vmm_hold_t *hold; + int err = 0; + + if (vp->v_type != VCHR) { + return (ENXIO); + } + const major_t major = getmajor(dev); + const minor_t minor = getminor(dev); + + mutex_enter(&vmmdev_mtx); + if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) { + mutex_exit(&vmmdev_mtx); + return (ENOENT); + } + mutex_enter(&vmm_mtx); + mutex_exit(&vmmdev_mtx); + + if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { + err = ENOENT; + goto out; + } + /* XXXJOY: check cred permissions against instance */ + + if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) { + err = EBUSY; + goto out; + } + + hold = kmem_zalloc(sizeof (*hold), KM_SLEEP); + hold->vmh_sc = sc; + hold->vmh_release_req = B_FALSE; + + list_insert_tail(&sc->vmm_holds, hold); + sc->vmm_flags |= VMM_HELD; + *holdp = hold; + +out: + mutex_exit(&vmm_mtx); + return (err); +} + +void +vmm_drv_rele(vmm_hold_t *hold) +{ + vmm_softc_t *sc; + + ASSERT(hold != NULL); + ASSERT(hold->vmh_sc != NULL); + VERIFY(hold->vmh_ioport_hook_cnt == 0); + + mutex_enter(&vmm_mtx); + sc = hold->vmh_sc; + list_remove(&sc->vmm_holds, hold); + if (list_is_empty(&sc->vmm_holds)) { + sc->vmm_flags &= ~VMM_HELD; + cv_broadcast(&sc->vmm_cv); + } + mutex_exit(&vmm_mtx); + kmem_free(hold, sizeof (*hold)); +} + +boolean_t +vmm_drv_release_reqd(vmm_hold_t *hold) +{ + ASSERT(hold != NULL); + + return (hold->vmh_release_req); +} + +vmm_lease_t * +vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg) +{ + vmm_softc_t *sc = hold->vmh_sc; + vmm_lease_t *lease; + + ASSERT3P(expiref, !=, NULL); + + if (hold->vmh_release_req) { + return (NULL); + } + + lease = kmem_alloc(sizeof (*lease), KM_SLEEP); + list_link_init(&lease->vml_node); + lease->vml_expire_func = expiref; + lease->vml_expire_arg = arg; + lease->vml_expired = B_FALSE; + lease->vml_hold = hold; + /* cache the VM pointer for one less pointer chase */ + lease->vml_vm = sc->vmm_vm; + + mutex_enter(&sc->vmm_lease_lock); + while (sc->vmm_lease_blocker != 0) { + cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); + } + list_insert_tail(&sc->vmm_lease_list, lease); + vmm_read_lock(sc); + mutex_exit(&sc->vmm_lease_lock); + + return (lease); +} + +static void +vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease) +{ + ASSERT(MUTEX_HELD(&sc->vmm_lease_lock)); + + list_remove(&sc->vmm_lease_list, lease); + vmm_read_unlock(sc); + kmem_free(lease, sizeof (*lease)); +} + +void +vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease) +{ + vmm_softc_t *sc = hold->vmh_sc; + + VERIFY3P(hold, ==, lease->vml_hold); + + mutex_enter(&sc->vmm_lease_lock); + vmm_lease_break_locked(sc, lease); + mutex_exit(&sc->vmm_lease_lock); +} + +boolean_t +vmm_drv_lease_expired(vmm_lease_t *lease) +{ + return (lease->vml_expired); +} + +void * +vmm_drv_gpa2kva(vmm_lease_t *lease, uintptr_t gpa, size_t sz) +{ + ASSERT(lease != NULL); + + return (vmspace_find_kva(vm_get_vmspace(lease->vml_vm), gpa, sz)); +} + +int +vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg) +{ + ASSERT(lease != NULL); + + return (lapic_intr_msi(lease->vml_vm, addr, msg)); +} + +int +vmm_drv_ioport_hook(vmm_hold_t *hold, uint_t ioport, vmm_drv_rmem_cb_t rfunc, + vmm_drv_wmem_cb_t wfunc, void *arg, void **cookie) +{ + vmm_softc_t *sc; + int err; + + ASSERT(hold != NULL); + ASSERT(cookie != NULL); + + sc = hold->vmh_sc; + 
mutex_enter(&vmm_mtx); + /* Confirm that hook installation is not blocked */ + if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) { + mutex_exit(&vmm_mtx); + return (EBUSY); + } + /* + * Optimistically record an installed hook which will prevent a block + * from being asserted while the mutex is dropped. + */ + hold->vmh_ioport_hook_cnt++; + mutex_exit(&vmm_mtx); + + vmm_write_lock(sc); + err = vm_ioport_hook(sc->vmm_vm, ioport, (vmm_rmem_cb_t)rfunc, + (vmm_wmem_cb_t)wfunc, arg, cookie); + vmm_write_unlock(sc); + + if (err != 0) { + mutex_enter(&vmm_mtx); + /* Walk back optimism about the hook installation */ + hold->vmh_ioport_hook_cnt--; + mutex_exit(&vmm_mtx); + } + return (err); +} + +void +vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie) +{ + vmm_softc_t *sc; + + ASSERT(hold != NULL); + ASSERT(cookie != NULL); + ASSERT(hold->vmh_ioport_hook_cnt != 0); + + sc = hold->vmh_sc; + vmm_write_lock(sc); + vm_ioport_unhook(sc->vmm_vm, cookie); + vmm_write_unlock(sc); + + mutex_enter(&vmm_mtx); + hold->vmh_ioport_hook_cnt--; + mutex_exit(&vmm_mtx); +} + +static int +vmm_drv_purge(vmm_softc_t *sc) +{ + ASSERT(MUTEX_HELD(&vmm_mtx)); + + if ((sc->vmm_flags & VMM_HELD) != 0) { + vmm_hold_t *hold; + + sc->vmm_flags |= VMM_CLEANUP; + for (hold = list_head(&sc->vmm_holds); hold != NULL; + hold = list_next(&sc->vmm_holds, hold)) { + hold->vmh_release_req = B_TRUE; + } + while ((sc->vmm_flags & VMM_HELD) != 0) { + if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) { + return (EINTR); + } + } + sc->vmm_flags &= ~VMM_CLEANUP; + } + + VERIFY(list_is_empty(&sc->vmm_holds)); + sc->vmm_flags |= VMM_PURGED; + return (0); +} + +static int +vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block) +{ + int err = 0; + + mutex_enter(&vmm_mtx); + if (!enable_block) { + VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0); + + sc->vmm_flags &= ~VMM_BLOCK_HOOK; + goto done; + } + + /* If any holds have hooks installed, the block is a failure */ + if (!list_is_empty(&sc->vmm_holds)) { + vmm_hold_t *hold; + + for (hold = list_head(&sc->vmm_holds); hold != NULL; + hold = list_next(&sc->vmm_holds, hold)) { + if (hold->vmh_ioport_hook_cnt != 0) { + err = EBUSY; + goto done; + } + } + } + sc->vmm_flags |= VMM_BLOCK_HOOK; + +done: + mutex_exit(&vmm_mtx); + return (err); +} + +static int +vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd) +{ + dev_info_t *pdip = ddi_get_parent(vmmdev_dip); + minor_t minor; + + ASSERT(MUTEX_HELD(&vmm_mtx)); + + if (clean_zsd) { + vmm_zsd_rem_vm(sc); + } + + if (vmm_drv_purge(sc) != 0) { + return (EINTR); + } + + /* Clean up devmem entries */ + vmmdev_devmem_purge(sc); + + list_remove(&vmm_list, sc); + ddi_remove_minor_node(vmmdev_dip, sc->vmm_name); + minor = sc->vmm_minor; + zone_rele(sc->vmm_zone); + if (sc->vmm_is_open) { + list_insert_tail(&vmm_destroy_list, sc); + sc->vmm_flags |= VMM_DESTROY; + } else { + vm_destroy(sc->vmm_vm); + ddi_soft_state_free(vmm_statep, minor); + id_free(vmm_minors, minor); + } + (void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE); + + return (0); +} + +int +vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd) +{ + int err; + + mutex_enter(&vmm_mtx); + err = vmm_do_vm_destroy_locked(sc, clean_zsd); + mutex_exit(&vmm_mtx); + + return (err); +} + +/* ARGSUSED */ +static int +vmmdev_do_vm_destroy(const char *name, cred_t *cr) +{ + vmm_softc_t *sc; + int err; + + if (crgetuid(cr) != 0) + return (EPERM); + + mutex_enter(&vmm_mtx); + + if ((sc = vmm_lookup(name)) == NULL) { + mutex_exit(&vmm_mtx); + return (ENOENT); + } + /* + * We don't check this in vmm_lookup() since 
that function is also used + * for validation during create and currently vmm names must be unique. + */ + if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) { + mutex_exit(&vmm_mtx); + return (EPERM); + } + err = vmm_do_vm_destroy_locked(sc, B_TRUE); + mutex_exit(&vmm_mtx); + + return (err); +} + + +static int +vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp) +{ + minor_t minor; + vmm_softc_t *sc; + + minor = getminor(*devp); + if (minor == VMM_CTL_MINOR) { + /* + * Master control device must be opened exclusively. + */ + if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) { + return (EINVAL); + } + + return (0); + } + + mutex_enter(&vmm_mtx); + sc = ddi_get_soft_state(vmm_statep, minor); + if (sc == NULL) { + mutex_exit(&vmm_mtx); + return (ENXIO); + } + + sc->vmm_is_open = B_TRUE; + mutex_exit(&vmm_mtx); + + return (0); +} + +static int +vmm_close(dev_t dev, int flag, int otyp, cred_t *credp) +{ + minor_t minor; + vmm_softc_t *sc; + + minor = getminor(dev); + if (minor == VMM_CTL_MINOR) + return (0); + + mutex_enter(&vmm_mtx); + sc = ddi_get_soft_state(vmm_statep, minor); + if (sc == NULL) { + mutex_exit(&vmm_mtx); + return (ENXIO); + } + + VERIFY(sc->vmm_is_open); + sc->vmm_is_open = B_FALSE; + + if (sc->vmm_flags & VMM_DESTROY) { + list_remove(&vmm_destroy_list, sc); + vm_destroy(sc->vmm_vm); + ddi_soft_state_free(vmm_statep, minor); + id_free(vmm_minors, minor); + } + mutex_exit(&vmm_mtx); + + return (0); +} + +static int +vmm_is_supported(intptr_t arg) +{ + int r; + const char *msg; + + if (vmm_is_intel()) { + r = vmx_x86_supported(&msg); + } else if (vmm_is_amd()) { + /* + * HMA already ensured that the features necessary for SVM + * operation were present and online during vmm_attach(). + */ + r = 0; + } else { + r = ENXIO; + msg = "Unsupported CPU vendor"; + } + + if (r != 0 && arg != (intptr_t)NULL) { + if (copyoutstr(msg, (char *)arg, strlen(msg), NULL) != 0) + return (EFAULT); + } + return (r); +} + +static int +vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + vmm_softc_t *sc; + minor_t minor; + + minor = getminor(dev); + + if (minor == VMM_CTL_MINOR) { + void *argp = (void *)arg; + char name[VM_MAX_NAMELEN] = { 0 }; + size_t len = 0; + + if ((mode & FKIOCTL) != 0) { + len = strlcpy(name, argp, sizeof (name)); + } else { + if (copyinstr(argp, name, sizeof (name), &len) != 0) { + return (EFAULT); + } + } + if (len >= VM_MAX_NAMELEN) { + return (ENAMETOOLONG); + } + + switch (cmd) { + case VMM_CREATE_VM: + if ((mode & FWRITE) == 0) + return (EPERM); + return (vmmdev_do_vm_create(name, credp)); + case VMM_DESTROY_VM: + if ((mode & FWRITE) == 0) + return (EPERM); + return (vmmdev_do_vm_destroy(name, credp)); + case VMM_VM_SUPPORTED: + return (vmm_is_supported(arg)); + default: + /* No other actions are legal on ctl device */ + return (ENOTTY); + } + } + + sc = ddi_get_soft_state(vmm_statep, minor); + ASSERT(sc); + + if (sc->vmm_flags & VMM_DESTROY) + return (ENXIO); + + return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp)); +} + +static int +vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len, + unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp) +{ + vmm_softc_t *sc; + const minor_t minor = getminor(dev); + struct vm *vm; + int err; + vm_object_t vmo = NULL; + struct vmspace *vms; + + if (minor == VMM_CTL_MINOR) { + return (ENODEV); + } + if (off < 0 || (off + len) <= 0) { + return (EINVAL); + } + if ((prot & PROT_USER) == 0) { + return (EACCES); + } + + sc = 
ddi_get_soft_state(vmm_statep, minor); + ASSERT(sc); + + if (sc->vmm_flags & VMM_DESTROY) + return (ENXIO); + + /* Grab read lock on the VM to prevent any changes to the memory map */ + vmm_read_lock(sc); + + vm = sc->vmm_vm; + vms = vm_get_vmspace(vm); + if (off >= VM_DEVMEM_START) { + int segid; + + /* Mapping a devmem "device" */ + if (!vmmdev_devmem_segid(sc, off, len, &segid)) { + err = ENODEV; + goto out; + } + err = vm_get_memseg(vm, segid, NULL, NULL, &vmo); + if (err != 0) { + goto out; + } + err = vm_segmap_obj(vms, vmo, as, addrp, prot, maxprot, flags); + } else { + /* Mapping a part of the guest physical space */ + err = vm_segmap_space(vms, off, as, addrp, len, prot, maxprot, + flags); + } + + +out: + vmm_read_unlock(sc); + return (err); +} + +static sdev_plugin_validate_t +vmm_sdev_validate(sdev_ctx_t ctx) +{ + const char *name = sdev_ctx_name(ctx); + vmm_softc_t *sc; + sdev_plugin_validate_t ret; + minor_t minor; + + if (sdev_ctx_vtype(ctx) != VCHR) + return (SDEV_VTOR_INVALID); + + VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0); + + mutex_enter(&vmm_mtx); + if ((sc = vmm_lookup(name)) == NULL) + ret = SDEV_VTOR_INVALID; + else if (sc->vmm_minor != minor) + ret = SDEV_VTOR_STALE; + else + ret = SDEV_VTOR_VALID; + mutex_exit(&vmm_mtx); + + return (ret); +} + +static int +vmm_sdev_filldir(sdev_ctx_t ctx) +{ + vmm_softc_t *sc; + int ret; + + if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) { + cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__, + sdev_ctx_path(ctx), VMM_SDEV_ROOT); + return (EINVAL); + } + + mutex_enter(&vmm_mtx); + ASSERT(vmmdev_dip != NULL); + for (sc = list_head(&vmm_list); sc != NULL; + sc = list_next(&vmm_list, sc)) { + if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) { + ret = sdev_plugin_mknod(ctx, sc->vmm_name, + S_IFCHR | 0600, + makedevice(ddi_driver_major(vmmdev_dip), + sc->vmm_minor)); + } else { + continue; + } + if (ret != 0 && ret != EEXIST) + goto out; + } + + ret = 0; + +out: + mutex_exit(&vmm_mtx); + return (ret); +} + +/* ARGSUSED */ +static void +vmm_sdev_inactive(sdev_ctx_t ctx) +{ +} + +static sdev_plugin_ops_t vmm_sdev_ops = { + .spo_version = SDEV_PLUGIN_VERSION, + .spo_flags = SDEV_PLUGIN_SUBDIR, + .spo_validate = vmm_sdev_validate, + .spo_filldir = vmm_sdev_filldir, + .spo_inactive = vmm_sdev_inactive +}; + +/* ARGSUSED */ +static int +vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) +{ + int error; + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)vmmdev_dip; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + break; + } + return (error); +} + +static int +vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + sdev_plugin_hdl_t sph; + hma_reg_t *reg = NULL; + boolean_t vmm_loaded = B_FALSE; + + if (cmd != DDI_ATTACH) { + return (DDI_FAILURE); + } + + mutex_enter(&vmmdev_mtx); + /* Ensure we are not already attached. */ + if (vmmdev_dip != NULL) { + mutex_exit(&vmmdev_mtx); + return (DDI_FAILURE); + } + + vmm_sol_glue_init(); + vmm_arena_init(); + + if ((reg = hma_register(vmmdev_hvm_name)) == NULL) { + goto fail; + } else if (vmm_mod_load() != 0) { + goto fail; + } + vmm_loaded = B_TRUE; + + /* Create control node. Other nodes will be created on demand. 
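As an aside on the control-node handling in vmm_ioctl() above, the following is a hedged userland sketch of the expected calling sequence. It is not part of the patch: the /dev path is an assumption, and VMM_CREATE_VM / VM_MAX_NAMELEN are taken on faith from the vmm ioctl headers (e.g. sys/vmm_dev.h).

#include <sys/types.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/*
 * Illustrative only: mirrors the checks vmm_ioctl() makes for the
 * control minor -- exclusive open, FWRITE, and a NUL-terminated name
 * shorter than VM_MAX_NAMELEN.  The device path is an assumption.
 */
static int
sk_create_vm(const char *name)
{
        char buf[64];
        int ctlfd, err;

        ctlfd = open("/dev/vmm/ctl", O_RDWR | O_EXCL);  /* assumed path */
        if (ctlfd < 0)
                return (-1);

        (void) strlcpy(buf, name, sizeof (buf));
        err = ioctl(ctlfd, VMM_CREATE_VM, buf);
        (void) close(ctlfd);
        return (err);
}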
*/ + if (ddi_create_minor_node(dip, "ctl", S_IFCHR, + VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) { + goto fail; + } + + if ((sph = sdev_plugin_register("vmm", &vmm_sdev_ops, NULL)) == + (sdev_plugin_hdl_t)NULL) { + ddi_remove_minor_node(dip, NULL); + goto fail; + } + + ddi_report_dev(dip); + vmmdev_hma_reg = reg; + vmmdev_sdev_hdl = sph; + vmmdev_dip = dip; + mutex_exit(&vmmdev_mtx); + return (DDI_SUCCESS); + +fail: + if (vmm_loaded) { + VERIFY0(vmm_mod_unload()); + } + if (reg != NULL) { + hma_unregister(reg); + } + vmm_arena_fini(); + vmm_sol_glue_cleanup(); + mutex_exit(&vmmdev_mtx); + return (DDI_FAILURE); +} + +static int +vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) { + return (DDI_FAILURE); + } + + /* + * Ensure that all resources have been cleaned up. + * + * To prevent a deadlock with iommu_cleanup() we'll fail the detach if + * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our + * devinfo locked as iommu_cleanup() tries to recursively lock each + * devinfo, including our own, while holding vmmdev_mtx. + */ + if (mutex_tryenter(&vmmdev_mtx) == 0) + return (DDI_FAILURE); + + mutex_enter(&vmm_mtx); + if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) { + mutex_exit(&vmm_mtx); + mutex_exit(&vmmdev_mtx); + return (DDI_FAILURE); + } + mutex_exit(&vmm_mtx); + + VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL); + if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) { + mutex_exit(&vmmdev_mtx); + return (DDI_FAILURE); + } + vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL; + + /* Remove the control node. */ + ddi_remove_minor_node(dip, "ctl"); + vmmdev_dip = NULL; + + VERIFY0(vmm_mod_unload()); + hma_unregister(vmmdev_hma_reg); + vmmdev_hma_reg = NULL; + vmm_arena_fini(); + vmm_sol_glue_cleanup(); + + mutex_exit(&vmmdev_mtx); + + return (DDI_SUCCESS); +} + +static struct cb_ops vmm_cb_ops = { + vmm_open, + vmm_close, + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + vmm_ioctl, + nodev, /* devmap */ + nodev, /* mmap */ + vmm_segmap, + nochpoll, /* poll */ + ddi_prop_op, + NULL, + D_NEW | D_MP | D_DEVMAP +}; + +static struct dev_ops vmm_ops = { + DEVO_REV, + 0, + vmm_info, + nulldev, /* identify */ + nulldev, /* probe */ + vmm_attach, + vmm_detach, + nodev, /* reset */ + &vmm_cb_ops, + (struct bus_ops *)NULL +}; + +static struct modldrv modldrv = { + &mod_driverops, + "bhyve vmm", + &vmm_ops +}; + +static struct modlinkage modlinkage = { + MODREV_1, + &modldrv, + NULL +}; + +int +_init(void) +{ + int error; + + sysinit(); + + mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL); + mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL); + list_create(&vmm_list, sizeof (vmm_softc_t), + offsetof(vmm_softc_t, vmm_node)); + list_create(&vmm_destroy_list, sizeof (vmm_softc_t), + offsetof(vmm_softc_t, vmm_node)); + vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32); + + error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0); + if (error) { + return (error); + } + + vmm_zsd_init(); + + error = mod_install(&modlinkage); + if (error) { + ddi_soft_state_fini(&vmm_statep); + vmm_zsd_fini(); + } + + return (error); +} + +int +_fini(void) +{ + int error; + + error = mod_remove(&modlinkage); + if (error) { + return (error); + } + + vmm_zsd_fini(); + + ddi_soft_state_fini(&vmm_statep); + + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c 
b/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c new file mode 100644 index 0000000000..c26e763805 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c @@ -0,0 +1,268 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/machsystm.h> + +#include <sys/gipt.h> +#include <vm/vm_glue.h> + + +struct ept_map { + gipt_map_t em_gipt; + uint64_t em_wired_page_count; +}; +typedef struct ept_map ept_map_t; + +#define EPT_LOCK(m) (&(m)->em_gipt.giptm_lock) + +#define EPT_MAX_LEVELS 4 + +CTASSERT(EPT_MAX_LEVELS <= GIPT_MAX_LEVELS); + +#define EPT_R (0x1 << 0) +#define EPT_W (0x1 << 1) +#define EPT_X (0x1 << 2) +#define EPT_RWX (EPT_R | EPT_W | EPT_X) +#define EPT_LGPG (0x1 << 7) + +#define EPT_PA_MASK (0x000ffffffffff000ull) + +CTASSERT(EPT_R == PROT_READ); +CTASSERT(EPT_W == PROT_WRITE); +CTASSERT(EPT_X == PROT_EXEC); + + +#define EPT_PAT(attr) (((attr) & 0x7) << 3) +#define EPT_PADDR(addr) ((addr) & EPT_PA_MASK) + +#define EPT_IS_ABSENT(pte) (((pte) & EPT_RWX) == 0) +#define EPT_PTE_PFN(pte) mmu_btop(EPT_PADDR(pte)) +#define EPT_PTE_PROT(pte) ((pte) & EPT_RWX) +#define EPT_MAPS_PAGE(pte, lvl) \ + (EPT_PTE_PROT(pte) != 0 && (((pte) & EPT_LGPG) != 0 || (lvl) == 0)) + +/* + * Only assign EPT_LGPG for levels higher than 0. Although this bit is defined + * as being ignored at level 0, some versions of VMWare fail to honor this and + * report such a PTE as an EPT mis-configuration. + */ +#define EPT_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr) \ + (EPT_PADDR(pfn_to_pa(pfn)) | \ + (((lvl) != 0) ? 
EPT_LGPG : 0) | \ + EPT_PAT(attr) | ((prot) & EPT_RWX)) +#define EPT_PTE_ASSIGN_TABLE(pfn) (EPT_PADDR(pfn_to_pa(pfn)) | EPT_RWX) + + +static gipt_pte_type_t +ept_pte_type(uint64_t pte, uint_t level) +{ + if (EPT_IS_ABSENT(pte)) { + return (PTET_EMPTY); + } else if (EPT_MAPS_PAGE(pte, level)) { + return (PTET_PAGE); + } else { + return (PTET_LINK); + } +} + +static uint64_t +ept_pte_map(uint64_t pfn) +{ + return (EPT_PTE_ASSIGN_TABLE(pfn)); +} + +static void * +ept_create(uintptr_t *pml4_kaddr) +{ + ept_map_t *emap; + gipt_map_t *map; + gipt_t *root; + struct gipt_cbs cbs = { + .giptc_pte_type = ept_pte_type, + .giptc_pte_map = ept_pte_map, + }; + + emap = kmem_zalloc(sizeof (*emap), KM_SLEEP); + map = &emap->em_gipt; + root = gipt_alloc(); + root->gipt_level = EPT_MAX_LEVELS - 1; + gipt_map_init(map, EPT_MAX_LEVELS, GIPT_HASH_SIZE_DEFAULT, &cbs, root); + + *pml4_kaddr = (uintptr_t)root->gipt_kva; + return (emap); +} + +static void +ept_destroy(void *arg) +{ + ept_map_t *emap = arg; + + if (emap != NULL) { + gipt_map_t *map = &emap->em_gipt; + + gipt_map_fini(map); + kmem_free(emap, sizeof (*emap)); + } +} + +static uint64_t +ept_wired_count(void *arg) +{ + ept_map_t *emap = arg; + uint64_t res; + + mutex_enter(EPT_LOCK(emap)); + res = emap->em_wired_page_count; + mutex_exit(EPT_LOCK(emap)); + + return (res); +} + +static int +ept_is_wired(void *arg, uint64_t va, uint_t *protp) +{ + ept_map_t *emap = arg; + gipt_t *pt; + int rv = -1; + + mutex_enter(EPT_LOCK(emap)); + pt = gipt_map_lookup_deepest(&emap->em_gipt, va); + if (pt != NULL) { + const uint64_t pte = GIPT_VA2PTE(pt, va); + + if (EPT_MAPS_PAGE(pte, pt->gipt_level)) { + *protp = EPT_PTE_PROT(pte); + rv = 0; + } + } + mutex_exit(EPT_LOCK(emap)); + + return (rv); +} + +static int +ept_map(void *arg, uint64_t va, pfn_t pfn, uint_t lvl, uint_t prot, + uint8_t attr) +{ + ept_map_t *emap = arg; + gipt_map_t *map = &emap->em_gipt; + gipt_t *pt; + uint64_t *ptep, pte; + + ASSERT((prot & EPT_RWX) != 0 && (prot & ~EPT_RWX) == 0); + ASSERT3U(lvl, <, EPT_MAX_LEVELS); + + mutex_enter(EPT_LOCK(emap)); + pt = gipt_map_lookup(map, va, lvl); + if (pt == NULL) { + /* + * A table at the appropriate VA/level that would house this + * mapping does not currently exist. Try to walk down to that + * point, creating any necessary parent(s). + */ + pt = gipt_map_create_parents(map, va, lvl); + + /* + * There was a large page mapping in the way of creating the + * necessary parent table(s). 
+ */ + if (pt == NULL) { + panic("unexpected large page @ %08lx", va); + } + } + ptep = GIPT_VA2PTEP(pt, va); + + pte = *ptep; + if (!EPT_IS_ABSENT(pte)) { + if (!EPT_MAPS_PAGE(pte, lvl)) { + panic("unexpected PT link @ %08lx in %p", va, pt); + } else { + panic("unexpected page mapped @ %08lx in %p", va, pt); + } + } + + pte = EPT_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr); + *ptep = pte; + pt->gipt_valid_cnt++; + emap->em_wired_page_count += gipt_level_count[lvl]; + + mutex_exit(EPT_LOCK(emap)); + return (0); +} + +static uint64_t +ept_unmap(void *arg, uint64_t va, uint64_t end_va) +{ + ept_map_t *emap = arg; + gipt_map_t *map = &emap->em_gipt; + gipt_t *pt; + uint64_t cur_va = va; + uint64_t unmapped = 0; + + mutex_enter(EPT_LOCK(emap)); + + pt = gipt_map_lookup_deepest(map, cur_va); + if (pt == NULL) { + mutex_exit(EPT_LOCK(emap)); + return (0); + } + if (!EPT_MAPS_PAGE(GIPT_VA2PTE(pt, cur_va), pt->gipt_level)) { + cur_va = gipt_map_next_page(map, cur_va, end_va, &pt); + if (cur_va == 0) { + mutex_exit(EPT_LOCK(emap)); + return (0); + } + } + + while (cur_va < end_va) { + uint64_t *ptep = GIPT_VA2PTEP(pt, cur_va); + const uint_t lvl = pt->gipt_level; + + ASSERT(EPT_MAPS_PAGE(*ptep, lvl)); + *ptep = 0; + pt->gipt_valid_cnt--; + unmapped += gipt_level_count[pt->gipt_level]; + + gipt_t *next_pt = pt; + uint64_t next_va; + next_va = gipt_map_next_page(map, cur_va, end_va, &next_pt); + + if (pt->gipt_valid_cnt == 0) { + gipt_map_clean_parents(map, pt); + } + if (next_va == 0) { + break; + } + pt = next_pt; + cur_va = next_va; + } + emap->em_wired_page_count -= unmapped; + + mutex_exit(EPT_LOCK(emap)); + + return (unmapped); +} + +struct vmm_pt_ops ept_ops = { + .vpo_init = ept_create, + .vpo_free = ept_destroy, + .vpo_wired_cnt = ept_wired_count, + .vpo_is_wired = ept_is_wired, + .vpo_map = ept_map, + .vpo_unmap = ept_unmap, +}; diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c new file mode 100644 index 0000000000..c8d5aa24e9 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c @@ -0,0 +1,703 @@ +/* + * Copyright (c) 2004 Poul-Henning Kamp + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
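To make the EPT leaf encoding from vmm_sol_ept.c above concrete, here is a standalone sketch of the same bit layout with a worked value. The sk_ names are local stand-ins for the EPT_* macros; this is illustrative only and not part of the patch.

#include <assert.h>
#include <stdint.h>

#define SK_EPT_RWX      0x7ULL                  /* R | W | X */
#define SK_EPT_LGPG     (1ULL << 7)
#define SK_EPT_PA_MASK  0x000ffffffffff000ULL
#define SK_EPT_PAT(a)   (((uint64_t)(a) & 0x7) << 3)
#define SK_MTRR_WB      6                       /* write-back memory type */

static uint64_t
sk_ept_leaf(uint64_t pfn, unsigned lvl, unsigned prot, uint8_t attr)
{
        uint64_t pte = (pfn << 12) & SK_EPT_PA_MASK;

        if (lvl != 0)
                pte |= SK_EPT_LGPG;     /* large-page bit only above level 0 */
        return (pte | SK_EPT_PAT(attr) | (prot & SK_EPT_RWX));
}

int
main(void)
{
        /* pfn 0x1234, 4K page, read+write, write-back memory type */
        assert(sk_ept_leaf(0x1234, 0, 0x3, SK_MTRR_WB) == 0x1234033ULL);
        return (0);
}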
+ * + * $FreeBSD: head/sys/kern/subr_unit.c 255057 2013-08-30 07:37:45Z kib $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/archsystm.h> +#include <sys/cpuset.h> +#include <sys/fp.h> +#include <sys/malloc.h> +#include <sys/queue.h> +#include <sys/spl.h> +#include <sys/systm.h> +#include <sys/ddidmareq.h> +#include <sys/id_space.h> +#include <sys/psm_defs.h> +#include <sys/smp_impldefs.h> +#include <sys/modhash.h> +#include <sys/hma.h> + +#include <sys/x86_archext.h> + +#include <machine/cpufunc.h> +#include <machine/fpu.h> +#include <machine/md_var.h> +#include <machine/pmap.h> +#include <machine/specialreg.h> +#include <machine/vmm.h> +#include <sys/vmm_impl.h> +#include <sys/kernel.h> + +#include <vm/as.h> +#include <vm/seg_kmem.h> + +SET_DECLARE(sysinit_set, struct sysinit); + +void +sysinit(void) +{ + struct sysinit **si; + + SET_FOREACH(si, sysinit_set) + (*si)->func((*si)->data); +} + +u_char const bin2bcd_data[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99 +}; + +void +pmap_invalidate_cache(void) +{ + cpuset_t cpuset; + + kpreempt_disable(); + cpuset_all_but(&cpuset, CPU->cpu_id); + xc_call((xc_arg_t)NULL, (xc_arg_t)NULL, (xc_arg_t)NULL, + CPUSET2BV(cpuset), (xc_func_t)invalidate_cache); + invalidate_cache(); + kpreempt_enable(); +} + +vm_paddr_t +pmap_kextract(vm_offset_t va) +{ + pfn_t pfn; + + /* + * Since hat_getpfnum() may block on an htable mutex, this is not at + * all safe to run from a critical_enter/kpreempt_disable context. + * The FreeBSD analog does not have the same locking constraints, so + * close attention must be paid wherever this is called. + */ + ASSERT(curthread->t_preempt == 0); + + pfn = hat_getpfnum(kas.a_hat, (caddr_t)va); + ASSERT(pfn != PFN_INVALID); + return (pfn << PAGE_SHIFT) | ((uintptr_t)va & PAGE_MASK); +} + +int +cpusetobj_ffs(const cpuset_t *set) +{ + uint_t large, small; + + /* + * Rather than reaching into the cpuset_t ourselves, leave that task to + * cpuset_bounds(). The simplicity is worth the extra wasted work to + * find the upper bound. 
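For context, the bin2bcd_data table above packs a binary value 0-99 into a BCD byte, as the virtual RTC expects. The wrapper below mirrors FreeBSD's bin2bcd() macro and is shown purely as an illustration of the consumer side; it is an assumption, not part of the patch.

#define sk_bin2bcd(x)   (bin2bcd_data[(x)])     /* illustrative wrapper */

static void
sk_bcd_check(void)
{
        ASSERT3U(sk_bin2bcd(9), ==, 0x09);
        ASSERT3U(sk_bin2bcd(59), ==, 0x59);     /* e.g. an RTC seconds field */
}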
+ */ + cpuset_bounds(set, &small, &large); + + if (small == CPUSET_NOTINSET) { + /* The FreeBSD version returns 0 if it find nothing */ + return (0); + } + + ASSERT3U(small, <=, INT_MAX); + + /* Least significant bit index starts at 1 for valid results */ + return (small + 1); +} + +struct kmem_item { + void *addr; + size_t size; +}; +static kmutex_t kmem_items_lock; + +static mod_hash_t *vmm_alloc_hash; +uint_t vmm_alloc_hash_nchains = 16381; +uint_t vmm_alloc_hash_size = PAGESIZE; + +static void +vmm_alloc_hash_valdtor(mod_hash_val_t val) +{ + struct kmem_item *i = (struct kmem_item *)val; + + kmem_free(i->addr, i->size); + kmem_free(i, sizeof (struct kmem_item)); +} + +static void +vmm_alloc_init(void) +{ + vmm_alloc_hash = mod_hash_create_ptrhash("vmm_alloc_hash", + vmm_alloc_hash_nchains, vmm_alloc_hash_valdtor, + vmm_alloc_hash_size); + + VERIFY(vmm_alloc_hash != NULL); +} + +static uint_t +vmm_alloc_check(mod_hash_key_t key, mod_hash_val_t *val, void *unused) +{ + struct kmem_item *i = (struct kmem_item *)val; + + cmn_err(CE_PANIC, "!vmm_alloc_check: hash not empty: %p, %d", i->addr, + i->size); + + return (MH_WALK_TERMINATE); +} + +static void +vmm_alloc_cleanup(void) +{ + mod_hash_walk(vmm_alloc_hash, vmm_alloc_check, NULL); + mod_hash_destroy_ptrhash(vmm_alloc_hash); +} + +void * +malloc(unsigned long size, struct malloc_type *mtp, int flags) +{ + void *p; + struct kmem_item *i; + int kmem_flag = KM_SLEEP; + + if (flags & M_NOWAIT) + kmem_flag = KM_NOSLEEP; + + if (flags & M_ZERO) { + p = kmem_zalloc(size, kmem_flag); + } else { + p = kmem_alloc(size, kmem_flag); + } + + if (p == NULL) + return (NULL); + + i = kmem_zalloc(sizeof (struct kmem_item), kmem_flag); + + if (i == NULL) { + kmem_free(p, size); + return (NULL); + } + + mutex_enter(&kmem_items_lock); + i->addr = p; + i->size = size; + + VERIFY(mod_hash_insert(vmm_alloc_hash, + (mod_hash_key_t)PHYS_TO_DMAP(vtophys(p)), (mod_hash_val_t)i) == 0); + + mutex_exit(&kmem_items_lock); + + return (p); +} + +void +free(void *addr, struct malloc_type *mtp) +{ + mutex_enter(&kmem_items_lock); + VERIFY(mod_hash_destroy(vmm_alloc_hash, + (mod_hash_key_t)PHYS_TO_DMAP(vtophys(addr))) == 0); + mutex_exit(&kmem_items_lock); +} + +extern void *contig_alloc(size_t, ddi_dma_attr_t *, uintptr_t, int); +extern void contig_free(void *, size_t); + +void * +contigmalloc(unsigned long size, struct malloc_type *type, int flags, + vm_paddr_t low, vm_paddr_t high, unsigned long alignment, + vm_paddr_t boundary) +{ + ddi_dma_attr_t attr = { + /* Using fastboot_dma_attr as a guide... */ + DMA_ATTR_V0, + low, /* dma_attr_addr_lo */ + high, /* dma_attr_addr_hi */ + 0x00000000FFFFFFFFULL, /* dma_attr_count_max */ + alignment, /* dma_attr_align */ + 1, /* dma_attr_burstsize */ + 1, /* dma_attr_minxfer */ + 0x00000000FFFFFFFFULL, /* dma_attr_maxxfer */ + 0x00000000FFFFFFFFULL, /* dma_attr_seg: any */ + 1, /* dma_attr_sgllen */ + alignment, /* dma_attr_granular */ + 0, /* dma_attr_flags */ + }; + int cansleep = (flags & M_WAITOK); + void *result; + + ASSERT(alignment == PAGESIZE); + + result = contig_alloc((size_t)size, &attr, alignment, cansleep); + + if (result != NULL && (flags & M_ZERO) != 0) { + bzero(result, size); + } + return (result); +} + +void +contigfree(void *addr, unsigned long size, struct malloc_type *type) +{ + contig_free(addr, size); +} + +void +mtx_init(struct mtx *mtx, char *name, const char *type_name, int opts) +{ + /* + * Requests that a mutex be initialized to the MTX_SPIN type are + * ignored. 
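A brief usage note on the malloc(9)/free(9) shims above: free(9) carries no size argument, so the tracking hash is what lets the glue hand the correct length back to kmem_free(). The sketch below is illustrative only; M_VMM is a placeholder malloc type.

static void
sk_malloc_example(void)
{
        struct sk_foo { int a, b; } *fp;

        /* M_ZERO selects kmem_zalloc(); M_NOWAIT would map to KM_NOSLEEP */
        fp = malloc(sizeof (*fp), M_VMM, M_WAITOK | M_ZERO);

        /* ... use fp; the size is recovered from the hash on release ... */
        free(fp, M_VMM);
}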
The limitations which may have required spinlocks on + * FreeBSD do not apply to how bhyve has been structured here. + * + * Adaptive mutexes are required to avoid deadlocks when certain + * cyclics behavior interacts with interrupts and contended locks. + */ + mutex_init(&mtx->m, name, MUTEX_ADAPTIVE, NULL); +} + +void +mtx_destroy(struct mtx *mtx) +{ + mutex_destroy(&mtx->m); +} + +void +critical_enter(void) +{ + kpreempt_disable(); +} + +void +critical_exit(void) +{ + kpreempt_enable(); +} + + +static void +vmm_glue_callout_handler(void *arg) +{ + struct callout *c = arg; + + c->c_flags &= ~CALLOUT_PENDING; + if (c->c_flags & CALLOUT_ACTIVE) { + (c->c_func)(c->c_arg); + } +} + +void +vmm_glue_callout_init(struct callout *c, int mpsafe) +{ + cyc_handler_t hdlr; + cyc_time_t when; + + hdlr.cyh_level = CY_LOW_LEVEL; + hdlr.cyh_func = vmm_glue_callout_handler; + hdlr.cyh_arg = c; + when.cyt_when = CY_INFINITY; + when.cyt_interval = CY_INFINITY; + + mutex_enter(&cpu_lock); +#if 0 + /* + * XXXJOY: according to the freebsd sources, callouts do not begin + * their life in the ACTIVE state. + */ + c->c_flags |= CALLOUT_ACTIVE; +#else + bzero(c, sizeof (*c)); +#endif + c->c_cyc_id = cyclic_add(&hdlr, &when); + mutex_exit(&cpu_lock); +} + +static __inline hrtime_t +sbttohrtime(sbintime_t sbt) +{ + return (((sbt >> 32) * NANOSEC) + + (((uint64_t)NANOSEC * (uint32_t)sbt) >> 32)); +} + +int +vmm_glue_callout_reset_sbt(struct callout *c, sbintime_t sbt, sbintime_t pr, + void (*func)(void *), void *arg, int flags) +{ + hrtime_t target = sbttohrtime(sbt); + + ASSERT(c->c_cyc_id != CYCLIC_NONE); + + c->c_func = func; + c->c_arg = arg; + c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING); + + if (flags & C_ABSOLUTE) { + cyclic_reprogram(c->c_cyc_id, target); + } else { + cyclic_reprogram(c->c_cyc_id, target + gethrtime()); + } + + return (0); +} + +int +vmm_glue_callout_stop(struct callout *c) +{ + ASSERT(c->c_cyc_id != CYCLIC_NONE); + cyclic_reprogram(c->c_cyc_id, CY_INFINITY); + c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); + + return (0); +} + +int +vmm_glue_callout_drain(struct callout *c) +{ + ASSERT(c->c_cyc_id != CYCLIC_NONE); + mutex_enter(&cpu_lock); + cyclic_remove(c->c_cyc_id); + c->c_cyc_id = CYCLIC_NONE; + c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); + mutex_exit(&cpu_lock); + + return (0); +} + +void +vmm_glue_callout_localize(struct callout *c) +{ + mutex_enter(&cpu_lock); + cyclic_move_here(c->c_cyc_id); + mutex_exit(&cpu_lock); +} + +void +ipi_cpu(int cpu, u_int ipi) +{ + /* + * This was previously implemented as an invocation of asynchronous + * no-op crosscalls to interrupt the target CPU. Since even nowait + * crosscalls can block in certain circumstances, a direct poke_cpu() + * is safer when called from delicate contexts. + */ + poke_cpu(cpu); +} + +u_int cpu_high; /* Highest arg to CPUID */ +u_int cpu_exthigh; /* Highest arg to extended CPUID */ +u_int cpu_id; /* Stepping ID */ +char cpu_vendor[20]; /* CPU Origin code */ + +static void +vmm_cpuid_init(void) +{ + u_int regs[4]; + + do_cpuid(0, regs); + cpu_high = regs[0]; + ((u_int *)&cpu_vendor)[0] = regs[1]; + ((u_int *)&cpu_vendor)[1] = regs[3]; + ((u_int *)&cpu_vendor)[2] = regs[2]; + cpu_vendor[12] = '\0'; + + do_cpuid(1, regs); + cpu_id = regs[0]; + + do_cpuid(0x80000000, regs); + cpu_exthigh = regs[0]; +} + +/* + * FreeBSD uses the struct savefpu for managing the FPU state. That is mimicked + * by our hypervisor multiplexor framework structure. 
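Because the callout glue above reprograms cyclics in hrtime terms, a worked example of the 32.32 fixed-point conversion in sbttohrtime() may help. This is a standalone restatement of the same arithmetic, not part of the patch.

#include <assert.h>
#include <stdint.h>

#define SK_NANOSEC      1000000000LL

static int64_t
sk_sbt_to_ns(int64_t sbt)
{
        /* whole seconds scaled to ns, plus the fraction scaled by ns/2^32 */
        return (((sbt >> 32) * SK_NANOSEC) +
            (((uint64_t)SK_NANOSEC * (uint32_t)sbt) >> 32));
}

int
main(void)
{
        /* 3.5 seconds expressed in 32.32 fixed point */
        int64_t sbt = (3LL << 32) | 0x80000000LL;

        assert(sk_sbt_to_ns(sbt) == 3500000000LL);
        return (0);
}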
+ */ +struct savefpu * +fpu_save_area_alloc(void) +{ + return ((struct savefpu *)hma_fpu_alloc(KM_SLEEP)); +} + +void +fpu_save_area_free(struct savefpu *fsa) +{ + hma_fpu_t *fpu = (hma_fpu_t *)fsa; + hma_fpu_free(fpu); +} + +void +fpu_save_area_reset(struct savefpu *fsa) +{ + hma_fpu_t *fpu = (hma_fpu_t *)fsa; + hma_fpu_init(fpu); +} + +/* + * This glue function is supposed to save the host's FPU state. This is always + * paired in the general bhyve code with a call to fpusave. Therefore, we treat + * this as a nop and do all the work in fpusave(), which will have the context + * argument that we want anyways. + */ +void +fpuexit(kthread_t *td) +{ +} + +/* + * This glue function is supposed to restore the guest's FPU state from the save + * area back to the host. In FreeBSD, it is assumed that the host state has + * already been saved by a call to fpuexit(); however, we do both here. + */ +void +fpurestore(void *arg) +{ + hma_fpu_t *fpu = arg; + + hma_fpu_start_guest(fpu); +} + +/* + * This glue function is supposed to save the guest's FPU state. The host's FPU + * state is not expected to be restored necessarily due to the use of FPU + * emulation through CR0.TS. However, we can and do restore it here. + */ +void +fpusave(void *arg) +{ + hma_fpu_t *fpu = arg; + + hma_fpu_stop_guest(fpu); +} + +void +vmm_sol_glue_init(void) +{ + vmm_alloc_init(); + vmm_cpuid_init(); +} + +void +vmm_sol_glue_cleanup(void) +{ + vmm_alloc_cleanup(); +} + + +/* From FreeBSD's sys/kern/subr_clock.c */ + +/*- + * Copyright (c) 1988 University of Utah. + * Copyright (c) 1982, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
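The comments above describe how the FPU glue is expected to be paired around guest entry and exit; the sketch below spells that ordering out. It is illustrative only, and the vcpu structure and run loop are placeholders rather than the actual bhyve code.

/*
 * Illustrative only: expected pairing of the FPU glue around one
 * guest entry/exit cycle.
 */
struct sk_vcpu {
        struct savefpu *guestfpu;       /* from fpu_save_area_alloc() */
};

static void
sk_vcpu_fpu_cycle(struct sk_vcpu *vcpu)
{
        fpuexit(curthread);             /* host save (a nop here; see above) */
        fpurestore(vcpu->guestfpu);     /* load guest FPU state onto the CPU */

        /* ... enter the guest, run until the host needs the FPU back ... */

        fpusave(vcpu->guestfpu);        /* save guest state, restore host */
}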
+ * + * from: Utah $Hdr: clock.c 1.18 91/01/21$ + * from: @(#)clock.c 8.2 (Berkeley) 1/12/94 + * from: NetBSD: clock_subr.c,v 1.6 2001/07/07 17:04:02 thorpej Exp + * and + * from: src/sys/i386/isa/clock.c,v 1.176 2001/09/04 + */ + +#include <sys/clock.h> + +/*--------------------------------------------------------------------* + * Generic routines to convert between a POSIX date + * (seconds since 1/1/1970) and yr/mo/day/hr/min/sec + * Derived from NetBSD arch/hp300/hp300/clock.c + */ + +#define FEBRUARY 2 +#define days_in_year(y) (leapyear(y) ? 366 : 365) +#define days_in_month(y, m) \ + (month_days[(m) - 1] + (m == FEBRUARY ? leapyear(y) : 0)) +/* Day of week. Days are counted from 1/1/1970, which was a Thursday */ +#define day_of_week(days) (((days) + 4) % 7) + +static const int month_days[12] = { + 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 +}; + + +/* + * This inline avoids some unnecessary modulo operations + * as compared with the usual macro: + * ( ((year % 4) == 0 && + * (year % 100) != 0) || + * ((year % 400) == 0) ) + * It is otherwise equivalent. + */ +static int +leapyear(int year) +{ + int rv = 0; + + if ((year & 3) == 0) { + rv = 1; + if ((year % 100) == 0) { + rv = 0; + if ((year % 400) == 0) + rv = 1; + } + } + return (rv); +} + +int +clock_ct_to_ts(struct clocktime *ct, struct timespec *ts) +{ + int i, year, days; + + year = ct->year; + +#ifdef __FreeBSD__ + if (ct_debug) { + printf("ct_to_ts("); + print_ct(ct); + printf(")"); + } +#endif + + /* Sanity checks. */ + if (ct->mon < 1 || ct->mon > 12 || ct->day < 1 || + ct->day > days_in_month(year, ct->mon) || + ct->hour > 23 || ct->min > 59 || ct->sec > 59 || + (sizeof(time_t) == 4 && year > 2037)) { /* time_t overflow */ +#ifdef __FreeBSD__ + if (ct_debug) + printf(" = EINVAL\n"); +#endif + return (EINVAL); + } + + /* + * Compute days since start of time + * First from years, then from months. + */ + days = 0; + for (i = POSIX_BASE_YEAR; i < year; i++) + days += days_in_year(i); + + /* Months */ + for (i = 1; i < ct->mon; i++) + days += days_in_month(year, i); + days += (ct->day - 1); + + ts->tv_sec = (((time_t)days * 24 + ct->hour) * 60 + ct->min) * 60 + + ct->sec; + ts->tv_nsec = ct->nsec; + +#ifdef __FreeBSD__ + if (ct_debug) + printf(" = %ld.%09ld\n", (long)ts->tv_sec, (long)ts->tv_nsec); +#endif + return (0); +} + +void +clock_ts_to_ct(struct timespec *ts, struct clocktime *ct) +{ + int i, year, days; + time_t rsec; /* remainder seconds */ + time_t secs; + + secs = ts->tv_sec; + days = secs / SECDAY; + rsec = secs % SECDAY; + + ct->dow = day_of_week(days); + + /* Subtract out whole years, counting them in i. */ + for (year = POSIX_BASE_YEAR; days >= days_in_year(year); year++) + days -= days_in_year(year); + ct->year = year; + + /* Subtract out whole months, counting them in i. */ + for (i = 1; days >= days_in_month(year, i); i++) + days -= days_in_month(year, i); + ct->mon = i; + + /* Days are what is left over (+1) from all that. 
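A worked value is a useful check on the day arithmetic in clock_ct_to_ts(): 2000-01-01T00:00:00Z is 946684800 seconds after the POSIX epoch (30 years of days plus 7 leap days, times 86400). The snippet below is illustrative only and reuses the VERIFY macros already present in this code.

static void
sk_clock_check(void)
{
        struct clocktime ct = {
                .year = 2000, .mon = 1, .day = 1,
                .hour = 0, .min = 0, .sec = 0, .nsec = 0,
        };
        struct timespec ts;

        VERIFY0(clock_ct_to_ts(&ct, &ts));
        VERIFY3S(ts.tv_sec, ==, 946684800);
}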
*/ + ct->day = days + 1; + + /* Hours, minutes, seconds are easy */ + ct->hour = rsec / 3600; + rsec = rsec % 3600; + ct->min = rsec / 60; + rsec = rsec % 60; + ct->sec = rsec; + ct->nsec = ts->tv_nsec; +#ifdef __FreeBSD__ + if (ct_debug) { + printf("ts_to_ct(%ld.%09ld) = ", + (long)ts->tv_sec, (long)ts->tv_nsec); + print_ct(ct); + printf("\n"); + } +#endif +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c new file mode 100644 index 0000000000..d630d32630 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c @@ -0,0 +1,297 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/machsystm.h> +#include <sys/x86_archext.h> + +#include <sys/gipt.h> +#include <vm/vm_glue.h> + + +struct rvi_map { + gipt_map_t rm_gipt; + uint64_t rm_wired_page_count; +}; +typedef struct rvi_map rvi_map_t; + +#define RVI_LOCK(m) (&(m)->rm_gipt.giptm_lock) + +#define RVI_MAX_LEVELS 4 + +CTASSERT(RVI_MAX_LEVELS <= GIPT_MAX_LEVELS); + +#define RVI_PRESENT PT_VALID +#define RVI_WRITABLE PT_WRITABLE +#define RVI_ACCESSED PT_REF +#define RVI_DIRTY PT_MOD +#define RVI_LGPG PT_PAGESIZE +#define RVI_NX PT_NX +#define RVI_USER PT_USER +#define RVI_PWT PT_WRITETHRU +#define RVI_PCD PT_NOCACHE + +#define RVI_PA_MASK PT_PADDR + +#define RVI_PAT(attr) rvi_attr_to_pat(attr) +#define RVI_PADDR(addr) ((addr) & RVI_PA_MASK) +#define RVI_PROT(prot) \ + ((((prot) & PROT_WRITE) != 0 ? RVI_WRITABLE : 0) | \ + (((prot) & PROT_EXEC) == 0 ? RVI_NX : 0)) + +#define RVI_IS_ABSENT(pte) (((pte) & RVI_PRESENT) == 0) +#define RVI_PTE_PFN(pte) mmu_btop(RVI_PADDR(pte)) +#define RVI_MAPS_PAGE(pte, lvl) \ + (!RVI_IS_ABSENT(pte) && (((pte) & RVI_LGPG) != 0 || (lvl) == 0)) +#define RVI_PTE_PROT(pte) \ + (RVI_IS_ABSENT(pte) ? 0 : ( \ + PROT_READ | \ + (((pte) & RVI_NX) == 0 ? PROT_EXEC : 0) | \ + (((pte) & RVI_WRITABLE) != 0 ? PROT_WRITE : 0))) + +#define RVI_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr) \ + (RVI_PADDR(pfn_to_pa(pfn)) | \ + (((lvl) != 0) ? 
RVI_LGPG : 0) | \ + RVI_USER | RVI_ACCESSED | RVI_PRESENT | \ + RVI_PAT(attr) | \ + RVI_PROT(prot)) + +#define RVI_PTE_ASSIGN_TABLE(pfn) \ + (RVI_PADDR(pfn_to_pa(pfn)) | \ + RVI_USER | RVI_ACCESSED | RVI_PRESENT | \ + RVI_PAT(MTRR_TYPE_WB) | \ + RVI_PROT(PROT_READ | PROT_WRITE | PROT_EXEC)) + + +/* Make sure that PAT indexes line up as expected */ +CTASSERT((PAT_DEFAULT_ATTRIBUTE & 0xf) == MTRR_TYPE_WB); +CTASSERT(((PAT_DEFAULT_ATTRIBUTE >> 24) & 0xf) == MTRR_TYPE_UC); + +static inline uint64_t +rvi_attr_to_pat(const uint8_t attr) +{ + if (attr == MTRR_TYPE_UC) { + /* !PAT + PCD + PWT -> PAT3 -> MTRR_TYPE_UC */ + return (RVI_PCD|RVI_PWT); + } else if (attr == MTRR_TYPE_WB) { + /* !PAT + !PCD + !PWT -> PAT0 -> MTRR_TYPE_WB */ + return (0); + } + + panic("unexpected memattr %x", attr); + return (0); +} + +static gipt_pte_type_t +rvi_pte_type(uint64_t pte, uint_t level) +{ + if (RVI_IS_ABSENT(pte)) { + return (PTET_EMPTY); + } else if (RVI_MAPS_PAGE(pte, level)) { + return (PTET_PAGE); + } else { + return (PTET_LINK); + } +} + +static uint64_t +rvi_pte_map(uint64_t pfn) +{ + return (RVI_PTE_ASSIGN_TABLE(pfn)); +} + +static void * +rvi_create(uintptr_t *pml4_kaddr) +{ + rvi_map_t *rmap; + gipt_map_t *map; + gipt_t *root; + struct gipt_cbs cbs = { + .giptc_pte_type = rvi_pte_type, + .giptc_pte_map = rvi_pte_map, + }; + + rmap = kmem_zalloc(sizeof (*rmap), KM_SLEEP); + map = &rmap->rm_gipt; + root = gipt_alloc(); + root->gipt_level = RVI_MAX_LEVELS - 1; + gipt_map_init(map, RVI_MAX_LEVELS, GIPT_HASH_SIZE_DEFAULT, &cbs, root); + + *pml4_kaddr = (uintptr_t)root->gipt_kva; + return (rmap); +} + +static void +rvi_destroy(void *arg) +{ + rvi_map_t *rmap = arg; + + if (rmap != NULL) { + gipt_map_t *map = &rmap->rm_gipt; + + gipt_map_fini(map); + kmem_free(rmap, sizeof (*rmap)); + } +} + +static uint64_t +rvi_wired_count(void *arg) +{ + rvi_map_t *rmap = arg; + uint64_t res; + + mutex_enter(RVI_LOCK(rmap)); + res = rmap->rm_wired_page_count; + mutex_exit(RVI_LOCK(rmap)); + + return (res); +} + +static int +rvi_is_wired(void *arg, uint64_t va, uint_t *protp) +{ + rvi_map_t *rmap = arg; + gipt_t *pt; + int rv = -1; + + mutex_enter(RVI_LOCK(rmap)); + pt = gipt_map_lookup_deepest(&rmap->rm_gipt, va); + if (pt != NULL) { + const uint64_t pte = GIPT_VA2PTE(pt, va); + + if (RVI_MAPS_PAGE(pte, pt->gipt_level)) { + *protp = RVI_PTE_PROT(pte); + rv = 0; + } + } + mutex_exit(RVI_LOCK(rmap)); + + return (rv); +} + +static int +rvi_map(void *arg, uint64_t va, pfn_t pfn, uint_t lvl, uint_t prot, + uint8_t attr) +{ + rvi_map_t *rmap = arg; + gipt_map_t *map = &rmap->rm_gipt; + gipt_t *pt; + uint64_t *ptep, pte; + + ASSERT((prot & PROT_READ) != 0); + ASSERT3U((prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)), ==, 0); + ASSERT3U(lvl, <, RVI_MAX_LEVELS); + + mutex_enter(RVI_LOCK(rmap)); + pt = gipt_map_lookup(map, va, lvl); + if (pt == NULL) { + /* + * A table at the appropriate VA/level that would house this + * mapping does not currently exist. Try to walk down to that + * point, creating any necessary parent(s). + */ + pt = gipt_map_create_parents(map, va, lvl); + + /* + * There was a large page mapping in the way of creating the + * necessary parent table(s). 
+ */ + if (pt == NULL) { + panic("unexpected large page @ %08lx", va); + } + } + ptep = GIPT_VA2PTEP(pt, va); + + pte = *ptep; + if (!RVI_IS_ABSENT(pte)) { + if (!RVI_MAPS_PAGE(pte, lvl)) { + panic("unexpected PT link @ %08lx in %p", va, pt); + } else { + panic("unexpected page mapped @ %08lx in %p", va, pt); + } + } + + pte = RVI_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr); + *ptep = pte; + pt->gipt_valid_cnt++; + rmap->rm_wired_page_count += gipt_level_count[lvl]; + + mutex_exit(RVI_LOCK(rmap)); + return (0); +} + +static uint64_t +rvi_unmap(void *arg, uint64_t va, uint64_t end_va) +{ + rvi_map_t *rmap = arg; + gipt_map_t *map = &rmap->rm_gipt; + gipt_t *pt; + uint64_t cur_va = va; + uint64_t unmapped = 0; + + mutex_enter(RVI_LOCK(rmap)); + + pt = gipt_map_lookup_deepest(map, cur_va); + if (pt == NULL) { + mutex_exit(RVI_LOCK(rmap)); + return (0); + } + if (!RVI_MAPS_PAGE(GIPT_VA2PTE(pt, cur_va), pt->gipt_level)) { + cur_va = gipt_map_next_page(map, cur_va, end_va, &pt); + if (cur_va == 0) { + mutex_exit(RVI_LOCK(rmap)); + return (0); + } + } + + while (cur_va < end_va) { + uint64_t *ptep = GIPT_VA2PTEP(pt, cur_va); + const uint_t lvl = pt->gipt_level; + + ASSERT(RVI_MAPS_PAGE(*ptep, lvl)); + *ptep = 0; + pt->gipt_valid_cnt--; + unmapped += gipt_level_count[pt->gipt_level]; + + gipt_t *next_pt = pt; + uint64_t next_va; + next_va = gipt_map_next_page(map, cur_va, end_va, &next_pt); + + if (pt->gipt_valid_cnt == 0) { + gipt_map_clean_parents(map, pt); + } + if (next_va == 0) { + break; + } + pt = next_pt; + cur_va = next_va; + } + rmap->rm_wired_page_count -= unmapped; + + mutex_exit(RVI_LOCK(rmap)); + + return (unmapped); +} + +struct vmm_pt_ops rvi_ops = { + .vpo_init = rvi_create, + .vpo_free = rvi_destroy, + .vpo_wired_cnt = rvi_wired_count, + .vpo_is_wired = rvi_is_wired, + .vpo_map = rvi_map, + .vpo_unmap = rvi_unmap, +}; diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c new file mode 100644 index 0000000000..66a67d9529 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c @@ -0,0 +1,1016 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. 
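Both ept_ops and rvi_ops above fill in the same vmm_pt_ops contract. Purely as an illustration of that interface's shape (names are placeholders, not part of the patch), a do-nothing backend would look like this:

static void *
sk_null_create(uintptr_t *root_kaddr)
{
        *root_kaddr = 0;        /* would be the root table's KVA */
        return (NULL);          /* opaque per-map handle */
}

static void
sk_null_destroy(void *arg)
{
}

static uint64_t
sk_null_wired_count(void *arg)
{
        return (0);
}

static int
sk_null_is_wired(void *arg, uint64_t va, uint_t *protp)
{
        return (-1);            /* nothing mapped */
}

static int
sk_null_map(void *arg, uint64_t va, pfn_t pfn, uint_t lvl, uint_t prot,
    uint8_t attr)
{
        return (0);
}

static uint64_t
sk_null_unmap(void *arg, uint64_t va, uint64_t end_va)
{
        return (0);             /* count of pages unmapped */
}

struct vmm_pt_ops sk_null_ops = {
        .vpo_init = sk_null_create,
        .vpo_free = sk_null_destroy,
        .vpo_wired_cnt = sk_null_wired_count,
        .vpo_is_wired = sk_null_is_wired,
        .vpo_map = sk_null_map,
        .vpo_unmap = sk_null_unmap,
};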
+ */ + +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/thread.h> +#include <sys/list.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/ddi.h> +#include <sys/sysmacros.h> +#include <sys/machsystm.h> +#include <sys/vmsystm.h> +#include <sys/malloc.h> +#include <sys/x86_archext.h> +#include <vm/as.h> +#include <vm/seg_vn.h> +#include <vm/seg_kmem.h> +#include <vm/seg_vmm.h> + +#include <vm/vm_extern.h> +#include <vm/vm_map.h> +#include "vm/vm_glue.h" + +#define PMAP_TO_VMMAP(pm) ((vm_map_t) \ + ((caddr_t)(pm) - offsetof(struct vmspace, vms_pmap))) +#define VMMAP_TO_VMSPACE(vmmap) ((struct vmspace *) \ + ((caddr_t)(vmmap) - offsetof(struct vmspace, vm_map))) + + +struct vmspace_mapping { + list_node_t vmsm_node; + vm_object_t vmsm_object; + uintptr_t vmsm_addr; + size_t vmsm_len; + off_t vmsm_offset; + uint_t vmsm_prot; +}; +typedef struct vmspace_mapping vmspace_mapping_t; + +#define VMSM_OFFSET(vmsm, addr) ( \ + (vmsm)->vmsm_offset + \ + ((addr) - (uintptr_t)(vmsm)->vmsm_addr)) + + +/* Private glue interfaces */ +static void pmap_free(pmap_t); +static vmspace_mapping_t *vm_mapping_find(struct vmspace *, uintptr_t, size_t, + boolean_t); +static void vm_mapping_remove(struct vmspace *, vmspace_mapping_t *); + +static vmem_t *vmm_alloc_arena = NULL; + +static void * +vmm_arena_alloc(vmem_t *vmp, size_t size, int vmflag) +{ + return (segkmem_xalloc(vmp, NULL, size, vmflag, 0, + segkmem_page_create, &kvps[KV_VVP])); +} + +static void +vmm_arena_free(vmem_t *vmp, void *inaddr, size_t size) +{ + segkmem_xfree(vmp, inaddr, size, &kvps[KV_VVP], NULL); +} + +void +vmm_arena_init(void) +{ + vmm_alloc_arena = vmem_create("vmm_alloc_arena", NULL, 0, 1024 * 1024, + vmm_arena_alloc, vmm_arena_free, kvmm_arena, 0, VM_SLEEP); + + ASSERT(vmm_alloc_arena != NULL); +} + +void +vmm_arena_fini(void) +{ + VERIFY(vmem_size(vmm_alloc_arena, VMEM_ALLOC) == 0); + vmem_destroy(vmm_alloc_arena); + vmm_alloc_arena = NULL; +} + +struct vmspace * +vmspace_alloc(vm_offset_t start, vm_offset_t end, pmap_pinit_t pinit) +{ + struct vmspace *vms; + const uintptr_t size = end + 1; + + /* + * This whole mess is built on the assumption that a 64-bit address + * space is available to work with for the various pagetable tricks. + */ + VERIFY(ttoproc(curthread)->p_model == DATAMODEL_LP64); + VERIFY(start == 0 && size > 0 && (size & PAGEOFFSET) == 0 && + size <= (uintptr_t)USERLIMIT); + + vms = kmem_zalloc(sizeof (*vms), KM_SLEEP); + vms->vms_size = size; + list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t), + offsetof(vmspace_mapping_t, vmsm_node)); + + if (pinit(&vms->vms_pmap) == 0) { + kmem_free(vms, sizeof (*vms)); + return (NULL); + } + + return (vms); +} + +void +vmspace_free(struct vmspace *vms) +{ + VERIFY(list_is_empty(&vms->vms_maplist)); + + pmap_free(&vms->vms_pmap); + kmem_free(vms, sizeof (*vms)); +} + +pmap_t +vmspace_pmap(struct vmspace *vms) +{ + return (&vms->vms_pmap); +} + +long +vmspace_resident_count(struct vmspace *vms) +{ + /* XXXJOY: finish */ + return (0); +} + +void * +vmspace_find_kva(struct vmspace *vms, uintptr_t addr, size_t size) +{ + vmspace_mapping_t *vmsm; + void *result = NULL; + + /* + * Since vmspace_find_kva is provided so that vmm_drv consumers can do + * GPA2KVA translations, it is expected to be called when there is a + * read lock preventing vmspace alterations. As such, it can do the + * lockless vm_mapping_find() lookup. 
+ */ + vmsm = vm_mapping_find(vms, addr, size, B_TRUE); + if (vmsm != NULL) { + struct vm_object *vmo = vmsm->vmsm_object; + + switch (vmo->vmo_type) { + case OBJT_DEFAULT: + result = (void *)((uintptr_t)vmo->vmo_data + + VMSM_OFFSET(vmsm, addr)); + break; + default: + break; + } + } + + return (result); +} + +static int +vmspace_pmap_iswired(struct vmspace *vms, uintptr_t addr, uint_t *prot) +{ + pmap_t pmap = &vms->vms_pmap; + int rv; + + ASSERT(MUTEX_HELD(&vms->vms_lock)); + + rv = pmap->pm_ops->vpo_is_wired(pmap->pm_impl, addr, prot); + return (rv); +} + +static void +pmap_free(pmap_t pmap) +{ + void *pmi = pmap->pm_impl; + struct vmm_pt_ops *ops = pmap->pm_ops; + + pmap->pm_pml4 = NULL; + pmap->pm_impl = NULL; + pmap->pm_ops = NULL; + + ops->vpo_free(pmi); +} + +int +pmap_pinit_type(pmap_t pmap, enum pmap_type type, int flags) +{ + /* For use in vmm only */ + pmap->pm_type = type; + switch (type) { + case PT_EPT: { + struct vmm_pt_ops *ops = &ept_ops; + void *pml4, *pmi; + + pmi = ops->vpo_init((uintptr_t *)&pml4); + + pmap->pm_ops = ops; + pmap->pm_impl = pmi; + pmap->pm_pml4 = pml4; + return (1); + } + case PT_RVI: { + struct vmm_pt_ops *ops = &rvi_ops; + void *pml4, *pmi; + + pmi = ops->vpo_init((uintptr_t *)&pml4); + + pmap->pm_ops = ops; + pmap->pm_impl = pmi; + pmap->pm_pml4 = pml4; + return (1); + } + default: + panic("unsupported pmap type: %x", type); + break; + } + + return (1); +} + +long +pmap_wired_count(pmap_t pmap) +{ + long val; + + val = pmap->pm_ops->vpo_wired_cnt(pmap->pm_impl); + VERIFY3S(val, >=, 0); + + return (val); +} + +int +pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) +{ + /* Allow the fallback to vm_fault to handle this */ + return (-1); +} + + + +struct sglist_ent { + vm_paddr_t sge_pa; + size_t sge_len; +}; +struct sglist { + kmutex_t sg_lock; + uint_t sg_refcnt; + uint_t sg_len; + uint_t sg_next; + struct sglist_ent sg_entries[]; +}; + +#define SG_SIZE(cnt) (sizeof (struct sglist) + \ + (sizeof (struct sglist_ent) * (cnt))) + +struct sglist * +sglist_alloc(int nseg, int flags) +{ + const size_t sz = SG_SIZE(nseg); + const int flag = (flags & M_WAITOK) ? 
KM_SLEEP : KM_NOSLEEP; + struct sglist *sg; + + ASSERT(nseg > 0); + + sg = kmem_zalloc(sz, flag); + if (sg != NULL) { + sg->sg_len = nseg; + sg->sg_refcnt = 1; + } + return (sg); +} + +void +sglist_free(struct sglist *sg) +{ + size_t sz; + + mutex_enter(&sg->sg_lock); + if (sg->sg_refcnt > 1) { + sg->sg_refcnt--; + mutex_exit(&sg->sg_lock); + return; + } + + VERIFY(sg->sg_refcnt == 1); + sg->sg_refcnt = 0; + sz = SG_SIZE(sg->sg_len); + mutex_exit(&sg->sg_lock); + kmem_free(sg, sz); +} + +int +sglist_append_phys(struct sglist *sg, vm_paddr_t pa, size_t len) +{ + uint_t idx; + struct sglist_ent *ent; + + /* Restrict to page-aligned entries */ + if ((pa & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0 || len == 0) { + return (EINVAL); + } + + mutex_enter(&sg->sg_lock); + idx = sg->sg_next; + if (idx >= sg->sg_len) { + mutex_exit(&sg->sg_lock); + return (ENOSPC); + } + + ent = &sg->sg_entries[idx]; + ASSERT(ent->sge_pa == 0 && ent->sge_len == 0); + ent->sge_pa = pa; + ent->sge_len = len; + sg->sg_next++; + + mutex_exit(&sg->sg_lock); + return (0); +} + + +static pfn_t +vm_object_pager_none(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl) +{ + panic("bad vm_object pager"); + return (PFN_INVALID); +} + +static pfn_t +vm_object_pager_heap(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl) +{ + const uintptr_t kaddr = ALIGN2PAGE((uintptr_t)vmo->vmo_data + off); + uint_t idx, level; + htable_t *ht; + x86pte_t pte; + pfn_t top_pfn, pfn; + + ASSERT(vmo->vmo_type == OBJT_DEFAULT); + ASSERT(off < vmo->vmo_size); + + ht = htable_getpage(kas.a_hat, kaddr, &idx); + if (ht == NULL) { + return (PFN_INVALID); + } + pte = x86pte_get(ht, idx); + if (!PTE_ISPAGE(pte, ht->ht_level)) { + htable_release(ht); + return (PFN_INVALID); + } + + pfn = top_pfn = PTE2PFN(pte, ht->ht_level); + level = ht->ht_level; + if (ht->ht_level > 0) { + pfn += mmu_btop(kaddr & LEVEL_OFFSET((uint_t)ht->ht_level)); + } + htable_release(ht); + + if (lpfn != NULL) { + *lpfn = top_pfn; + } + if (lvl != NULL) { + *lvl = level; + } + return (pfn); +} + +static pfn_t +vm_object_pager_sg(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl) +{ + const uintptr_t aoff = ALIGN2PAGE(off); + uint_t level = 0; + uintptr_t pos = 0; + struct sglist *sg; + struct sglist_ent *ent; + pfn_t pfn = PFN_INVALID; + + ASSERT(vmo->vmo_type == OBJT_SG); + ASSERT(off < vmo->vmo_size); + + sg = vmo->vmo_data; + if (sg == NULL) { + return (PFN_INVALID); + } + + ent = &sg->sg_entries[0]; + for (uint_t i = 0; i < sg->sg_next; i++, ent++) { + if (aoff >= pos && aoff < (pos + ent->sge_len)) { + /* XXXJOY: Punt on large pages for now */ + level = 0; + pfn = mmu_btop(ent->sge_pa + (aoff - pos)); + break; + } + pos += ent->sge_len; + } + + if (lpfn != 0) { + *lpfn = pfn; + } + if (lvl != 0) { + *lvl = level; + } + return (pfn); +} + +static void +vm_reserve_pages(size_t npages) +{ + uint_t retries = 60; + int rc; + + mutex_enter(&freemem_lock); + if (availrmem < npages) { + mutex_exit(&freemem_lock); + + /* + * Set needfree and wait for the ZFS ARC reap thread to free up + * some memory. 
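The sglist shims above are how callers describe raw physical spans (for example device memory for passthrough). A short illustrative sketch follows; the addresses are made up and it is not part of the patch.

static struct sglist *
sk_sg_example(void)
{
        struct sglist *sg;

        /* Two page-aligned physical spans; unaligned input returns EINVAL */
        sg = sglist_alloc(2, M_WAITOK);
        VERIFY0(sglist_append_phys(sg, 0xfe000000UL, PAGESIZE));
        VERIFY0(sglist_append_phys(sg, 0xfe001000UL, PAGESIZE));

        /*
         * A caller would typically hand this to vm_pager_allocate(OBJT_SG,
         * sg, ...) later in this file, which takes its own sglist reference.
         */
        return (sg);
}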
+ */ + page_needfree(npages); + + mutex_enter(&freemem_lock); + while ((availrmem < npages) && retries-- > 0) { + mutex_exit(&freemem_lock); + rc = delay_sig(drv_usectohz(1 * MICROSEC)); + mutex_enter(&freemem_lock); + + if (rc == EINTR) + break; + } + mutex_exit(&freemem_lock); + + page_needfree(-npages); + } else { + mutex_exit(&freemem_lock); + } +} + +void +vm_object_clear(vm_object_t vmo) +{ + ASSERT(vmo->vmo_type == OBJT_DEFAULT); + + /* XXXJOY: Better zeroing approach? */ + bzero(vmo->vmo_data, vmo->vmo_size); +} + +vm_object_t +vm_object_allocate(objtype_t type, vm_pindex_t psize) +{ + vm_object_t vmo; + const size_t size = ptob((size_t)psize); + + vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP); + mutex_init(&vmo->vmo_lock, NULL, MUTEX_DEFAULT, NULL); + + /* For now, these are to stay fixed after allocation */ + vmo->vmo_type = type; + vmo->vmo_size = size; + vmo->vmo_attr = VM_MEMATTR_DEFAULT; + + switch (type) { + case OBJT_DEFAULT: { + vm_reserve_pages(psize); + + /* XXXJOY: opt-in to larger pages? */ + vmo->vmo_data = vmem_alloc(vmm_alloc_arena, size, KM_NOSLEEP); + if (vmo->vmo_data == NULL) { + mutex_destroy(&vmo->vmo_lock); + kmem_free(vmo, sizeof (*vmo)); + return (NULL); + } + vm_object_clear(vmo); + vmo->vmo_pager = vm_object_pager_heap; + } + break; + case OBJT_SG: + vmo->vmo_data = NULL; + vmo->vmo_pager = vm_object_pager_sg; + break; + default: + panic("Unsupported vm_object type"); + break; + } + + vmo->vmo_refcnt = 1; + return (vmo); +} + +vm_object_t +vm_pager_allocate(objtype_t type, void *handle, vm_ooffset_t size, + vm_prot_t prot, vm_ooffset_t off, void *cred) +{ + struct vm_object *vmo; + struct sglist *sg = (struct sglist *)handle; + + /* XXXJOY: be very restrictive for now */ + VERIFY(type == OBJT_SG); + VERIFY(off == 0); + + vmo = vm_object_allocate(type, size); + vmo->vmo_data = sg; + + mutex_enter(&sg->sg_lock); + VERIFY(sg->sg_refcnt++ >= 1); + mutex_exit(&sg->sg_lock); + + return (vmo); +} + +void +vm_object_deallocate(vm_object_t vmo) +{ + ASSERT(vmo != NULL); + + uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt); + /* underflow would be a deadly serious mistake */ + VERIFY3U(ref, !=, UINT_MAX); + if (ref != 0) { + return; + } + + switch (vmo->vmo_type) { + case OBJT_DEFAULT: + vmem_free(vmm_alloc_arena, vmo->vmo_data, vmo->vmo_size); + break; + case OBJT_SG: + sglist_free((struct sglist *)vmo->vmo_data); + break; + default: + panic("Unsupported vm_object type"); + break; + } + + vmo->vmo_pager = vm_object_pager_none; + vmo->vmo_data = NULL; + vmo->vmo_size = 0; + mutex_destroy(&vmo->vmo_lock); + kmem_free(vmo, sizeof (*vmo)); +} + +CTASSERT(VM_MEMATTR_UNCACHEABLE == MTRR_TYPE_UC); +CTASSERT(VM_MEMATTR_WRITE_BACK == MTRR_TYPE_WB); +int +vm_object_set_memattr(vm_object_t vmo, vm_memattr_t attr) +{ + ASSERT(MUTEX_HELD(&vmo->vmo_lock)); + + switch (attr) { + case VM_MEMATTR_UNCACHEABLE: + case VM_MEMATTR_WRITE_BACK: + vmo->vmo_attr = attr; + return (0); + default: + break; + } + return (EINVAL); +} + +void +vm_object_reference(vm_object_t vmo) +{ + ASSERT(vmo != NULL); + + uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt); + /* overflow would be a deadly serious mistake */ + VERIFY3U(ref, !=, 0); +} + +static vmspace_mapping_t * +vm_mapping_find(struct vmspace *vms, uintptr_t addr, size_t size, + boolean_t no_lock) +{ + vmspace_mapping_t *vmsm; + list_t *ml = &vms->vms_maplist; + const uintptr_t range_end = addr + size; + + ASSERT(addr <= range_end); + + if (no_lock) { + /* + * This check should be superflous with the protections + * promised by the bhyve logic 
which calls into the VM shim. + * All the same, it is cheap to be paranoid. + */ + VERIFY(!vms->vms_map_changing); + } else { + VERIFY(MUTEX_HELD(&vms->vms_lock)); + } + + if (addr >= vms->vms_size) { + return (NULL); + } + for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) { + const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len; + + if (addr >= vmsm->vmsm_addr && addr < seg_end) { + if (range_end <= seg_end) { + return (vmsm); + } else { + return (NULL); + } + } + } + return (NULL); +} + +static boolean_t +vm_mapping_gap(struct vmspace *vms, uintptr_t addr, size_t size) +{ + vmspace_mapping_t *vmsm; + list_t *ml = &vms->vms_maplist; + const uintptr_t range_end = addr + size; + + ASSERT(MUTEX_HELD(&vms->vms_lock)); + + for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) { + const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len; + + if ((vmsm->vmsm_addr >= addr && vmsm->vmsm_addr < range_end) || + (seg_end > addr && seg_end < range_end)) { + return (B_FALSE); + } + } + return (B_TRUE); +} + +static void +vm_mapping_remove(struct vmspace *vms, vmspace_mapping_t *vmsm) +{ + list_t *ml = &vms->vms_maplist; + + ASSERT(MUTEX_HELD(&vms->vms_lock)); + ASSERT(vms->vms_map_changing); + + list_remove(ml, vmsm); + vm_object_deallocate(vmsm->vmsm_object); + kmem_free(vmsm, sizeof (*vmsm)); +} + +int +vm_fault(vm_map_t map, vm_offset_t off, vm_prot_t type, int flag) +{ + struct vmspace *vms = VMMAP_TO_VMSPACE(map); + pmap_t pmap = &vms->vms_pmap; + void *pmi = pmap->pm_impl; + const uintptr_t addr = off; + vmspace_mapping_t *vmsm; + struct vm_object *vmo; + uint_t prot, map_lvl; + pfn_t pfn; + uintptr_t map_addr; + + mutex_enter(&vms->vms_lock); + if (vmspace_pmap_iswired(vms, addr, &prot) == 0) { + int err = 0; + + /* + * It is possible that multiple vCPUs will race to fault-in a + * given address. In such cases, the race loser(s) will + * encounter the already-mapped page, needing to do nothing + * more than consider it a success. + * + * If the fault exceeds protection, it is an obvious error. + */ + if ((prot & type) != type) { + err = FC_PROT; + } + + mutex_exit(&vms->vms_lock); + return (err); + } + + /* Try to wire up the address */ + if ((vmsm = vm_mapping_find(vms, addr, 0, B_FALSE)) == NULL) { + mutex_exit(&vms->vms_lock); + return (FC_NOMAP); + } + vmo = vmsm->vmsm_object; + prot = vmsm->vmsm_prot; + + /* XXXJOY: punt on large pages for now */ + pfn = vmo->vmo_pager(vmo, VMSM_OFFSET(vmsm, addr), NULL, NULL); + map_lvl = 0; + map_addr = P2ALIGN((uintptr_t)addr, LEVEL_SIZE(map_lvl)); + VERIFY(pfn != PFN_INVALID); + + /* + * If pmap failure is to be handled, the previously acquired page locks + * would need to be released. + */ + VERIFY0(pmap->pm_ops->vpo_map(pmi, map_addr, pfn, map_lvl, prot, + vmo->vmo_attr)); + pmap->pm_eptgen++; + + mutex_exit(&vms->vms_lock); + return (0); +} + +int +vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len, + vm_prot_t prot, vm_page_t *ma, int max_count) +{ + struct vmspace *vms = VMMAP_TO_VMSPACE(map); + const uintptr_t vaddr = addr; + vmspace_mapping_t *vmsm; + struct vm_object *vmo; + vm_page_t vmp; + + ASSERT0(addr & PAGEOFFSET); + ASSERT(len == PAGESIZE); + ASSERT(max_count == 1); + + /* + * Unlike practically all of the other logic that queries or + * manipulates vmspace objects, vm_fault_quick_hold_pages() does so + * without holding vms_lock. This is safe because bhyve ensures that + * changes to the vmspace map occur only when all other threads have + * been excluded from running. 
+ * + * Since this task can count on vms_maplist remaining static and does + * not need to modify the pmap (like vm_fault might), it can proceed + * without the lock. The vm_object has independent refcount and lock + * protection, while the vmo_pager methods do not rely on vms_lock for + * safety. + * + * Performing this work without locks is critical in cases where + * multiple vCPUs require simultaneous instruction emulation, such as + * for frequent guest APIC accesses on a host that lacks hardware + * acceleration for that behavior. + */ + if ((vmsm = vm_mapping_find(vms, vaddr, PAGESIZE, B_TRUE)) == NULL || + (prot & ~vmsm->vmsm_prot) != 0) { + return (-1); + } + + vmp = kmem_zalloc(sizeof (struct vm_page), KM_SLEEP); + + vmo = vmsm->vmsm_object; + vm_object_reference(vmo); + vmp->vmp_obj_held = vmo; + vmp->vmp_pfn = vmo->vmo_pager(vmo, VMSM_OFFSET(vmsm, vaddr), NULL, + NULL); + + *ma = vmp; + return (1); +} + +/* + * Find a suitable location for a mapping (and install it). + */ +int +vm_map_find(vm_map_t map, vm_object_t vmo, vm_ooffset_t off, vm_offset_t *addr, + vm_size_t len, vm_offset_t max_addr, int find_flags, vm_prot_t prot, + vm_prot_t prot_max, int cow) +{ + struct vmspace *vms = VMMAP_TO_VMSPACE(map); + const size_t size = (size_t)len; + const uintptr_t uoff = (uintptr_t)off; + uintptr_t base = *addr; + vmspace_mapping_t *vmsm; + int res = 0; + + /* For use in vmm only */ + VERIFY(find_flags == VMFS_NO_SPACE); /* essentially MAP_FIXED */ + VERIFY(max_addr == 0); + + if (size == 0 || off < 0 || + uoff >= (uoff + size) || vmo->vmo_size < (uoff + size)) { + return (EINVAL); + } + + if (*addr >= vms->vms_size) { + return (ENOMEM); + } + + vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP); + + mutex_enter(&vms->vms_lock); + vms->vms_map_changing = B_TRUE; + if (!vm_mapping_gap(vms, base, size)) { + res = ENOMEM; + goto out; + } + + if (res == 0) { + vmsm->vmsm_object = vmo; + vmsm->vmsm_addr = base; + vmsm->vmsm_len = len; + vmsm->vmsm_offset = (off_t)uoff; + vmsm->vmsm_prot = prot; + list_insert_tail(&vms->vms_maplist, vmsm); + + /* Communicate out the chosen address. 
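For reference, with VMFS_NO_SPACE the address passed in is a fixed placement rather than a hint, so a caller installing an object at a known guest-physical address is expected to look roughly like the illustrative sketch below (names and protections are placeholders, not the actual bhyve call site).

static int
sk_install_memseg(struct vmspace *vms, vm_object_t obj, uintptr_t gpa,
    size_t len)
{
        vm_offset_t addr = gpa;         /* fixed placement, not a hint */

        return (vm_map_find(&vms->vm_map, obj, 0, &addr, len, 0,
            VMFS_NO_SPACE, PROT_READ | PROT_WRITE | PROT_EXEC,
            PROT_READ | PROT_WRITE | PROT_EXEC, 0));
}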
*/ + *addr = (vm_offset_t)base; + } +out: + vms->vms_map_changing = B_FALSE; + mutex_exit(&vms->vms_lock); + if (res != 0) { + kmem_free(vmsm, sizeof (*vmsm)); + } + return (res); +} + +int +vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end) +{ + struct vmspace *vms = VMMAP_TO_VMSPACE(map); + pmap_t pmap = &vms->vms_pmap; + void *pmi = pmap->pm_impl; + const uintptr_t addr = start; + const size_t size = (size_t)(end - start); + vmspace_mapping_t *vmsm; + + ASSERT(start < end); + + mutex_enter(&vms->vms_lock); + vms->vms_map_changing = B_TRUE; + /* expect to match existing mapping exactly */ + if ((vmsm = vm_mapping_find(vms, addr, size, B_FALSE)) == NULL || + vmsm->vmsm_addr != addr || vmsm->vmsm_len != size) { + vms->vms_map_changing = B_FALSE; + mutex_exit(&vms->vms_lock); + return (ENOENT); + } + + (void) pmap->pm_ops->vpo_unmap(pmi, addr, end); + pmap->pm_eptgen++; + + vm_mapping_remove(vms, vmsm); + vms->vms_map_changing = B_FALSE; + mutex_exit(&vms->vms_lock); + return (0); +} + +int +vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) +{ + struct vmspace *vms = VMMAP_TO_VMSPACE(map); + pmap_t pmap = &vms->vms_pmap; + void *pmi = pmap->pm_impl; + const uintptr_t addr = start; + const size_t size = end - start; + vmspace_mapping_t *vmsm; + struct vm_object *vmo; + uint_t prot; + + mutex_enter(&vms->vms_lock); + + /* For the time being, only exact-match mappings are expected */ + if ((vmsm = vm_mapping_find(vms, addr, size, B_FALSE)) == NULL) { + mutex_exit(&vms->vms_lock); + return (FC_NOMAP); + } + vmo = vmsm->vmsm_object; + prot = vmsm->vmsm_prot; + + for (uintptr_t pos = addr; pos < end; ) { + pfn_t pfn; + uintptr_t pg_size, map_addr; + uint_t map_lvl = 0; + + /* XXXJOY: punt on large pages for now */ + pfn = vmo->vmo_pager(vmo, VMSM_OFFSET(vmsm, pos), NULL, NULL); + pg_size = LEVEL_SIZE(map_lvl); + map_addr = P2ALIGN(pos, pg_size); + VERIFY(pfn != PFN_INVALID); + + VERIFY0(pmap->pm_ops->vpo_map(pmi, map_addr, pfn, map_lvl, + prot, vmo->vmo_attr)); + vms->vms_pmap.pm_eptgen++; + + pos += pg_size; + } + + mutex_exit(&vms->vms_lock); + + return (0); +} + +/* Provided custom for bhyve 'devmem' segment mapping */ +int +vm_segmap_obj(struct vmspace *vms, vm_object_t vmo, struct as *as, + caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags) +{ + const size_t size = vmo->vmo_size; + int err; + + if (vmo->vmo_type != OBJT_DEFAULT) { + /* Only support default objects for now */ + return (ENOTSUP); + } + + as_rangelock(as); + + err = choose_addr(as, addrp, size, 0, ADDR_VACALIGN, flags); + if (err == 0) { + segvmm_crargs_t svma; + + svma.kaddr = vmo->vmo_data; + svma.prot = prot; + svma.cookie = vmo; + svma.hold = (segvmm_holdfn_t)vm_object_reference; + svma.rele = (segvmm_relefn_t)vm_object_deallocate; + + err = as_map(as, *addrp, size, segvmm_create, &svma); + } + + as_rangeunlock(as); + return (err); +} + +int +vm_segmap_space(struct vmspace *vms, off_t off, struct as *as, caddr_t *addrp, + off_t len, uint_t prot, uint_t maxprot, uint_t flags) +{ + const uintptr_t addr = (uintptr_t)off; + const size_t size = (uintptr_t)len; + vmspace_mapping_t *vmsm; + vm_object_t vmo; + int err; + + if (off < 0 || len <= 0 || + (addr & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) { + return (EINVAL); + } + + mutex_enter(&vms->vms_lock); + if ((vmsm = vm_mapping_find(vms, addr, size, B_FALSE)) == NULL) { + mutex_exit(&vms->vms_lock); + return (ENXIO); + } + if ((prot & ~(vmsm->vmsm_prot | PROT_USER)) != 0) { + mutex_exit(&vms->vms_lock); + return (EACCES); + } + 
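The EACCES test just above rejects any request for a protection bit that the mapping does not carry, with PROT_USER folded into the allowed set. Below is a tiny stand-alone sketch of that subset check; the XPROT_* names are invented here, while the kernel path uses the real PROT_* bits.

#include <stdbool.h>

#define XPROT_READ      0x1     /* illustrative protection bits */
#define XPROT_WRITE     0x2
#define XPROT_EXEC      0x4
#define XPROT_USER      0x8     /* stands in for the kernel's PROT_USER */

/* A request is acceptable only if every bit it asks for is permitted. */
static bool
prot_subset(unsigned int req, unsigned int allowed)
{
        return ((req & ~allowed) == 0);
}

/* prot_subset(XPROT_READ, XPROT_READ | XPROT_WRITE)  -> true  */
/* prot_subset(XPROT_READ | XPROT_WRITE, XPROT_READ)  -> false */
/* In vm_segmap_space() the allowed set is vmsm_prot | PROT_USER, so a
 * user mapping request never fails solely because of the user bit.   */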
vmo = vmsm->vmsm_object; + if (vmo->vmo_type != OBJT_DEFAULT) { + /* Only support default objects for now */ + mutex_exit(&vms->vms_lock); + return (ENOTSUP); + } + + as_rangelock(as); + + err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags); + if (err == 0) { + segvmm_crargs_t svma; + const uintptr_t addroff = addr - vmsm->vmsm_addr; + const uintptr_t mapoff = addroff + vmsm->vmsm_offset; + + VERIFY(addroff < vmsm->vmsm_len); + VERIFY((vmsm->vmsm_len - addroff) >= size); + VERIFY(mapoff < vmo->vmo_size); + VERIFY((mapoff + size) <= vmo->vmo_size); + + svma.kaddr = (void *)((uintptr_t)vmo->vmo_data + mapoff); + svma.prot = prot; + svma.cookie = vmo; + svma.hold = (segvmm_holdfn_t)vm_object_reference; + svma.rele = (segvmm_relefn_t)vm_object_deallocate; + + err = as_map(as, *addrp, len, segvmm_create, &svma); + } + + as_rangeunlock(as); + mutex_exit(&vms->vms_lock); + return (err); +} + +void +vm_page_lock(vm_page_t vmp) +{ + ASSERT(!MUTEX_HELD(&vmp->vmp_lock)); + + mutex_enter(&vmp->vmp_lock); +} + +void +vm_page_unlock(vm_page_t vmp) +{ + boolean_t purge = (vmp->vmp_pfn == PFN_INVALID); + + ASSERT(MUTEX_HELD(&vmp->vmp_lock)); + + mutex_exit(&vmp->vmp_lock); + + if (purge) { + mutex_destroy(&vmp->vmp_lock); + kmem_free(vmp, sizeof (*vmp)); + } +} + +void +vm_page_unhold(vm_page_t vmp) +{ + ASSERT(MUTEX_HELD(&vmp->vmp_lock)); + VERIFY(vmp->vmp_pfn != PFN_INVALID); + + vm_object_deallocate(vmp->vmp_obj_held); + vmp->vmp_obj_held = NULL; + vmp->vmp_pfn = PFN_INVALID; +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_stat.c b/usr/src/uts/i86pc/io/vmm/vmm_stat.c new file mode 100644 index 0000000000..2cbcce9590 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_stat.c @@ -0,0 +1,172 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> + +#include <machine/vmm.h> +#include "vmm_util.h" +#include "vmm_stat.h" + +/* + * 'vst_num_elems' is the total number of addressable statistic elements + * 'vst_num_types' is the number of unique statistic types + * + * It is always true that 'vst_num_elems' is greater than or equal to + * 'vst_num_types'. This is because a stat type may represent more than + * one element (for e.g. VMM_STAT_ARRAY). + */ +static int vst_num_elems, vst_num_types; +static struct vmm_stat_type *vsttab[MAX_VMM_STAT_ELEMS]; + +static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat"); + +#define vst_size ((size_t)vst_num_elems * sizeof(uint64_t)) + +void +vmm_stat_register(void *arg) +{ + struct vmm_stat_type *vst = arg; + + /* We require all stats to identify themselves with a description */ + if (vst->desc == NULL) + return; + + if (vst->scope == VMM_STAT_SCOPE_INTEL && !vmm_is_intel()) + return; + + if (vst->scope == VMM_STAT_SCOPE_AMD && !vmm_is_amd()) + return; + + if (vst_num_elems + vst->nelems >= MAX_VMM_STAT_ELEMS) { + printf("Cannot accommodate vmm stat type \"%s\"!\n", vst->desc); + return; + } + + vst->index = vst_num_elems; + vst_num_elems += vst->nelems; + + vsttab[vst_num_types++] = vst; +} + +int +vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf) +{ + struct vmm_stat_type *vst; + uint64_t *stats; + int i; + + if (vcpu < 0 || vcpu >= vm_get_maxcpus(vm)) + return (EINVAL); + + /* Let stats functions update their counters */ + for (i = 0; i < vst_num_types; i++) { + vst = vsttab[i]; + if (vst->func != NULL) + (*vst->func)(vm, vcpu, vst); + } + + /* Copy over the stats */ + stats = vcpu_stats(vm, vcpu); + for (i = 0; i < vst_num_elems; i++) + buf[i] = stats[i]; + *num_stats = vst_num_elems; + return (0); +} + +void * +vmm_stat_alloc(void) +{ + + return (malloc(vst_size, M_VMM_STAT, M_WAITOK)); +} + +void +vmm_stat_init(void *vp) +{ + + bzero(vp, vst_size); +} + +void +vmm_stat_free(void *vp) +{ + free(vp, M_VMM_STAT); +} + +int +vmm_stat_desc_copy(int index, char *buf, int bufsize) +{ + int i; + struct vmm_stat_type *vst; + + for (i = 0; i < vst_num_types; i++) { + vst = vsttab[i]; + if (index >= vst->index && index < vst->index + vst->nelems) { + if (vst->nelems > 1) { + snprintf(buf, bufsize, "%s[%d]", + vst->desc, index - vst->index); + } else { + strlcpy(buf, vst->desc, bufsize); + } + return (0); /* found it */ + } + } + + return (EINVAL); +} + +/* global statistics */ +VMM_STAT(VCPU_MIGRATIONS, "vcpu migration across host cpus"); +VMM_STAT(VMEXIT_COUNT, "total number of vm exits"); +VMM_STAT(VMEXIT_EXTINT, "vm exits due to external interrupt"); +VMM_STAT(VMEXIT_HLT, "number of times hlt was intercepted"); +VMM_STAT(VMEXIT_CR_ACCESS, "number of times %cr access was intercepted"); +VMM_STAT(VMEXIT_RDMSR, "number of times rdmsr was intercepted"); +VMM_STAT(VMEXIT_WRMSR, "number of times wrmsr was intercepted"); +VMM_STAT(VMEXIT_MTRAP, "number of monitor trap exits"); +VMM_STAT(VMEXIT_PAUSE, "number of times pause was intercepted"); +VMM_STAT(VMEXIT_INTR_WINDOW, "vm exits due to interrupt window opening"); +VMM_STAT(VMEXIT_NMI_WINDOW, "vm exits due to nmi window opening"); +VMM_STAT(VMEXIT_INOUT, "number of times in/out was intercepted"); +VMM_STAT(VMEXIT_CPUID, "number of times cpuid was intercepted"); +VMM_STAT(VMEXIT_NESTED_FAULT, "vm exits due to nested page fault"); +VMM_STAT(VMEXIT_INST_EMUL, "vm 
exits for instruction emulation"); +VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason"); +VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit"); +VMM_STAT(VMEXIT_REQIDLE, "number of times idle requested at exit"); +VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace"); +VMM_STAT(VMEXIT_RUNBLOCK, "number of times runblock at exit"); +VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions"); diff --git a/usr/src/uts/i86pc/io/vmm/vmm_stat.h b/usr/src/uts/i86pc/io/vmm/vmm_stat.h new file mode 100644 index 0000000000..3232e23888 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_stat.h @@ -0,0 +1,172 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * Copyright 2018 Joyent, Inc. 
+ */ + +#ifndef _VMM_STAT_H_ +#define _VMM_STAT_H_ + +#include <machine/vmm.h> + +struct vm; + +#ifdef __FreeBSD__ +#define MAX_VMM_STAT_ELEMS 64 /* arbitrary */ +#else +#define MAX_VMM_STAT_ELEMS (64 + VM_MAXCPU) /* arbitrary */ +#endif + +enum vmm_stat_scope { + VMM_STAT_SCOPE_ANY, + VMM_STAT_SCOPE_INTEL, /* Intel VMX specific statistic */ + VMM_STAT_SCOPE_AMD, /* AMD SVM specific statistic */ +}; + +struct vmm_stat_type; +typedef void (*vmm_stat_func_t)(struct vm *vm, int vcpu, + struct vmm_stat_type *stat); + +struct vmm_stat_type { + int index; /* position in the stats buffer */ + int nelems; /* standalone or array */ + const char *desc; /* description of statistic */ + vmm_stat_func_t func; + enum vmm_stat_scope scope; +}; + +void vmm_stat_register(void *arg); + +#define VMM_STAT_FDEFINE(type, nelems, desc, func, scope) \ + struct vmm_stat_type type[1] = { \ + { -1, nelems, desc, func, scope } \ + }; \ + SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_register, type) + +#define VMM_STAT_DEFINE(type, nelems, desc, scope) \ + VMM_STAT_FDEFINE(type, nelems, desc, NULL, scope) + +#define VMM_STAT_DECLARE(type) \ + extern struct vmm_stat_type type[1] + +#define VMM_STAT(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_ANY) +#define VMM_STAT_INTEL(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_INTEL) +#define VMM_STAT_AMD(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_AMD) + +#define VMM_STAT_FUNC(type, desc, func) \ + VMM_STAT_FDEFINE(type, 1, desc, func, VMM_STAT_SCOPE_ANY) + +#define VMM_STAT_ARRAY(type, nelems, desc) \ + VMM_STAT_DEFINE(type, nelems, desc, VMM_STAT_SCOPE_ANY) + +void *vmm_stat_alloc(void); +void vmm_stat_init(void *vp); +void vmm_stat_free(void *vp); + +/* + * 'buf' should be at least fit 'MAX_VMM_STAT_TYPES' entries + */ +int vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf); +int vmm_stat_desc_copy(int index, char *buf, int buflen); + +static __inline void +vmm_stat_array_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, + int statidx, uint64_t x) +{ +#ifdef VMM_KEEP_STATS + uint64_t *stats; + + stats = vcpu_stats(vm, vcpu); + + if (vst->index >= 0 && statidx < vst->nelems) + stats[vst->index + statidx] += x; +#endif +} + +static __inline void +vmm_stat_array_set(struct vm *vm, int vcpu, struct vmm_stat_type *vst, + int statidx, uint64_t val) +{ +#ifdef VMM_KEEP_STATS + uint64_t *stats; + + stats = vcpu_stats(vm, vcpu); + + if (vst->index >= 0 && statidx < vst->nelems) + stats[vst->index + statidx] = val; +#endif +} + +static __inline void +vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x) +{ + +#ifdef VMM_KEEP_STATS + vmm_stat_array_incr(vm, vcpu, vst, 0, x); +#endif +} + +static __inline void +vmm_stat_set(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t val) +{ + +#ifdef VMM_KEEP_STATS + vmm_stat_array_set(vm, vcpu, vst, 0, val); +#endif +} + +VMM_STAT_DECLARE(VCPU_MIGRATIONS); +VMM_STAT_DECLARE(VMEXIT_COUNT); +VMM_STAT_DECLARE(VMEXIT_EXTINT); +VMM_STAT_DECLARE(VMEXIT_HLT); +VMM_STAT_DECLARE(VMEXIT_CR_ACCESS); +VMM_STAT_DECLARE(VMEXIT_RDMSR); +VMM_STAT_DECLARE(VMEXIT_WRMSR); +VMM_STAT_DECLARE(VMEXIT_MTRAP); +VMM_STAT_DECLARE(VMEXIT_PAUSE); +VMM_STAT_DECLARE(VMEXIT_INTR_WINDOW); +VMM_STAT_DECLARE(VMEXIT_NMI_WINDOW); +VMM_STAT_DECLARE(VMEXIT_INOUT); +VMM_STAT_DECLARE(VMEXIT_CPUID); +VMM_STAT_DECLARE(VMEXIT_NESTED_FAULT); +VMM_STAT_DECLARE(VMEXIT_INST_EMUL); +VMM_STAT_DECLARE(VMEXIT_UNKNOWN); +VMM_STAT_DECLARE(VMEXIT_ASTPENDING); 
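The declarations in this header, together with the registration code in vmm_stat.c earlier in the diff, hand out slots in a single flat per-vCPU uint64_t buffer: each stat type records its starting index and element count, and a flat index can later be mapped back to a description. A compact userland model of that bookkeeping follows; the MODEL_* and model_* names are invented for the sketch.

#include <stdio.h>
#include <stddef.h>

#define MODEL_MAX_ELEMS 16

struct stat_type_model {
        int index;              /* first slot in the flat buffer */
        int nelems;             /* 1 for scalars, >1 for arrays  */
        const char *desc;
};

static struct stat_type_model *types[MODEL_MAX_ELEMS];
static int num_elems, num_types;

/* Mirrors vmm_stat_register(): hand out the next run of buffer slots. */
static void
model_register(struct stat_type_model *t)
{
        if (num_elems + t->nelems >= MODEL_MAX_ELEMS)
                return;
        t->index = num_elems;
        num_elems += t->nelems;
        types[num_types++] = t;
}

/* Mirrors vmm_stat_desc_copy(): map a flat index back to a name. */
static void
model_desc(int index, char *buf, size_t len)
{
        for (int i = 0; i < num_types; i++) {
                struct stat_type_model *t = types[i];

                if (index >= t->index && index < t->index + t->nelems) {
                        if (t->nelems > 1)
                                snprintf(buf, len, "%s[%d]", t->desc,
                                    index - t->index);
                        else
                                snprintf(buf, len, "%s", t->desc);
                        return;
                }
        }
}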
+VMM_STAT_DECLARE(VMEXIT_USERSPACE); +VMM_STAT_DECLARE(VMEXIT_RUNBLOCK); +VMM_STAT_DECLARE(VMEXIT_EXCEPTION); +VMM_STAT_DECLARE(VMEXIT_REQIDLE); +#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_support.s b/usr/src/uts/i86pc/io/vmm/vmm_support.s new file mode 100644 index 0000000000..5777d46959 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_support.s @@ -0,0 +1,54 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/asm_linkage.h> +#include <sys/segments.h> + +/* + * %rdi = trapno + * + * This variant is for any explicit exception injection that we need: in this + * case, we can't just, for example, do a direct "int $2", as that will then + * trash our %cr3 via tr_nmiint due to KPTI, so we have to fake a trap frame. + * Both NMIs and MCEs don't push an 'err' into the frame. + */ +ENTRY_NP(vmm_call_trap) + pushq %rbp + movq %rsp, %rbp + movq %rsp, %r11 + andq $~0xf, %rsp /* align stack */ + pushq $KDS_SEL /* %ss */ + pushq %r11 /* %rsp */ + pushfq /* %rflags */ + pushq $KCS_SEL /* %cs */ + leaq .trap_iret_dest(%rip), %rcx + pushq %rcx /* %rip */ + cli + cmpq $T_NMIFLT, %rdi + je nmiint + cmpq $T_MCE, %rdi + je mcetrap + + pushq %rdi /* save our bad trapno... */ + leaq __vmm_call_bad_trap(%rip), %rdi + xorl %eax, %eax + call panic + /*NOTREACHED*/ + +.trap_iret_dest: + popq %rbp + ret +SET_SIZE(vmm_call_trap) + +__vmm_call_bad_trap: + .string "bad trapno for vmm_call_trap()" diff --git a/usr/src/uts/i86pc/io/vmm/vmm_util.c b/usr/src/uts/i86pc/io/vmm/vmm_util.c new file mode 100644 index 0000000000..3eadfe57e5 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_util.c @@ -0,0 +1,127 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
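For reference on vmm_call_trap() in vmm_support.s above: it hand-builds the frame that iretq consumes so the vectored NMI/MCE handler can return to .trap_iret_dest. The struct below only illustrates that standard 64-bit interrupt frame layout, lowest address first (the reverse of the push order); it is not a structure defined by this change.

#include <stdint.h>

struct fake_trap_frame {
        uint64_t rip;           /* return point: .trap_iret_dest      */
        uint64_t cs;            /* KCS_SEL                            */
        uint64_t rflags;        /* saved by pushfq                    */
        uint64_t rsp;           /* pre-alignment stack pointer (%r11) */
        uint64_t ss;            /* KDS_SEL                            */
};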
+ * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/libkern.h> + +#include <machine/md_var.h> + +#include "vmm_util.h" + +boolean_t +vmm_is_intel(void) +{ + + if (strcmp(cpu_vendor, "GenuineIntel") == 0) + return (TRUE); + else + return (FALSE); +} + +boolean_t +vmm_is_amd(void) +{ + if (strcmp(cpu_vendor, "AuthenticAMD") == 0) + return (TRUE); + else + return (FALSE); +} + +boolean_t +vmm_supports_1G_pages(void) +{ + unsigned int regs[4]; + + /* + * CPUID.80000001:EDX[bit 26] = 1 indicates support for 1GB pages + * + * Both Intel and AMD support this bit. + */ + if (cpu_exthigh >= 0x80000001) { + do_cpuid(0x80000001, regs); + if (regs[3] & (1 << 26)) + return (TRUE); + } + return (FALSE); +} + +#ifdef __FreeBSD__ +#include <sys/proc.h> +#include <machine/frame.h> +#define DUMP_REG(x) printf(#x "\t\t0x%016lx\n", (long)(tf->tf_ ## x)) +#define DUMP_SEG(x) printf(#x "\t\t0x%04x\n", (unsigned)(tf->tf_ ## x)) +void +dump_trapframe(struct trapframe *tf) +{ + DUMP_REG(rdi); + DUMP_REG(rsi); + DUMP_REG(rdx); + DUMP_REG(rcx); + DUMP_REG(r8); + DUMP_REG(r9); + DUMP_REG(rax); + DUMP_REG(rbx); + DUMP_REG(rbp); + DUMP_REG(r10); + DUMP_REG(r11); + DUMP_REG(r12); + DUMP_REG(r13); + DUMP_REG(r14); + DUMP_REG(r15); + DUMP_REG(trapno); + DUMP_REG(addr); + DUMP_REG(flags); + DUMP_REG(err); + DUMP_REG(rip); + DUMP_REG(rflags); + DUMP_REG(rsp); + DUMP_SEG(cs); + DUMP_SEG(ss); + DUMP_SEG(fs); + DUMP_SEG(gs); + DUMP_SEG(es); + DUMP_SEG(ds); +} +#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_util.h b/usr/src/uts/i86pc/io/vmm/vmm_util.h new file mode 100644 index 0000000000..fc7e7364c7 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_util.h @@ -0,0 +1,42 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_UTIL_H_ +#define _VMM_UTIL_H_ + +struct trapframe; + +boolean_t vmm_is_intel(void); +boolean_t vmm_is_amd(void); +boolean_t vmm_supports_1G_pages(void); + +void dump_trapframe(struct trapframe *tf); + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_zsd.c b/usr/src/uts/i86pc/io/vmm/vmm_zsd.c new file mode 100644 index 0000000000..0271cc339e --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_zsd.c @@ -0,0 +1,218 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2018, Joyent, Inc. + */ + +#include <sys/cpuvar.h> +#include <sys/debug.h> +#include <sys/kmem.h> +#include <sys/ksynch.h> +#include <sys/list.h> +#include <sys/types.h> +#include <sys/vmm.h> +#include <sys/vmm_impl.h> +#include <sys/zone.h> + +/* + * zone specific data + * + * Zone specific data is used to keep an association between zones and the vmm + * instances that may be running in them. This is used to ensure that vmm + * instances do not outlive their parent zone. + * + * Locking strategy + * + * The global vmm_zsd_lock is held while modifying vmm_zsd_list. + * + * The per zone vz_lock in vmm_zsd_t is held while reading or writing anything + * within in vmm_zsd_t instance. This is important to ensure that there's not + * an accidental VM creating as a zone is going down. + */ + +/* + * One of these per zone. 
+ */ +struct vmm_zsd { + list_t vz_vmms; /* vmm instances in the zone */ + list_node_t vz_linkage; /* link to other zones */ + boolean_t vz_active; /* B_FALSE early in shutdown callback */ + zoneid_t vz_zoneid; + kmutex_t vz_lock; +}; + +static kmutex_t vmm_zsd_lock; /* Protects vmm_zsd_list */ +static list_t vmm_zsd_list; /* Linkage between all zsd instances */ + +static zone_key_t vmm_zsd_key; + +int +vmm_zsd_add_vm(vmm_softc_t *sc) +{ + vmm_zsd_t *zsd; + + ASSERT(sc->vmm_zone != NULL); + + mutex_enter(&vmm_zsd_lock); + + for (zsd = list_head(&vmm_zsd_list); zsd != NULL; + zsd = list_next(&vmm_zsd_list, zsd)) { + if (zsd->vz_zoneid == sc->vmm_zone->zone_id) { + break; + } + } + + VERIFY(zsd != NULL); + mutex_exit(&vmm_zsd_lock); + + mutex_enter(&zsd->vz_lock); + if (!zsd->vz_active) { + mutex_exit(&zsd->vz_lock); + return (ENOSYS); + } + + sc->vmm_zsd = zsd; + list_insert_tail(&zsd->vz_vmms, sc); + + mutex_exit(&zsd->vz_lock); + + return (0); +} + +void +vmm_zsd_rem_vm(vmm_softc_t *sc) +{ + vmm_zsd_t *zsd = sc->vmm_zsd; + + mutex_enter(&zsd->vz_lock); + + list_remove(&zsd->vz_vmms, sc); + sc->vmm_zsd = NULL; + + mutex_exit(&zsd->vz_lock); +} + +static void * +vmm_zsd_create(zoneid_t zid) +{ + vmm_zsd_t *zsd; + zone_t *zone; + + zsd = kmem_zalloc(sizeof (*zsd), KM_SLEEP); + + list_create(&zsd->vz_vmms, sizeof (vmm_softc_t), + offsetof(vmm_softc_t, vmm_zsd_linkage)); + + zsd->vz_zoneid = zid; + + mutex_init(&zsd->vz_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * If the vmm module is loaded while this zone is in the midst of + * shutting down, vmm_zsd_destroy() may be called without + * vmm_zsd_shutdown() ever being called. If it is shutting down, there + * is no sense in letting any in-flight VM creation succeed so set + * vz_active accordingly. + * + * zone_find_by_id_nolock() is used rather than zone_find_by_id() + * so that the zone is returned regardless of state. + */ + zone = zone_find_by_id_nolock(zid); + VERIFY(zone != NULL); + zsd->vz_active = zone_status_get(zone) < ZONE_IS_SHUTTING_DOWN; + + mutex_enter(&vmm_zsd_lock); + list_insert_tail(&vmm_zsd_list, zsd); + mutex_exit(&vmm_zsd_lock); + + return (zsd); +} + +/* + * Tells all runing VMs in the zone to poweroff. This does not reclaim guest + * resources (memory, etc.). + */ +static void +vmm_zsd_shutdown(zoneid_t zid, void *data) +{ + vmm_zsd_t *zsd = data; + vmm_softc_t *sc; + + mutex_enter(&zsd->vz_lock); + + /* + * This may already be B_FALSE. See comment in vmm_zsd_create(). If it + * is already B_FALSE we will take a quick trip through the empty list. + */ + zsd->vz_active = B_FALSE; + + for (sc = list_head(&zsd->vz_vmms); sc != NULL; + sc = list_next(&zsd->vz_vmms, sc)) { + /* Send a poweroff to the VM, whether running or not. */ + (void) vm_suspend(sc->vmm_vm, VM_SUSPEND_POWEROFF); + } + mutex_exit(&zsd->vz_lock); +} + +/* + * Reap all VMs that remain and free up guest resources. + */ +static void +vmm_zsd_destroy(zoneid_t zid, void *data) +{ + vmm_zsd_t *zsd = data; + vmm_softc_t *sc; + + mutex_enter(&vmm_zsd_lock); + list_remove(&vmm_zsd_list, zsd); + mutex_exit(&vmm_zsd_lock); + + mutex_enter(&zsd->vz_lock); + ASSERT(!zsd->vz_active); + + while ((sc = list_remove_head(&zsd->vz_vmms)) != NULL) { + int err; + + /* + * This frees all resources associated with the vm, including + * sc. 
+ */ + err = vmm_do_vm_destroy(sc, B_FALSE); + ASSERT3S(err, ==, 0); + } + + mutex_exit(&zsd->vz_lock); + mutex_destroy(&zsd->vz_lock); + + kmem_free(zsd, sizeof (*zsd)); +} + +void +vmm_zsd_init(void) +{ + mutex_init(&vmm_zsd_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&vmm_zsd_list, sizeof (vmm_zsd_t), + offsetof(vmm_zsd_t, vz_linkage)); + zone_key_create(&vmm_zsd_key, vmm_zsd_create, vmm_zsd_shutdown, + vmm_zsd_destroy); +} + +void +vmm_zsd_fini(void) +{ + /* Calls vmm_zsd_destroy() on all zones. */ + zone_key_delete(vmm_zsd_key); + ASSERT(list_is_empty(&vmm_zsd_list)); + + list_destroy(&vmm_zsd_list); + mutex_destroy(&vmm_zsd_lock); +} diff --git a/usr/src/uts/i86pc/io/vmm/x86.c b/usr/src/uts/i86pc/io/vmm/x86.c new file mode 100644 index 0000000000..d74f866013 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/x86.c @@ -0,0 +1,645 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/pcpu.h> +#include <sys/systm.h> +#include <sys/sysctl.h> +#include <sys/x86_archext.h> + +#include <machine/clock.h> +#include <machine/cpufunc.h> +#include <machine/md_var.h> +#include <machine/segments.h> +#include <machine/specialreg.h> + +#include <machine/vmm.h> + +#include "vmm_host.h" +#include "vmm_ktr.h" +#include "vmm_util.h" +#include "x86.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD, 0, NULL); + +#define CPUID_VM_HIGH 0x40000000 + +static const char bhyve_id[12] = "bhyve bhyve "; + +static uint64_t bhyve_xcpuids; +SYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0, + "Number of times an unknown cpuid leaf was accessed"); + +static int cpuid_leaf_b = 1; +SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN, + &cpuid_leaf_b, 0, NULL); + +/* + * Round up to the next power of two, if necessary, and then take log2. + * Returns -1 if argument is zero. + */ +static __inline int +log2(u_int x) +{ + + return (fls(x << (1 - powerof2(x))) - 1); +} + +int +x86_emulate_cpuid(struct vm *vm, int vcpu_id, + uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) +{ + const struct xsave_limits *limits; + uint64_t cr4; + int error, enable_invpcid, level, width = 0, x2apic_id = 0; + unsigned int func, regs[4], logical_cpus = 0; + enum x2apic_state x2apic_state; + uint16_t cores, maxcpus, sockets, threads; + + VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", *eax, *ecx); + + /* + * Requests for invalid CPUID levels should map to the highest + * available level instead. + */ + if (cpu_exthigh != 0 && *eax >= 0x80000000) { + if (*eax > cpu_exthigh) + *eax = cpu_exthigh; + } else if (*eax >= 0x40000000) { + if (*eax > CPUID_VM_HIGH) + *eax = CPUID_VM_HIGH; + } else if (*eax > cpu_high) { + *eax = cpu_high; + } + + func = *eax; + + /* + * In general the approach used for CPU topology is to + * advertise a flat topology where all CPUs are packages with + * no multi-core or SMT. + */ + switch (func) { + /* + * Pass these through to the guest + */ + case CPUID_0000_0000: + case CPUID_0000_0002: + case CPUID_0000_0003: + case CPUID_8000_0000: + case CPUID_8000_0002: + case CPUID_8000_0003: + case CPUID_8000_0004: + case CPUID_8000_0006: + cpuid_count(*eax, *ecx, regs); + break; + case CPUID_8000_0008: + cpuid_count(*eax, *ecx, regs); + if (vmm_is_amd()) { + /* + * As on Intel (0000_0007:0, EDX), mask out + * unsupported or unsafe AMD extended features + * (8000_0008 EBX). + */ + regs[1] &= (AMDFEID_CLZERO | AMDFEID_IRPERF | + AMDFEID_XSAVEERPTR); + + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + /* + * Here, width is ApicIdCoreIdSize, present on + * at least Family 15h and newer. It + * represents the "number of bits in the + * initial apicid that indicate thread id + * within a package." + * + * Our topo_probe_amd() uses it for + * pkg_id_shift and other OSes may rely on it. + */ + width = MIN(0xF, log2(threads * cores)); + if (width < 0x4) + width = 0; + logical_cpus = MIN(0xFF, threads * cores - 1); + regs[2] = (width << AMDID_COREID_SIZE_SHIFT) | logical_cpus; + } + break; + + case CPUID_8000_0001: + cpuid_count(*eax, *ecx, regs); + + /* + * Hide SVM from guest. + */ + regs[2] &= ~AMDID2_SVM; + + /* + * Don't advertise extended performance counter MSRs + * to the guest. + */ + regs[2] &= ~AMDID2_PCXC; + regs[2] &= ~AMDID2_PNXC; + regs[2] &= ~AMDID2_PTSCEL2I; + + /* + * Don't advertise Instruction Based Sampling feature. 
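The CPUID.8000_0008 handling earlier in this function packs ApicIdCoreIdSize into ECX bits 15:12 and the logical-CPU count minus one into bits 7:0. The stand-alone sketch below restates that packing, including the round-up log2 used for the width field; log2_roundup() and COREID_SIZE_SHIFT are local stand-ins for the kernel's fls()-based log2() helper and the AMDID_COREID_SIZE_SHIFT constant.

/* ApicIdCoreIdSize lives in CPUID 8000_0008 ECX[15:12]. */
#define COREID_SIZE_SHIFT       12

/* Round up to a power of two, then take log2; -1 for zero. */
static int
log2_roundup(unsigned int x)
{
        int n = -1;

        if (x == 0)
                return (-1);
        for (unsigned int v = x; v != 0; v >>= 1)
                n++;                    /* highest set bit position */
        if ((x & (x - 1)) != 0)
                n++;                    /* bump when not a power of two */
        return (n);
}

static unsigned int
amd_8000_0008_ecx(unsigned int cores, unsigned int threads)
{
        unsigned int lcpus = cores * threads;
        int width = log2_roundup(lcpus);

        if (width > 0xF)
                width = 0xF;
        if (width < 0x4)                /* mirrors the clamp in the diff */
                width = 0;
        if (lcpus - 1 > 0xFF)
                lcpus = 0x100;
        return (((unsigned int)width << COREID_SIZE_SHIFT) | (lcpus - 1));
}

/* amd_8000_0008_ecx(8, 2) == 0x400F: width 4, 15 additional logical CPUs. */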
+ */ + regs[2] &= ~AMDID2_IBS; + + /* NodeID MSR not available */ + regs[2] &= ~AMDID2_NODE_ID; + + /* Don't advertise the OS visible workaround feature */ + regs[2] &= ~AMDID2_OSVW; + + /* Hide mwaitx/monitorx capability from the guest */ + regs[2] &= ~AMDID2_MWAITX; + +#ifndef __FreeBSD__ + /* + * Detection routines for TCE and FFXSR are missing + * from our vm_cpuid_capability() detection logic + * today. Mask them out until that is remedied. + * They do not appear to be in common usage, so their + * absence should not cause undue trouble. + */ + regs[2] &= ~AMDID2_TCE; + regs[3] &= ~AMDID_FFXSR; +#endif + + /* + * Hide rdtscp/ia32_tsc_aux until we know how + * to deal with them. + */ + regs[3] &= ~AMDID_RDTSCP; + break; + + case CPUID_8000_0007: + /* + * AMD uses this leaf to advertise the processor's + * power monitoring and RAS capabilities. These + * features are hardware-specific and exposing + * them to a guest doesn't make a lot of sense. + * + * Intel uses this leaf only to advertise the + * "Invariant TSC" feature with all other bits + * being reserved (set to zero). + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + + /* + * "Invariant TSC" can be advertised to the guest if: + * - host TSC frequency is invariant + * - host TSCs are synchronized across physical cpus + * + * XXX This still falls short because the vcpu + * can observe the TSC moving backwards as it + * migrates across physical cpus. But at least + * it should discourage the guest from using the + * TSC to keep track of time. + */ +#ifdef __FreeBSD__ + /* XXXJOY: Wire up with our own TSC logic */ + if (tsc_is_invariant && smp_tsc) + regs[3] |= AMDPM_TSC_INVARIANT; +#endif /* __FreeBSD__ */ + break; + + case CPUID_8000_001D: + /* AMD Cache topology, like 0000_0004 for Intel. */ + if (!vmm_is_amd()) + goto default_leaf; + + /* + * Similar to Intel, generate a ficticious cache + * topology for the guest with L3 shared by the + * package, and L1 and L2 local to a core. + */ + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + switch (*ecx) { + case 0: + logical_cpus = threads; + level = 1; + func = 1; /* data cache */ + break; + case 1: + logical_cpus = threads; + level = 2; + func = 3; /* unified cache */ + break; + case 2: + logical_cpus = threads * cores; + level = 3; + func = 3; /* unified cache */ + break; + default: + logical_cpus = 0; + level = 0; + func = 0; + break; + } + + logical_cpus = MIN(0xfff, logical_cpus - 1); + regs[0] = (logical_cpus << 14) | (1 << 8) | + (level << 5) | func; + regs[1] = (func > 0) ? (CACHE_LINE_SIZE - 1) : 0; + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_8000_001E: + /* AMD Family 16h+ additional identifiers */ + if (!vmm_is_amd() || CPUID_TO_FAMILY(cpu_id) < 0x16) + goto default_leaf; + + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + regs[0] = vcpu_id; + threads = MIN(0xFF, threads - 1); + regs[1] = (threads << 8) | + (vcpu_id >> log2(threads + 1)); + /* + * XXX Bhyve topology cannot yet represent >1 node per + * processor. + */ + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_0000_0001: + do_cpuid(1, regs); + + error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state); + if (error) { + panic("x86_emulate_cpuid: error %d " + "fetching x2apic state", error); + } + + /* + * Override the APIC ID only in ebx + */ + regs[1] &= ~(CPUID_LOCAL_APIC_ID); + regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT); + + /* + * Don't expose VMX, SpeedStep, TME or SMX capability. + * Advertise x2APIC capability and Hypervisor guest. 
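A compact restatement of the leaf 1 ECX adjustments described just above, using the architectural bit positions from the Intel SDM. The ECX_* macro names are local to this sketch, and the hypervisor-present bit (31) is the usual software convention rather than an architecturally defined feature flag.

#include <stdint.h>
#include <stdbool.h>

#define ECX_VMX         (1u << 5)
#define ECX_SMX         (1u << 6)
#define ECX_EST         (1u << 7)       /* Enhanced SpeedStep */
#define ECX_TM2         (1u << 8)
#define ECX_X2APIC      (1u << 21)
#define ECX_HV          (1u << 31)      /* "hypervisor present" convention */

static uint32_t
adjust_leaf1_ecx(uint32_t ecx, bool x2apic_enabled)
{
        /* Never let the guest believe it can nest VMX/SMX or drive EST/TM2. */
        ecx &= ~(ECX_VMX | ECX_SMX | ECX_EST | ECX_TM2);

        /* Always announce that a hypervisor is present. */
        ecx |= ECX_HV;

        /* x2APIC visibility tracks the per-vCPU x2APIC state. */
        if (x2apic_enabled)
                ecx |= ECX_X2APIC;
        else
                ecx &= ~ECX_X2APIC;

        return (ecx);
}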
+ */ + regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2); + regs[2] &= ~(CPUID2_SMX); + + regs[2] |= CPUID2_HV; + + if (x2apic_state != X2APIC_DISABLED) + regs[2] |= CPUID2_X2APIC; + else + regs[2] &= ~CPUID2_X2APIC; + + /* + * Only advertise CPUID2_XSAVE in the guest if + * the host is using XSAVE. + */ + if (!(regs[2] & CPUID2_OSXSAVE)) + regs[2] &= ~CPUID2_XSAVE; + + /* + * If CPUID2_XSAVE is being advertised and the + * guest has set CR4_XSAVE, set + * CPUID2_OSXSAVE. + */ + regs[2] &= ~CPUID2_OSXSAVE; + if (regs[2] & CPUID2_XSAVE) { + error = vm_get_register(vm, vcpu_id, + VM_REG_GUEST_CR4, &cr4); + if (error) + panic("x86_emulate_cpuid: error %d " + "fetching %%cr4", error); + if (cr4 & CR4_XSAVE) + regs[2] |= CPUID2_OSXSAVE; + } + + /* + * Hide monitor/mwait until we know how to deal with + * these instructions. + */ + regs[2] &= ~CPUID2_MON; + + /* + * Hide the performance and debug features. + */ + regs[2] &= ~CPUID2_PDCM; + + /* + * No TSC deadline support in the APIC yet + */ + regs[2] &= ~CPUID2_TSCDLT; + + /* + * Hide thermal monitoring + */ + regs[3] &= ~(CPUID_ACPI | CPUID_TM); + + /* + * Hide the debug store capability. + */ + regs[3] &= ~CPUID_DS; + + /* + * Advertise the Machine Check and MTRR capability. + * + * Some guest OSes (e.g. Windows) will not boot if + * these features are absent. + */ + regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR); + + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + logical_cpus = threads * cores; + regs[1] &= ~CPUID_HTT_CORES; + regs[1] |= (logical_cpus & 0xff) << 16; + regs[3] |= CPUID_HTT; + break; + + case CPUID_0000_0004: + cpuid_count(*eax, *ecx, regs); + + if (regs[0] || regs[1] || regs[2] || regs[3]) { + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + regs[0] &= 0x3ff; + regs[0] |= (cores - 1) << 26; + /* + * Cache topology: + * - L1 and L2 are shared only by the logical + * processors in a single core. + * - L3 and above are shared by all logical + * processors in the package. + */ + logical_cpus = threads; + level = (regs[0] >> 5) & 0x7; + if (level >= 3) + logical_cpus *= cores; + regs[0] |= (logical_cpus - 1) << 14; + } + break; + + case CPUID_0000_0007: + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + + /* leaf 0 */ + if (*ecx == 0) { + cpuid_count(*eax, *ecx, regs); + + /* Only leaf 0 is supported */ + regs[0] = 0; + + /* + * Expose known-safe features. + */ + regs[1] &= (CPUID_STDEXT_FSGSBASE | + CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE | + CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 | + CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM | + CPUID_STDEXT_AVX512F | + CPUID_STDEXT_RDSEED | + CPUID_STDEXT_AVX512PF | + CPUID_STDEXT_AVX512ER | + CPUID_STDEXT_AVX512CD | CPUID_STDEXT_SHA); + regs[2] = 0; + regs[3] &= CPUID_STDEXT3_MD_CLEAR; + + /* Advertise INVPCID if it is enabled. 
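For the leaf 4 (deterministic cache parameters) adjustment a few lines further up, the sketch below shows where the synthesized topology lands: the core count in EAX bits 31:26 and the sharing count in bits 25:14, with L3 and above treated as package-wide. It is a stand-alone approximation of the in-kernel logic, not a drop-in replacement.

#include <stdint.h>

/*
 * CPUID.04H EAX layout:
 *   [4:0]   cache type, [7:5] cache level,
 *   [25:14] max logical CPUs sharing this cache, minus 1,
 *   [31:26] max cores per package, minus 1.
 */
static uint32_t
adjust_leaf4_eax(uint32_t eax, uint16_t cores, uint16_t threads)
{
        uint32_t level = (eax >> 5) & 0x7;
        uint32_t sharers = threads;

        if (level >= 3)                 /* L3 and above span the package */
                sharers *= cores;

        eax &= 0x3ff;                   /* keep type/level/flag bits only */
        eax |= (uint32_t)(cores - 1) << 26;
        eax |= (sharers - 1) << 14;
        return (eax);
}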
*/ + error = vm_get_capability(vm, vcpu_id, + VM_CAP_ENABLE_INVPCID, &enable_invpcid); + if (error == 0 && enable_invpcid) + regs[1] |= CPUID_STDEXT_INVPCID; + } + break; + + case CPUID_0000_0006: + regs[0] = CPUTPM1_ARAT; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_0000_000A: + /* + * Handle the access, but report 0 for + * all options + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_0000_000B: + /* + * Intel processor topology enumeration + */ + if (vmm_is_intel()) { + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + if (*ecx == 0) { + logical_cpus = threads; + width = log2(logical_cpus); + level = CPUID_TYPE_SMT; + x2apic_id = vcpu_id; + } + + if (*ecx == 1) { + logical_cpus = threads * cores; + width = log2(logical_cpus); + level = CPUID_TYPE_CORE; + x2apic_id = vcpu_id; + } + + if (!cpuid_leaf_b || *ecx >= 2) { + width = 0; + logical_cpus = 0; + level = 0; + x2apic_id = 0; + } + + regs[0] = width & 0x1f; + regs[1] = logical_cpus & 0xffff; + regs[2] = (level << 8) | (*ecx & 0xff); + regs[3] = x2apic_id; + } else { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + } + break; + + case CPUID_0000_000D: + limits = vmm_get_xsave_limits(); + if (!limits->xsave_enabled) { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + } + + cpuid_count(*eax, *ecx, regs); + switch (*ecx) { + case 0: + /* + * Only permit the guest to use bits + * that are active in the host in + * %xcr0. Also, claim that the + * maximum save area size is + * equivalent to the host's current + * save area size. Since this runs + * "inside" of vmrun(), it runs with + * the guest's xcr0, so the current + * save area size is correct as-is. + */ + regs[0] &= limits->xcr0_allowed; + regs[2] = limits->xsave_max_size; + regs[3] &= (limits->xcr0_allowed >> 32); + break; + case 1: + /* Only permit XSAVEOPT. */ + regs[0] &= CPUID_EXTSTATE_XSAVEOPT; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + default: + /* + * If the leaf is for a permitted feature, + * pass through as-is, otherwise return + * all zeroes. + */ + if (!(limits->xcr0_allowed & (1ul << *ecx))) { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + } + break; + } + break; + + case 0x40000000: + regs[0] = CPUID_VM_HIGH; + bcopy(bhyve_id, ®s[1], 4); + bcopy(bhyve_id + 4, ®s[2], 4); + bcopy(bhyve_id + 8, ®s[3], 4); + break; + + default: +default_leaf: + /* + * The leaf value has already been clamped so + * simply pass this through, keeping count of + * how many unhandled leaf values have been seen. + */ + atomic_add_long(&bhyve_xcpuids, 1); + cpuid_count(*eax, *ecx, regs); + break; + } + + *eax = regs[0]; + *ebx = regs[1]; + *ecx = regs[2]; + *edx = regs[3]; + + return (1); +} + +bool +vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability cap) +{ + bool rv; + + KASSERT(cap > 0 && cap < VCC_LAST, ("%s: invalid vm_cpu_capability %d", + __func__, cap)); + + /* + * Simply passthrough the capabilities of the host cpu for now. 
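The leaf 0xD, sub-leaf 0 filtering above boils down to: mask the reported XCR0 bits against what the host permits and pin the maximum save-area size to the host's current size. A sketch of that filter follows; the struct mirrors only the three fields used here and its name is invented.

#include <stdint.h>

struct xsave_limits_sketch {
        uint64_t xcr0_allowed;
        uint32_t xsave_max_size;
};

static void
filter_leaf_d0(const struct xsave_limits_sketch *lim, uint32_t regs[4])
{
        regs[0] &= (uint32_t)lim->xcr0_allowed;          /* EAX: XCR0 low  */
        regs[2] = lim->xsave_max_size;                   /* ECX: max size  */
        regs[3] &= (uint32_t)(lim->xcr0_allowed >> 32);  /* EDX: XCR0 high */
}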
+ */ + rv = false; + switch (cap) { +#ifdef __FreeBSD__ + case VCC_NO_EXECUTE: + if (amd_feature & AMDID_NX) + rv = true; + break; + case VCC_FFXSR: + if (amd_feature & AMDID_FFXSR) + rv = true; + break; + case VCC_TCE: + if (amd_feature2 & AMDID2_TCE) + rv = true; + break; +#else + case VCC_NO_EXECUTE: + if (is_x86_feature(x86_featureset, X86FSET_NX)) + rv = true; + break; + /* XXXJOY: No kernel detection for FFXR or TCE at present, so ignore */ + case VCC_FFXSR: + case VCC_TCE: + break; +#endif + default: + panic("%s: unknown vm_cpu_capability %d", __func__, cap); + } + return (rv); +} diff --git a/usr/src/uts/i86pc/io/vmm/x86.h b/usr/src/uts/i86pc/io/vmm/x86.h new file mode 100644 index 0000000000..0d70c04fd8 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/x86.h @@ -0,0 +1,82 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _X86_H_ +#define _X86_H_ + +#define CPUID_0000_0000 (0x0) +#define CPUID_0000_0001 (0x1) +#define CPUID_0000_0002 (0x2) +#define CPUID_0000_0003 (0x3) +#define CPUID_0000_0004 (0x4) +#define CPUID_0000_0006 (0x6) +#define CPUID_0000_0007 (0x7) +#define CPUID_0000_000A (0xA) +#define CPUID_0000_000B (0xB) +#define CPUID_0000_000D (0xD) +#define CPUID_8000_0000 (0x80000000) +#define CPUID_8000_0001 (0x80000001) +#define CPUID_8000_0002 (0x80000002) +#define CPUID_8000_0003 (0x80000003) +#define CPUID_8000_0004 (0x80000004) +#define CPUID_8000_0006 (0x80000006) +#define CPUID_8000_0007 (0x80000007) +#define CPUID_8000_0008 (0x80000008) +#define CPUID_8000_001D (0x8000001D) +#define CPUID_8000_001E (0x8000001E) + +/* + * CPUID instruction Fn0000_0001: + */ +#define CPUID_0000_0001_APICID_MASK (0xff<<24) +#define CPUID_0000_0001_APICID_SHIFT 24 + +/* + * CPUID instruction Fn0000_0001 ECX + */ +#define CPUID_0000_0001_FEAT0_VMX (1<<5) + +int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx, + uint32_t *ecx, uint32_t *edx); + +enum vm_cpuid_capability { + VCC_NONE, + VCC_NO_EXECUTE, + VCC_FFXSR, + VCC_TCE, + VCC_LAST +}; + +/* + * Return 'true' if the capability 'cap' is enabled in this virtual cpu + * and 'false' otherwise. 
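As a purely hypothetical illustration, not code from this changeset, of how a caller might use vm_cpuid_capability(): gate a guest-visible feature bit, here NX in CPUID.80000001H EDX bit 20, on the host capability. The helper name and the surrounding includes are assumptions for the sketch.

#include <machine/vmm.h>
#include "x86.h"

#define EDX_NX  (1u << 20)      /* CPUID.80000001H:EDX bit 20 */

static uint32_t
filter_extended_edx(struct vm *vm, int vcpuid, uint32_t edx)
{
        if (!vm_cpuid_capability(vm, vcpuid, VCC_NO_EXECUTE))
                edx &= ~EDX_NX;         /* hide NX when the host lacks it */
        return (edx);
}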
+ */ +bool vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability); +#endif diff --git a/usr/src/uts/i86pc/ml/hma_asm.s b/usr/src/uts/i86pc/ml/hma_asm.s new file mode 100644 index 0000000000..49afbdd240 --- /dev/null +++ b/usr/src/uts/i86pc/ml/hma_asm.s @@ -0,0 +1,52 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + + +#include <sys/asm_linkage.h> + + ENTRY_NP(hma_vmx_vmxon) + push %rbp + movq %rsp, %rbp + pushq %rdi + + xorl %eax, %eax + vmxon -0x8(%rbp) + ja 1f /* CF=0, ZF=0 (success) */ + incl %eax +1: + + leave + ret + SET_SIZE(hma_vmx_vmxon) + + ENTRY_NP(hma_vmx_do_invept) + push %rbp + movq %rsp, %rbp + pushq %rdi + pushq %rsi + + /* build INVEPT descriptor on stack */ + xorl %eax, %eax + pushq %rax; + pushq %rsi + + invept (%rsp), %rdi + ja 1f /* CF=0, ZF=0 (success) */ + incl %eax +1: + + leave + ret + SET_SIZE(hma_vmx_do_invept) diff --git a/usr/src/uts/i86pc/ml/kpti_trampolines.s b/usr/src/uts/i86pc/ml/kpti_trampolines.s index 737908b638..a036eefee1 100644 --- a/usr/src/uts/i86pc/ml/kpti_trampolines.s +++ b/usr/src/uts/i86pc/ml/kpti_trampolines.s @@ -674,6 +674,8 @@ tr_intr_ret_end: MK_INTR_TRAMPOLINE_NOERR(invaltrap) MK_INTR_TRAMPOLINE_NOERR(fasttrap) MK_INTR_TRAMPOLINE_NOERR(dtrace_ret) + MK_INTR_TRAMPOLINE_NOERR(brand_sys_int80) + MK_INTR_TRAMPOLINE_NOERR(sys_int80) /* * These are special because they can interrupt other traps, and diff --git a/usr/src/uts/i86pc/ml/offsets.in b/usr/src/uts/i86pc/ml/offsets.in index 7e4d2bdec3..475c5bac36 100644 --- a/usr/src/uts/i86pc/ml/offsets.in +++ b/usr/src/uts/i86pc/ml/offsets.in @@ -1,7 +1,7 @@ \ \ Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. \ Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved. -\ Copyright 2018 Joyent, Inc. +\ Copyright 2019 Joyent, Inc. \ \ CDDL HEADER START \ @@ -88,7 +88,6 @@ _kthread THREAD_SIZE t_lockstat t_lockp t_lock_flush - t_kpri_req t_oldspl t_pri t_pil @@ -128,9 +127,6 @@ _kthread THREAD_SIZE t_useracc #endif -ctxop - save_op CTXOP_SAVE - as a_hat @@ -150,6 +146,7 @@ _klwp lwp_thread lwp_procp lwp_brand + lwp_brand_syscall lwp_eosys lwp_regs lwp_arg diff --git a/usr/src/uts/i86pc/ml/syscall_asm.s b/usr/src/uts/i86pc/ml/syscall_asm.s index c604d14c08..5bb6bdea31 100644 --- a/usr/src/uts/i86pc/ml/syscall_asm.s +++ b/usr/src/uts/i86pc/ml/syscall_asm.s @@ -632,6 +632,36 @@ _sysenter_done: sysexit SET_SIZE(sys_sysenter) SET_SIZE(brand_sys_sysenter) +#endif /* __lint */ + +#if defined(__lint) +/* + * System call via an int80. This entry point is only used by the Linux + * application environment. Unlike the sysenter path, there is no default + * action to take if no callback is registered for this process. + */ +void +sys_int80() +{} + +#else /* __lint */ + + ENTRY_NP(brand_sys_int80) + BRAND_CALLBACK(BRAND_CB_INT80) + + ALTENTRY(sys_int80) + /* + * We hit an int80, but this process isn't of a brand with an int80 + * handler. Bad process! Make it look as if the INT failed. + * Modify %eip to point before the INT, push the expected error + * code and fake a GP fault. 
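The "expected error code" mentioned above is the #GP error code the CPU itself would push for a privilege-violating software interrupt: the vector in the selector-index field with the IDT bit set. The encoding is sketched below; the macro and function names are invented, and for the Linux int80 vector (0x80) the value works out to 0x402.

#include <stdint.h>

#define GP_ERR_EXT      0x1u    /* fault came from an external event */
#define GP_ERR_IDT      0x2u    /* index field refers to the IDT */

/* #GP error code for a privilege-violating 'int n': (n << 3) | IDT. */
static uint32_t
fake_int_gp_errcode(uint8_t vector)
{
        return (((uint32_t)vector << 3) | GP_ERR_IDT);
}

/* fake_int_gp_errcode(0x80) == 0x402 */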
+ * + */ + subl $2, (%esp) /* int insn 2-bytes */ + pushl $_CONST(_MUL(T_INT80, GATE_DESC_SIZE) + 2) + jmp gptrap / GP fault + SET_SIZE(sys_int80) + SET_SIZE(brand_sys_int80) /* * Declare a uintptr_t which covers the entire pc range of syscall diff --git a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s index a5f4964ba2..9ef517e2f6 100644 --- a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s +++ b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s @@ -544,6 +544,7 @@ noprod_sys_syscall: movq T_LWP(%r15), %r14 ASSERT_NO_RUPDATE_PENDING(%r14) + ENABLE_INTR_FLAGS MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM) @@ -557,6 +558,37 @@ noprod_sys_syscall: incq %gs:CPU_STATS_SYS_SYSCALL + /* + * If our LWP has an alternate system call handler, run that instead of + * the regular system call path. + */ + movq LWP_BRAND_SYSCALL(%r14), %rdi + testq %rdi, %rdi + jz _syscall_no_brand + + pushq %rax + subq $8, %rsp /* align stack for call to C */ + INDIRECT_CALL_REG(rdi) + addq $8, %rsp + + /* + * If the alternate handler returns non-zero, the normal system call + * processing is resumed. + */ + testl %eax, %eax + popq %rax + jnz _syscall_no_brand + + /* + * For branded syscalls which were handled in-kernel, shuffle the + * register state as would be done by the native handler before jumping + * to the post-syscall logic. + */ + movq REGOFF_RAX(%rsp), %r12 + movq REGOFF_RDX(%rsp), %r13 + jmp _syscall_after_brand + +_syscall_no_brand: movw %ax, T_SYSNUM(%r15) movzbl T_PRE_SYS(%r15), %ebx ORL_SYSCALLTRACE(%ebx) @@ -592,6 +624,8 @@ _syscall_invoke: shrq $32, %r13 /* upper 32-bits into %edx */ movl %r12d, %r12d /* lower 32-bits into %eax */ 5: + +_syscall_after_brand: /* * Optimistically assume that there's no post-syscall * work to do. (This is to avoid having to call syscall_mstate() @@ -865,11 +899,46 @@ _syscall32_save: incq %gs:CPU_STATS_SYS_SYSCALL /* + * If our lwp has an alternate system call handler, run that instead + * of the regular system call path. + */ + movq LWP_BRAND_SYSCALL(%r14), %rax + testq %rax, %rax + jz _syscall32_no_brand + + movb $LWP_SYS, LWP_STATE(%r14) + INDIRECT_CALL_REG(rax) + + /* + * If the alternate handler returns non-zero, the normal system call + * processing is resumed. + */ + testl %eax, %eax + jnz _syscall32_no_brand + + /* + * For branded syscalls which were handled in-kernel, shuffle the + * register state as would be done by the native handler before jumping + * to the post-syscall logic. + */ + movl REGOFF_RAX(%rsp), %r12d + movl REGOFF_RDX(%rsp), %r13d + jmp _syscall32_after_brand + +_syscall32_no_brand: + /* * Make some space for MAXSYSARGS (currently 8) 32-bit args placed * into 64-bit (long) arg slots, maintaining 16 byte alignment. Or * more succinctly: * * SA(MAXSYSARGS * sizeof (long)) == 64 + * + * Note, this space is used both to copy in the arguments from user + * land, but also to as part of the old UNIX style syscall_ap() method. + * syscall_entry expects that we do not change the values of this space + * that we give it. However, this means that when we end up in the more + * recent model of passing the arguments based on the calling + * conventions, we'll need to save an additional 16 bytes of stack. 
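The comment above notes that 16 extra bytes of stack are needed: with eight long arguments, the System V AMD64 calling convention carries only the first six in registers (%rdi, %rsi, %rdx, %rcx, %r8, %r9), so the seventh and eighth go on the stack, seventh closest to the return address. That is why the shuffle below pushes arg7 first and arg6 second. The small, compilable example here exists only to illustrate the convention; it is not kernel code.

#include <stdio.h>

static long
demo_handler(long a0, long a1, long a2, long a3,
    long a4, long a5, long a6, long a7)
{
        return (a0 + a6 + a7);  /* touch both register and stack arguments */
}

int
main(void)
{
        printf("%ld\n", demo_handler(1, 2, 3, 4, 5, 6, 7, 8));
        return (0);
}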
*/ #define SYS_DROP 64 /* drop for args */ subq $SYS_DROP, %rsp @@ -897,12 +966,16 @@ _syscall32_save: */ movq %rax, %rbx - movl 0(%rsp), %edi - movl 8(%rsp), %esi - movl 0x10(%rsp), %edx - movl 0x18(%rsp), %ecx - movl 0x20(%rsp), %r8d - movl 0x28(%rsp), %r9d + movl 0x0(%rsp), %edi /* arg0 */ + movl 0x8(%rsp), %esi /* arg1 */ + movl 0x10(%rsp), %edx /* arg2 */ + movl 0x38(%rsp), %eax /* arg7 load */ + movl 0x18(%rsp), %ecx /* arg3 */ + pushq %rax /* arg7 saved to stack */ + movl 0x28(%rsp), %r8d /* arg4 */ + movl 0x38(%rsp), %eax /* arg6 load */ + movl 0x30(%rsp), %r9d /* arg5 */ + pushq %rax /* arg6 saved to stack */ movq SY_CALLC(%rbx), %rax INDIRECT_CALL_REG(rax) @@ -921,6 +994,8 @@ _syscall32_save: shrq $32, %r13 /* upper 32-bits into %edx */ movl %eax, %r12d /* lower 32-bits into %eax */ +_syscall32_after_brand: + /* * Optimistically assume that there's no post-syscall * work to do. (This is to avoid having to call syscall_mstate() @@ -1182,15 +1257,20 @@ sys_sysenter() /* * Fetch the arguments copied onto the kernel stack and put * them in the right registers to invoke a C-style syscall handler. - * %rax contains the handler address. + * %rax contains the handler address. For the last two arguments, we + * push them onto the stack -- we can't clobber the old arguments. */ movq %rax, %rbx - movl 0(%rsp), %edi - movl 8(%rsp), %esi - movl 0x10(%rsp), %edx - movl 0x18(%rsp), %ecx - movl 0x20(%rsp), %r8d - movl 0x28(%rsp), %r9d + movl 0x0(%rsp), %edi /* arg0 */ + movl 0x8(%rsp), %esi /* arg1 */ + movl 0x10(%rsp), %edx /* arg2 */ + movl 0x38(%rsp), %eax /* arg7 load */ + movl 0x18(%rsp), %ecx /* arg3 */ + pushq %rax /* arg7 saved to stack */ + movl 0x28(%rsp), %r8d /* arg4 */ + movl 0x38(%rsp), %eax /* arg6 load */ + movl 0x30(%rsp), %r9d /* arg5 */ + pushq %rax /* arg6 saved to stack */ movq SY_CALLC(%rbx), %rax INDIRECT_CALL_REG(rax) @@ -1270,6 +1350,74 @@ sys_sysenter() #endif /* __lint */ +#if defined(__lint) +/* + * System call via an int80. This entry point is only used by the Linux + * application environment. Unlike the other entry points, there is no + * default action to take if no callback is registered for this process. + */ +void +sys_int80() +{} + +#else /* __lint */ + + ENTRY_NP(brand_sys_int80) + SWAPGS /* kernel gsbase */ + XPV_TRAP_POP + call smap_enable + + /* + * We first attempt to call the "b_int80" handler from the "struct + * brand_mach_ops" for this brand. If no handler function is installed + * for this brand, the BRAND_CALLBACK() macro returns here and we + * check the lwp for a "lwp_brand_syscall" handler. + */ + BRAND_CALLBACK(BRAND_CB_INT80, BRAND_URET_FROM_INTR_STACK()) + + /* + * Check to see if this lwp provides "lwp_brand_syscall". If so, we + * will route this int80 through the regular system call handling path. + */ + movq %r15, %gs:CPU_RTMP_R15 + movq %gs:CPU_THREAD, %r15 + movq T_LWP(%r15), %r15 + movq LWP_BRAND_SYSCALL(%r15), %r15 + testq %r15, %r15 + movq %gs:CPU_RTMP_R15, %r15 + jnz nopop_syscall_int + + /* + * The brand provided neither a "b_int80", nor a "lwp_brand_syscall" + * function, and has thus opted out of handling this trap. + */ + SWAPGS /* user gsbase */ + jmp nopop_int80 + + ENTRY_NP(sys_int80) + /* + * We hit an int80, but this process isn't of a brand with an int80 + * handler. Bad process! Make it look as if the INT failed. + * Modify %rip to point before the INT, push the expected error + * code and fake a GP fault. 
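Restating the int80 dispatch described above as plain C control flow: try the brand machine-op first, then the per-LWP brand syscall hook via the common syscall path, and only fake the #GP when neither exists. The function pointers below are stand-ins for the brand machine-ops and the lwp_brand_syscall hook; this is a sketch of the decision logic, not of the actual trampoline code.

#include <stddef.h>

typedef void (*int80_mach_cb_t)(void);  /* stand-in for b_int80 */
typedef long (*brand_sysc_t)(void);     /* stand-in for lwp_brand_syscall */

static void
int80_dispatch(int80_mach_cb_t b_int80, brand_sysc_t lwp_brand_syscall,
    void (*common_syscall)(void), void (*fake_gp_fault)(void))
{
        if (b_int80 != NULL) {
                /* Brand machine-op takes the trap directly. */
                b_int80();
                return;
        }
        if (lwp_brand_syscall != NULL) {
                /* Route through the regular system call path. */
                common_syscall();
                return;
        }
        /* No handler registered: make the INT look as if it #GP'd. */
        fake_gp_fault();
}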
Note on 64-bit hypervisor we need + * to undo the XPV_TRAP_POP and push rcx and r11 back on the stack + * because gptrap will pop them again with its own XPV_TRAP_POP. + */ + XPV_TRAP_POP + call smap_enable +nopop_int80: + subq $2, (%rsp) /* int insn 2-bytes */ + pushq $_CONST(_MUL(T_INT80, GATE_DESC_SIZE) + 2) +#if defined(__xpv) + push %r11 + push %rcx +#endif + jmp gptrap / GP fault + SET_SIZE(sys_int80) + SET_SIZE(brand_sys_int80) +#endif /* __lint */ + + /* * This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by * the generic i386 libc to do system calls. We do a small amount of setup diff --git a/usr/src/uts/i86pc/os/cpr_impl.c b/usr/src/uts/i86pc/os/cpr_impl.c index 91fb583a01..cdc1a53fb1 100644 --- a/usr/src/uts/i86pc/os/cpr_impl.c +++ b/usr/src/uts/i86pc/os/cpr_impl.c @@ -23,6 +23,10 @@ */ /* + * Copyright 2019 Joyent, Inc. + */ + +/* * Platform specific implementation code * Currently only suspend to RAM is supported (ACPI S3) */ @@ -753,6 +757,20 @@ i_cpr_is_supported(int sleeptype) if (sleeptype != CPR_TORAM) return (0); + /* + * Unfortunately, the x86 resume code was never implemented for GAS. + * The only obvious problem is that a trick necessary to appease Sun + * Studio does the wrong thing for GAS. Doubley unfortunate is that + * the condition used to detect GAS is incorrect, so we do in fact + * compile the Studio path, it just immediately fails in resume. + * + * Given that, if we were built using GCC, never allow CPR to be + * attempted. + */ +#ifdef __GNUC__ + return (0); +#else + /* * The next statement tests if a specific platform has turned off * cpr support. @@ -767,6 +785,7 @@ i_cpr_is_supported(int sleeptype) return (1); return (pm_S3_enabled); +#endif } void diff --git a/usr/src/uts/i86pc/os/cpuid.c b/usr/src/uts/i86pc/os/cpuid.c index 7796e70cd5..6c317392b3 100644 --- a/usr/src/uts/i86pc/os/cpuid.c +++ b/usr/src/uts/i86pc/os/cpuid.c @@ -5090,6 +5090,13 @@ cpuid_pass4(cpu_t *cpu, uint_t *hwcap_out) hwcap_flags_2 |= AV_386_2_CLFLUSHOPT; } + + /* Detect systems with a potential CPUID limit */ + if (cpi->cpi_vendor == X86_VENDOR_Intel && cpi->cpi_maxeax < 4) { + cmn_err(CE_NOTE, "CPUID limit detected, " + "see the CPUID(7D) man page for details\n"); + } + /* * Check a few miscilaneous features. */ diff --git a/usr/src/uts/i86pc/os/ddi_impl.c b/usr/src/uts/i86pc/os/ddi_impl.c index 6767b4e5aa..0a856cbbf2 100644 --- a/usr/src/uts/i86pc/os/ddi_impl.c +++ b/usr/src/uts/i86pc/os/ddi_impl.c @@ -24,6 +24,7 @@ * Copyright 2012 Garrett D'Amore <garrett@damore.org> * Copyright 2014 Pluribus Networks, Inc. * Copyright 2016 Nexenta Systems, Inc. + * Copyright 2018 Joyent, Inc. */ /* @@ -1010,10 +1011,10 @@ page_create_io_wrapper(void *addr, size_t len, int vmflag, void *arg) #ifdef __xpv static void -segkmem_free_io(vmem_t *vmp, void * ptr, size_t size) +segkmem_free_io(vmem_t *vmp, void *ptr, size_t size) { extern void page_destroy_io(page_t *); - segkmem_xfree(vmp, ptr, size, page_destroy_io); + segkmem_xfree(vmp, ptr, size, &kvp, page_destroy_io); } #endif diff --git a/usr/src/uts/i86pc/os/gipt.c b/usr/src/uts/i86pc/os/gipt.c new file mode 100644 index 0000000000..ace7e03438 --- /dev/null +++ b/usr/src/uts/i86pc/os/gipt.c @@ -0,0 +1,566 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/gipt.h> +#include <sys/malloc.h> +#include <sys/kmem.h> +#include <sys/sysmacros.h> +#include <sys/sunddi.h> +#include <sys/panic.h> +#include <vm/hat.h> +#include <vm/as.h> + +/* + * Generic Indexed Page Table + * + * There are several applications, such as hardware virtualization or IOMMU + * control, which require construction of a page table tree to represent a + * virtual address space. Many features of the existing htable system would be + * convenient for this, but its tight coupling to the VM system make it + * undesirable for independent consumers. The GIPT interface exists to provide + * page table allocation and indexing on top of which a table hierarchy + * (EPT, VT-d, etc) can be built by upstack logic. + * + * Types: + * + * gipt_t - Represents a single page table with a physical backing page and + * associated metadata. + * gipt_map_t - The workhorse of this facility, it contains an hash table to + * index all of the gipt_t entries which make up the page table tree. + * struct gipt_cbs - Callbacks used by the gipt_map_t: + * gipt_pte_type_cb_t - Given a PTE, emit the type (empty/page/table) + * gipt_pte_map_cb_t - Given a PFN, emit a (child) table mapping + */ + +/* + * For now, the level shifts are hard-coded to match with standard 4-level + * 64-bit paging structures. + */ + +#define GIPT_HASH(map, va, lvl) \ + ((((va) >> 12) + ((va) >> 28) + (lvl)) & ((map)->giptm_table_cnt - 1)) + +const uint_t gipt_level_shift[GIPT_MAX_LEVELS+1] = { + 12, /* 4K */ + 21, /* 2M */ + 30, /* 1G */ + 39, /* 512G */ + 48 /* MAX */ +}; +const uint64_t gipt_level_mask[GIPT_MAX_LEVELS+1] = { + 0xfffffffffffff000ull, /* 4K */ + 0xffffffffffe00000ull, /* 2M */ + 0xffffffffc0000000ull, /* 1G */ + 0xffffff8000000000ull, /* 512G */ + 0xffff000000000000ull /* MAX */ +}; +const uint64_t gipt_level_size[GIPT_MAX_LEVELS+1] = { + 0x0000000000001000ull, /* 4K */ + 0x0000000000200000ull, /* 2M */ + 0x0000000040000000ull, /* 1G */ + 0x0000008000000000ull, /* 512G */ + 0x0001000000000000ull /* MAX */ +}; +const uint64_t gipt_level_count[GIPT_MAX_LEVELS+1] = { + 0x0000000000000001ull, /* 4K */ + 0x0000000000000200ull, /* 2M */ + 0x0000000000040000ull, /* 1G */ + 0x0000000008000000ull, /* 512G */ + 0x0000001000000000ull /* MAX */ +}; + +/* + * Allocate a gipt_t structure with corresponding page of memory to hold the + * PTEs which it contains. + */ +gipt_t * +gipt_alloc(void) +{ + gipt_t *pt; + void *page; + + pt = kmem_zalloc(sizeof (*pt), KM_SLEEP); + page = kmem_zalloc(PAGESIZE, KM_SLEEP); + pt->gipt_kva = page; + pt->gipt_pfn = hat_getpfnum(kas.a_hat, page); + + return (pt); +} + +/* + * Free a gipt_t structure along with its page of PTE storage. + */ +void +gipt_free(gipt_t *pt) +{ + void *page = pt->gipt_kva; + + ASSERT(pt->gipt_pfn != PFN_INVALID); + ASSERT(pt->gipt_kva != NULL); + + pt->gipt_pfn = PFN_INVALID; + pt->gipt_kva = NULL; + + kmem_free(page, PAGESIZE); + kmem_free(pt, sizeof (*pt)); +} + +/* + * Initialize a gipt_map_t with a max level (must be >= 1) and allocating its + * hash table based on a provided size (must be a power of 2). 
+ */ +void +gipt_map_init(gipt_map_t *map, uint_t levels, uint_t hash_table_size, + const struct gipt_cbs *cbs, gipt_t *root) +{ + VERIFY(map->giptm_root == NULL); + VERIFY(map->giptm_hash == NULL); + VERIFY3U(levels, >, 0); + VERIFY3U(levels, <=, GIPT_MAX_LEVELS); + VERIFY(ISP2(hash_table_size)); + VERIFY(root != NULL); + + mutex_init(&map->giptm_lock, NULL, MUTEX_DEFAULT, NULL); + map->giptm_table_cnt = hash_table_size; + bcopy(cbs, &map->giptm_cbs, sizeof (*cbs)); + map->giptm_hash = kmem_alloc(sizeof (list_t) * map->giptm_table_cnt, + KM_SLEEP); + for (uint_t i = 0; i < hash_table_size; i++) { + list_create(&map->giptm_hash[i], sizeof (gipt_t), + offsetof(gipt_t, gipt_node)); + } + map->giptm_levels = levels; + + /* + * Insert the table root into the hash. It will be held in existence + * with an extra "valid" reference. This will prevent its clean-up + * during gipt_map_clean_parents() calls, even if it has no children. + */ + mutex_enter(&map->giptm_lock); + gipt_map_insert(map, root); + map->giptm_root = root; + root->gipt_valid_cnt++; + mutex_exit(&map->giptm_lock); +} + +/* + * Clean up a gipt_map_t by removing any lingering gipt_t entries referenced by + * it, and freeing its hash table. + */ +void +gipt_map_fini(gipt_map_t *map) +{ + const uint_t cnt = map->giptm_table_cnt; + const size_t sz = sizeof (list_t) * cnt; + + mutex_enter(&map->giptm_lock); + /* Clean up any lingering tables */ + for (uint_t i = 0; i < cnt; i++) { + list_t *list = &map->giptm_hash[i]; + gipt_t *pt; + + while ((pt = list_remove_head(list)) != NULL) { + gipt_free(pt); + } + ASSERT(list_is_empty(list)); + } + + kmem_free(map->giptm_hash, sz); + map->giptm_hash = NULL; + map->giptm_root = NULL; + map->giptm_levels = 0; + mutex_exit(&map->giptm_lock); + mutex_destroy(&map->giptm_lock); +} + +/* + * Look in the map for a gipt_t containing a given VA which is located at a + * specified level. + */ +gipt_t * +gipt_map_lookup(gipt_map_t *map, uint64_t va, uint_t lvl) +{ + gipt_t *pt; + + ASSERT(MUTEX_HELD(&map->giptm_lock)); + ASSERT3U(lvl, <=, GIPT_MAX_LEVELS); + + /* + * Lookup gipt_t at the VA aligned to the next level up. For example, + * level 0 corresponds to a page table containing 512 PTEs which cover + * 4k each, spanning a total 2MB. As such, the base VA of that table + * must be aligned to the same 2MB. + */ + const uint64_t masked_va = va & gipt_level_mask[lvl + 1]; + const uint_t hash = GIPT_HASH(map, masked_va, lvl); + + /* Only the root is expected to be at the top level. */ + if (lvl == (map->giptm_levels - 1) && map->giptm_root != NULL) { + pt = map->giptm_root; + + ASSERT3U(pt->gipt_level, ==, lvl); + + /* + * It may be so that the VA in question is not covered by the + * range of the table root. + */ + if (pt->gipt_vaddr != masked_va) { + return (NULL); + } + + return (pt); + } + + list_t *list = &map->giptm_hash[hash]; + for (pt = list_head(list); pt != NULL; pt = list_next(list, pt)) { + if (pt->gipt_vaddr == masked_va && pt->gipt_level == lvl) + break; + } + return (pt); +} + +/* + * Look in the map for the deepest (lowest level) gipt_t which contains a given + * VA. This could still fail if the VA is outside the range of the table root. 
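[editorial note] To make the alignment rule above concrete, the following user-space sketch reproduces the same arithmetic using the gipt_level_shift values and the GIPT_HASH expression from this file: for a sample VA it prints, per level, the base address of the table that would own it, the PTE index within that table, and the hash bucket gipt_map_lookup() would search. It is an illustration only; the VA is arbitrary and the bucket count of 0x2000 matches GIPT_HASH_SIZE_DEFAULT.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

static const unsigned int level_shift[5] = { 12, 21, 30, 39, 48 };

/* Mirrors GIPT_HASH(): ((va >> 12) + (va >> 28) + lvl) & (nbuckets - 1) */
static unsigned int
hash_bucket(uint64_t va, unsigned int lvl, unsigned int nbuckets)
{
	return (((va >> 12) + (va >> 28) + lvl) & (nbuckets - 1));
}

int
main(void)
{
	const uint64_t va = 0x00007f1234567000ULL;
	const unsigned int nbuckets = 0x2000;

	for (unsigned int lvl = 0; lvl < 4; lvl++) {
		/* A level-N table is aligned to the span of level N+1. */
		uint64_t base = va & ~((1ULL << level_shift[lvl + 1]) - 1);
		uint64_t idx = (va - base) >> level_shift[lvl];

		printf("level %u: table base %#" PRIx64 ", index %" PRIu64
		    ", hash bucket %u\n", lvl, base, idx,
		    hash_bucket(base, lvl, nbuckets));
	}
	return (0);
}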
+ */ +gipt_t * +gipt_map_lookup_deepest(gipt_map_t *map, uint64_t va) +{ + gipt_t *pt = NULL; + uint_t lvl; + + ASSERT(MUTEX_HELD(&map->giptm_lock)); + + for (lvl = 0; lvl < map->giptm_levels; lvl++) { + pt = gipt_map_lookup(map, va, lvl); + if (pt != NULL) { + break; + } + } + return (pt); +} + +/* + * Given a VA inside a gipt_t, calculate (based on the level of that PT) the VA + * corresponding to the next entry in the table. It returns 0 if that VA would + * fall beyond the bounds of the table. + */ +static __inline__ uint64_t +gipt_next_va(gipt_t *pt, uint64_t va) +{ + const uint_t lvl = pt->gipt_level; + const uint64_t masked = va & gipt_level_mask[lvl]; + const uint64_t max = pt->gipt_vaddr + gipt_level_size[lvl+1]; + const uint64_t next = masked + gipt_level_size[lvl]; + + ASSERT3U(masked, >=, pt->gipt_vaddr); + ASSERT3U(masked, <, max); + + /* + * If the "next" VA would be outside this table, including cases where + * it overflowed, indicate an error result. + */ + if (next >= max || next <= masked) { + return (0); + } + return (next); +} + +/* + * For a given VA, find the next VA which corresponds to a valid page mapping. + * The gipt_t containing that VA will be indicated via 'ptp'. (The gipt_t of + * the starting VA can be passed in via 'ptp' for a minor optimization). If + * there is no valid mapping higher than 'va' but contained within 'max_va', + * then this will indicate failure with 0 returned. + */ +uint64_t +gipt_map_next_page(gipt_map_t *map, uint64_t va, uint64_t max_va, gipt_t **ptp) +{ + gipt_t *pt = *ptp; + uint64_t cur_va = va; + gipt_pte_type_cb_t pte_type = map->giptm_cbs.giptc_pte_type; + + ASSERT(MUTEX_HELD(&map->giptm_lock)); + ASSERT3U(max_va, !=, 0); + ASSERT3U(ptp, !=, NULL); + + /* + * If a starting table is not provided, search the map for the deepest + * table which contains the VA. If for some reason that VA is beyond + * coverage of the map root, indicate failure. + */ + if (pt == NULL) { + pt = gipt_map_lookup_deepest(map, cur_va); + if (pt == NULL) { + goto fail; + } + } + + /* + * From the starting table (at whatever level that may reside), walk + * forward through the PTEs looking for a valid page mapping. + */ + while (cur_va < max_va) { + const uint64_t next_va = gipt_next_va(pt, cur_va); + if (next_va == 0) { + /* + * The end of this table has been reached. Ascend one + * level to continue the walk if possible. If already + * at the root, the end of the table means failure. + */ + if (pt->gipt_level >= map->giptm_levels) { + goto fail; + } + pt = gipt_map_lookup(map, cur_va, pt->gipt_level + 1); + if (pt == NULL) { + goto fail; + } + continue; + } else if (next_va >= max_va) { + /* + * Terminate the walk with a failure if the VA + * corresponding to the next PTE is beyond the max. + */ + goto fail; + } + cur_va = next_va; + + const uint64_t pte = GIPT_VA2PTE(pt, cur_va); + const gipt_pte_type_t ptet = pte_type(pte, pt->gipt_level); + if (ptet == PTET_EMPTY) { + continue; + } else if (ptet == PTET_PAGE) { + /* A valid page mapping: success. */ + *ptp = pt; + return (cur_va); + } else if (ptet == PTET_LINK) { + /* + * A child page table is present at this PTE. Look it + * up from the map. + */ + ASSERT3U(pt->gipt_level, >, 0); + pt = gipt_map_lookup(map, cur_va, pt->gipt_level - 1); + ASSERT3P(pt, !=, NULL); + break; + } else { + panic("unexpected PTE type %x @ va %p", ptet, cur_va); + } + } + + /* + * By this point, the above loop has located a table structure to + * descend into in order to find the next page. 
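[editorial note] Seen from a consumer, the walk implemented here is usually driven in a loop: each call yields the VA of the next valid page mapping strictly above the VA passed in (handing the owning table back through 'ptp'), and returns 0 once nothing else exists below 'max_va'. A hedged usage sketch, with 'map', 'start_va' and 'end_va' standing in for the caller's state and the map lock held as the ASSERTs require:

	gipt_t *pt = NULL;
	uint64_t va = start_va;

	mutex_enter(&map->giptm_lock);
	while ((va = gipt_map_next_page(map, va, end_va, &pt)) != 0) {
		/*
		 * 'va' is now a valid page mapping and 'pt' is the table
		 * holding its PTE, reachable via GIPT_VA2PTE(pt, va).
		 */
	}
	mutex_exit(&map->giptm_lock);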
+ */ + while (cur_va < max_va) { + const uint64_t pte = GIPT_VA2PTE(pt, cur_va); + const gipt_pte_type_t ptet = pte_type(pte, pt->gipt_level); + + if (ptet == PTET_EMPTY) { + const uint64_t next_va = gipt_next_va(pt, cur_va); + if (next_va == 0 || next_va >= max_va) { + goto fail; + } + cur_va = next_va; + continue; + } else if (ptet == PTET_PAGE) { + /* A valid page mapping: success. */ + *ptp = pt; + return (cur_va); + } else if (ptet == PTET_LINK) { + /* + * A child page table is present at this PTE. Look it + * up from the map. + */ + ASSERT3U(pt->gipt_level, >, 0); + pt = gipt_map_lookup(map, cur_va, pt->gipt_level - 1); + ASSERT3P(pt, !=, NULL); + } else { + panic("unexpected PTE type %x @ va %p", ptet, cur_va); + } + } + +fail: + *ptp = NULL; + return (0); +} + +/* + * Insert a gipt_t into the map based on its VA and level. It is up to the + * caller to ensure that a duplicate entry does not already exist in the map. + */ +void +gipt_map_insert(gipt_map_t *map, gipt_t *pt) +{ + const uint_t hash = GIPT_HASH(map, pt->gipt_vaddr, pt->gipt_level); + + ASSERT(MUTEX_HELD(&map->giptm_lock)); + ASSERT(gipt_map_lookup(map, pt->gipt_vaddr, pt->gipt_level) == NULL); + VERIFY3U(pt->gipt_level, <, map->giptm_levels); + + list_insert_head(&map->giptm_hash[hash], pt); +} + +/* + * Remove a gipt_t from the map. + */ +void +gipt_map_remove(gipt_map_t *map, gipt_t *pt) +{ + const uint_t hash = GIPT_HASH(map, pt->gipt_vaddr, pt->gipt_level); + + ASSERT(MUTEX_HELD(&map->giptm_lock)); + + list_remove(&map->giptm_hash[hash], pt); +} + +/* + * Given a VA, create any missing gipt_t entries from the specified level all + * the way up to (but not including) the root. This is done from lowest level + * to highest, and stops when an existing table covering that VA is found. + * References to any created gipt_t tables, plus the final "found" gipt_t are + * stored in 'pts'. The number of gipt_t pointers stored to 'pts' serves as + * the return value (1 <= val <= root level). It is up to the caller to + * populate linking PTEs to the newly created empty tables. + */ +static uint_t +gipt_map_ensure_chain(gipt_map_t *map, uint64_t va, uint_t lvl, gipt_t **pts) +{ + const uint_t root_lvl = map->giptm_root->gipt_level; + uint_t clvl = lvl, count = 0; + gipt_t *child_pt = NULL; + + ASSERT(MUTEX_HELD(&map->giptm_lock)); + ASSERT3U(lvl, <, root_lvl); + ASSERT3P(map->giptm_root, !=, NULL); + + do { + const uint64_t pva = (va & gipt_level_mask[clvl + 1]); + gipt_t *pt; + + pt = gipt_map_lookup(map, pva, clvl); + if (pt != NULL) { + ASSERT3U(pva, ==, pt->gipt_vaddr); + + if (child_pt != NULL) { + child_pt->gipt_parent = pt; + } + pts[count++] = pt; + return (count); + } + + pt = gipt_alloc(); + pt->gipt_vaddr = pva; + pt->gipt_level = clvl; + if (child_pt != NULL) { + child_pt->gipt_parent = pt; + } + + gipt_map_insert(map, pt); + child_pt = pt; + pts[count++] = pt; + clvl++; + } while (clvl <= root_lvl); + + return (count); +} + +/* + * Ensure that a page table covering a VA at a specified level exists. This + * will create any necessary tables chaining up to the root as well. 
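[editorial note] Taken together with gipt_map_clean_parents(), the function described above gives consumers a simple map/unmap pattern: ask for the leaf table (creating intermediate levels as needed), write the leaf PTE directly, and let the clean-up pass trim tables whose gipt_valid_cnt drops back to zero. A hedged sketch; 'pte_value' is supplied by the consumer's own PTE format, and the valid-count accounting is inferred from how gipt_map_clean_parents() behaves:

	/* Map a 4K page at 'va' (giptm_lock held). */
	gipt_t *pt = gipt_map_create_parents(map, va, 0);
	if (pt != NULL) {
		uint64_t *ptep = GIPT_VA2PTEP(pt, va);

		/* The slot is expected to be PTET_EMPTY at this point. */
		*ptep = pte_value;
		pt->gipt_valid_cnt++;
	}

	/* Later: unmap it again and trim any tables left empty. */
	*GIPT_VA2PTEP(pt, va) = 0;
	pt->gipt_valid_cnt--;
	gipt_map_clean_parents(map, pt);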
+ */ +gipt_t * +gipt_map_create_parents(gipt_map_t *map, uint64_t va, uint_t lvl) +{ + gipt_t *pt, *pts[GIPT_MAX_LEVELS] = { 0 }; + gipt_pte_type_cb_t pte_type = map->giptm_cbs.giptc_pte_type; + gipt_pte_map_cb_t pte_map = map->giptm_cbs.giptc_pte_map; + uint64_t *ptep; + uint_t i, count; + + ASSERT(MUTEX_HELD(&map->giptm_lock)); + + count = gipt_map_ensure_chain(map, va, lvl, pts); + if (count == 1) { + /* Table already exists in the hierarchy */ + return (pts[0]); + } + ASSERT3U(count, >, 1); + + /* Make sure there is not already a large page mapping at the top */ + pt = pts[count - 1]; + if (pte_type(GIPT_VA2PTE(pt, va), pt->gipt_level) == PTET_PAGE) { + const uint_t end = count - 1; + + /* + * Nuke those gipt_t entries which were optimistically created + * for what was found to be a conflicted mapping. + */ + for (i = 0; i < end; i++) { + gipt_map_remove(map, pts[i]); + gipt_free(pts[i]); + } + return (NULL); + } + + /* Initialize the appropriate tables from bottom to top */ + for (i = 1; i < count; i++) { + pt = pts[i]; + ptep = GIPT_VA2PTEP(pt, va); + + /* + * Since gipt_map_ensure_chain() creates missing tables until + * it find a valid one, and that existing table has been + * checked for the existence of a large page, nothing should + * occupy this PTE. + */ + ASSERT3U(pte_type(*ptep, pt->gipt_level), ==, PTET_EMPTY); + + *ptep = pte_map(pts[i - 1]->gipt_pfn); + pt->gipt_valid_cnt++; + } + + return (pts[0]); +} + +/* + * If a page table is empty, free it from the map, as well as any parent tables + * that would subsequently become empty as part of the clean-up. As noted in + * gipt_map_init(), the table root is a special case and will remain in the + * map, even when empty. + */ +void +gipt_map_clean_parents(gipt_map_t *map, gipt_t *pt) +{ + ASSERT(MUTEX_HELD(&map->giptm_lock)); + + while (pt->gipt_valid_cnt == 0) { + gipt_t *parent = pt->gipt_parent; + uint64_t *ptep = GIPT_VA2PTEP(parent, pt->gipt_vaddr); + + ASSERT3S(map->giptm_cbs.giptc_pte_type(*ptep, + parent->gipt_level), ==, PTET_LINK); + + /* + * For now, it is assumed that all gipt consumers consider PTE + * zeroing as an adequate action for table unmap. + */ + *ptep = 0; + + parent->gipt_valid_cnt--; + gipt_map_remove(map, pt); + gipt_free(pt); + pt = parent; + } +} diff --git a/usr/src/uts/i86pc/os/hma.c b/usr/src/uts/i86pc/os/hma.c new file mode 100644 index 0000000000..a41ff3e0d1 --- /dev/null +++ b/usr/src/uts/i86pc/os/hma.c @@ -0,0 +1,725 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. 
+ */ + +#include <sys/cpuvar.h> +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/machsystm.h> +#include <sys/controlregs.h> +#include <sys/x86_archext.h> +#include <sys/id_space.h> +#include <sys/hma.h> +#include <sys/cmn_err.h> +#include <vm/hat.h> +#include <vm/as.h> + +struct hma_reg { + const char *hr_name; + list_node_t hr_node; +}; + +static kmutex_t hma_lock; +static list_t hma_registrations; +static boolean_t hma_exclusive = B_FALSE; + +static boolean_t hma_vmx_ready = B_FALSE; +static const char *hma_vmx_error = NULL; +static id_space_t *hma_vmx_vpid; + +/* + * The bulk of HMA state (VMX & SVM) is protected by cpu_lock, rather than a + * mutex specific to the module. It (cpu_lock) is already required for the + * state needed to perform setup on all CPUs, so it was a natural fit to + * protect this data too. + */ +typedef enum hma_cpu_state { + HCS_UNINITIALIZED = 0, + HCS_READY, + HCS_ERROR +} hma_cpu_state_t; +static hma_cpu_state_t hma_cpu_status[NCPU]; + +/* HMA-internal tracking of optional VMX capabilities */ +typedef enum { + HVC_EPT = (1 << 0), + HVC_VPID = (1 << 1), + HVC_INVEPT_ONE = (1 << 2), + HVC_INVEPT_ALL = (1 << 3), +} hma_vmx_capab_t; + +static void *hma_vmx_vmxon_page[NCPU]; +static uintptr_t hma_vmx_vmxon_pa[NCPU]; +static uint32_t hma_vmx_revision; +static hma_vmx_capab_t hma_vmx_capabs = 0; + +static boolean_t hma_svm_ready = B_FALSE; +static const char *hma_svm_error = NULL; +static uint32_t hma_svm_features; +static uint32_t hma_svm_max_asid; + +static void *hma_svm_hsave_page[NCPU]; +static uintptr_t hma_svm_hsave_pa[NCPU]; + +static hma_svm_asid_t hma_svm_cpu_asid[NCPU]; + + +static int hma_vmx_init(void); +static int hma_svm_init(void); + +/* Helpers from ml/hma_asm.s */ +int hma_vmx_do_invept(int, uintptr_t); +int hma_vmx_vmxon(uintptr_t); + +void +hma_init(void) +{ + mutex_init(&hma_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&hma_registrations, sizeof (struct hma_reg), + offsetof(struct hma_reg, hr_node)); + + switch (cpuid_getvendor(CPU)) { + case X86_VENDOR_Intel: + (void) hma_vmx_init(); + break; + case X86_VENDOR_AMD: + (void) hma_svm_init(); + break; + default: + break; + } +} + +static hma_reg_t * +hma_register_backend(const char *name) +{ + struct hma_reg *reg; + boolean_t is_ready; + + ASSERT(MUTEX_HELD(&hma_lock)); + + switch (cpuid_getvendor(CPU)) { + case X86_VENDOR_Intel: + is_ready = hma_vmx_ready; + break; + case X86_VENDOR_AMD: + is_ready = hma_svm_ready; + break; + default: + is_ready = B_FALSE; + break; + } + + if (!is_ready) + return (NULL); + + reg = kmem_zalloc(sizeof (*reg), KM_SLEEP); + reg->hr_name = name; + list_insert_tail(&hma_registrations, reg); + + return (reg); +} + +hma_reg_t * +hma_register(const char *name) +{ + struct hma_reg *reg = NULL; + + VERIFY(name != NULL); + + mutex_enter(&hma_lock); + + if (!hma_exclusive) + reg = hma_register_backend(name); + + mutex_exit(&hma_lock); + + return (reg); +} + +hma_reg_t * +hma_register_exclusive(const char *name) +{ + struct hma_reg *reg = NULL; + + VERIFY(name != NULL); + + mutex_enter(&hma_lock); + + if (list_is_empty(&hma_registrations)) { + reg = hma_register_backend(name); + if (reg != NULL) + hma_exclusive = B_TRUE; + } + + mutex_exit(&hma_lock); + + return (reg); +} + +void +hma_unregister(hma_reg_t *reg) +{ + VERIFY(reg != NULL); + VERIFY(!list_is_empty(&hma_registrations)); + + mutex_enter(&hma_lock); + list_remove(&hma_registrations, reg); + if (hma_exclusive && list_is_empty(&hma_registrations)) + hma_exclusive = B_FALSE; + mutex_exit(&hma_lock); + 
kmem_free(reg, sizeof (*reg)); +} + +/* + * VPID 0 is reserved for instances where VPID is disabled. Some hypervisors + * (read: bhyve) reserve lower-order VPIDs for use in fallback behavior if + * unique VPIDs could not be allocated for all the vCPUs belonging to a VM. + */ +#define HMA_VPID_RESERVED NCPU + +uint16_t +hma_vmx_vpid_alloc(void) +{ + id_t res; + + /* Do not bother if the CPU lacks support */ + if ((hma_vmx_capabs & HVC_VPID) == 0) { + return (0); + } + + res = id_alloc_nosleep(hma_vmx_vpid); + if (res == -1) { + return (0); + } else { + ASSERT(res > HMA_VPID_RESERVED && res <= UINT16_MAX); + return (res); + } +} + +void +hma_vmx_vpid_free(uint16_t vpid) +{ + VERIFY(vpid > HMA_VPID_RESERVED); + id_free(hma_vmx_vpid, (id_t)vpid); +} + +#define INVEPT_SINGLE_CONTEXT 1 +#define INVEPT_ALL_CONTEXTS 2 + +static int +hma_vmx_invept_xcall(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3 __unused) +{ + int flag = (int)arg1; + uintptr_t eptp = (uintptr_t)arg2; + + ASSERT(flag == INVEPT_SINGLE_CONTEXT || flag == INVEPT_ALL_CONTEXTS); + + VERIFY0(hma_vmx_do_invept(flag, eptp)); + return (0); +} + +void +hma_vmx_invept_allcpus(uintptr_t eptp) +{ + int flag = -1; + cpuset_t set; + + if ((hma_vmx_capabs & HVC_INVEPT_ONE) != 0) { + flag = INVEPT_SINGLE_CONTEXT; + } else if ((hma_vmx_capabs & HVC_INVEPT_ALL) != 0) { + flag = INVEPT_ALL_CONTEXTS; + eptp = 0; + } else { + return; + } + + cpuset_zero(&set); + mutex_enter(&cpu_lock); + + cpuset_or(&set, &cpu_active_set); + xc_call((xc_arg_t)flag, (xc_arg_t)eptp, 0, CPUSET2BV(set), + hma_vmx_invept_xcall); + + mutex_exit(&cpu_lock); +} + +static int +hma_vmx_cpu_vmxon(xc_arg_t arg1 __unused, xc_arg_t arg2 __unused, + xc_arg_t arg3 __unused) +{ + uint64_t fctrl; + processorid_t id = CPU->cpu_seqid; + void *vmxon_region = hma_vmx_vmxon_page[id]; + uintptr_t vmxon_pa = hma_vmx_vmxon_pa[id]; + + VERIFY(vmxon_region != NULL && vmxon_pa != 0); + + /* + * Ensure that the VMX support and lock bits are enabled in the + * feature-control MSR. + */ + fctrl = rdmsr(MSR_IA32_FEAT_CTRL); + if ((fctrl & IA32_FEAT_CTRL_LOCK) == 0 || + (fctrl & IA32_FEAT_CTRL_VMX_EN) == 0) { + fctrl = fctrl | IA32_FEAT_CTRL_VMX_EN | IA32_FEAT_CTRL_LOCK; + wrmsr(MSR_IA32_FEAT_CTRL, fctrl); + } + + setcr4(getcr4() | CR4_VMXE); + + if (hma_vmx_vmxon(vmxon_pa) == 0) { + hma_cpu_status[id] = HCS_READY; + } else { + hma_cpu_status[id] = HCS_ERROR; + + /* + * If VMX has already been marked active and available for the + * system, then failure to perform VMXON on a newly-onlined CPU + * represents a fatal problem. Continuing on would mean + * failure for any hypervisor thread which landed here. + */ + if (hma_vmx_ready) { + panic("VMXON failure after VMX marked ready"); + } + } + return (0); +} + +static int +hma_vmx_cpu_setup(cpu_setup_t what, int id, void *arg __unused) +{ + hma_cpu_state_t state; + + ASSERT(MUTEX_HELD(&cpu_lock)); + ASSERT(id >= 0 && id < NCPU); + + if (what != CPU_ON) { + /* + * For the purposes of VMX setup, only the CPU_ON event is of + * interest. Letting VMX state linger on an offline CPU should + * not cause any harm. + * + * This logic assumes that any offlining activity is strictly + * administrative in nature and will not alter any existing + * configuration (such as %cr4 bits previously set). 
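[editorial note] From a hypervisor module's perspective, the registration and VPID interfaces above compose into a simple attach/detach pattern: register before touching any virtualization state, allocate VPIDs per vCPU while registered, and release both on the way out. A hedged sketch; the module name and error handling are illustrative only:

	hma_reg_t *reg;
	uint16_t vpid;

	if ((reg = hma_register("sample_hvm")) == NULL) {
		/* No working VMX/SVM, or an exclusive registrant exists. */
		return (ENXIO);
	}

	/* Per vCPU: a return of 0 means "run without a unique VPID". */
	vpid = hma_vmx_vpid_alloc();

	/* ... guest lifetime ... */

	if (vpid != 0)
		hma_vmx_vpid_free(vpid);
	hma_unregister(reg);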
+ */ + return (0); + } + + state = hma_cpu_status[id]; + if (state == HCS_ERROR) { + return (-1); + } + + /* Allocate the VMXON page for this CPU, if not already done */ + if (hma_vmx_vmxon_page[id] == NULL) { + caddr_t va; + pfn_t pfn; + + va = kmem_alloc(PAGESIZE, KM_SLEEP); + VERIFY0((uintptr_t)va & PAGEOFFSET); + hma_vmx_vmxon_page[id] = va; + + /* Initialize the VMX revision field as expected */ + bcopy(&hma_vmx_revision, va, sizeof (hma_vmx_revision)); + + /* + * Cache the physical address of the VMXON page rather than + * looking it up later when the potential blocking of + * hat_getpfnum would be less acceptable. + */ + pfn = hat_getpfnum(kas.a_hat, va); + hma_vmx_vmxon_pa[id] = (pfn << PAGESHIFT); + } else { + VERIFY(hma_vmx_vmxon_pa[id] != 0); + } + + if (state == HCS_UNINITIALIZED) { + cpuset_t set; + + /* Activate VMX on this CPU */ + cpuset_zero(&set); + cpuset_add(&set, id); + xc_call(0, 0, 0, CPUSET2BV(set), hma_vmx_cpu_vmxon); + } else { + VERIFY3U(state, ==, HCS_READY); + + /* + * If an already-initialized CPU is going back online, perform + * an all-contexts invept to eliminate the possibility of + * cached EPT state causing issues. + */ + if ((hma_vmx_capabs & HVC_INVEPT_ALL) != 0) { + cpuset_t set; + + cpuset_zero(&set); + cpuset_add(&set, id); + xc_call((xc_arg_t)INVEPT_ALL_CONTEXTS, 0, 0, + CPUSET2BV(set), hma_vmx_invept_xcall); + } + } + + return (hma_cpu_status[id] != HCS_READY); +} + +/* + * Determining the availability of VM execution controls is somewhat different + * from conventional means, where one simply checks for asserted bits in the + * MSR value. Instead, these execution control MSRs are split into two halves: + * the lower 32-bits indicating capabilities which can be zeroed in the VMCS + * field and the upper 32-bits indicating capabilities which can be set to one. + * + * It is described in detail in Appendix A.3 of SDM volume 3. + */ +#define VMX_CTL_ONE_SETTING(val, flag) \ + (((val) & ((uint64_t)(flag) << 32)) != 0) + +static const char * +hma_vmx_query_details(void) +{ + boolean_t query_true_ctl = B_FALSE; + uint64_t msr; + + /* The basic INS/OUTS functionality is cited as a necessary prereq */ + msr = rdmsr(MSR_IA32_VMX_BASIC); + if ((msr & IA32_VMX_BASIC_INS_OUTS) == 0) { + return ("VMX does not support INS/OUTS"); + } + + /* Record the VMX revision for later VMXON usage */ + hma_vmx_revision = (uint32_t)msr; + + /* + * Bit 55 in the VMX_BASIC MSR determines how VMX control information + * can be queried. + */ + query_true_ctl = (msr & IA32_VMX_BASIC_TRUE_CTRLS) != 0; + + /* Check for EPT and VPID support */ + msr = rdmsr(query_true_ctl ? 
+ MSR_IA32_VMX_TRUE_PROCBASED_CTLS : MSR_IA32_VMX_PROCBASED_CTLS); + if (VMX_CTL_ONE_SETTING(msr, IA32_VMX_PROCBASED_2ND_CTLS)) { + msr = rdmsr(MSR_IA32_VMX_PROCBASED2_CTLS); + if (VMX_CTL_ONE_SETTING(msr, IA32_VMX_PROCBASED2_EPT)) { + hma_vmx_capabs |= HVC_EPT; + } + if (VMX_CTL_ONE_SETTING(msr, IA32_VMX_PROCBASED2_VPID)) { + hma_vmx_capabs |= HVC_VPID; + } + } + + /* Check for INVEPT support */ + if ((hma_vmx_capabs & HVC_EPT) != 0) { + msr = rdmsr(MSR_IA32_VMX_EPT_VPID_CAP); + if ((msr & IA32_VMX_EPT_VPID_INVEPT) != 0) { + if ((msr & IA32_VMX_EPT_VPID_INVEPT_SINGLE) != 0) { + hma_vmx_capabs |= HVC_INVEPT_ONE; + } + if ((msr & IA32_VMX_EPT_VPID_INVEPT_ALL) != 0) { + hma_vmx_capabs |= HVC_INVEPT_ALL; + } + } + } + + return (NULL); +} + +static int +hma_vmx_init(void) +{ + cpu_t *cp; + uint64_t msr; + int err = 0; + const char *msg = NULL; + + if (!is_x86_feature(x86_featureset, X86FSET_VMX)) { + msg = "CPU does not support VMX"; + goto bail; + } + + /* Has the BIOS set the feature-control lock bit without VMX enabled? */ + msr = rdmsr(MSR_IA32_FEAT_CTRL); + if ((msr & IA32_FEAT_CTRL_LOCK) != 0 && + (msr & IA32_FEAT_CTRL_VMX_EN) == 0) { + msg = "VMX support disabled by BIOS"; + goto bail; + } + + msg = hma_vmx_query_details(); + if (msg != NULL) { + goto bail; + } + + mutex_enter(&cpu_lock); + /* Perform VMX configuration for already-online CPUs. */ + cp = cpu_active; + do { + err = hma_vmx_cpu_setup(CPU_ON, cp->cpu_seqid, NULL); + if (err != 0) { + msg = "failure during VMXON setup"; + mutex_exit(&cpu_lock); + goto bail; + } + } while ((cp = cp->cpu_next_onln) != cpu_active); + + /* + * Register callback for later-onlined CPUs and perform other remaining + * resource allocation. + */ + register_cpu_setup_func(hma_vmx_cpu_setup, NULL); + mutex_exit(&cpu_lock); + + hma_vmx_vpid = id_space_create("hma_vmx_vpid", HMA_VPID_RESERVED + 1, + UINT16_MAX); + hma_vmx_ready = B_TRUE; + + return (0); + +bail: + hma_vmx_error = msg; + cmn_err(CE_NOTE, "hma_vmx_init: %s", msg); + return (-1); +} + +#define VMCB_FLUSH_NOTHING 0x0 +#define VMCB_FLUSH_ALL 0x1 +#define VMCB_FLUSH_ASID 0x3 + +void +hma_svm_asid_init(hma_svm_asid_t *vcp) +{ + /* + * Initialize the generation to 0, forcing an ASID allocation on first + * entry. Leave the ASID at 0, so if the host forgoes the call to + * hma_svm_asid_update(), SVM will bail on the invalid vcpu state. + */ + vcp->hsa_gen = 0; + vcp->hsa_asid = 0; +} + +uint8_t +hma_svm_asid_update(hma_svm_asid_t *vcp, boolean_t flush_by_asid, + boolean_t npt_flush) +{ + hma_svm_asid_t *hcp = &hma_svm_cpu_asid[CPU->cpu_seqid]; + + ASSERT(curthread->t_preempt != 0); + + /* + * If NPT changes dictate a TLB flush and by-ASID flushing is not + * supported/used, force a fresh ASID allocation. + */ + if (npt_flush && !flush_by_asid) { + vcp->hsa_gen = 0; + } + + if (vcp->hsa_gen != hcp->hsa_gen) { + hcp->hsa_asid++; + + if (hcp->hsa_asid >= hma_svm_max_asid) { + /* Keep the ASID properly constrained */ + hcp->hsa_asid = 1; + hcp->hsa_gen++; + if (hcp->hsa_gen == 0) { + /* + * Stay clear of the '0' sentinel value for + * generation, if wrapping around. 
+ */ + hcp->hsa_gen = 1; + } + } + vcp->hsa_gen = hcp->hsa_gen; + vcp->hsa_asid = hcp->hsa_asid; + + ASSERT(vcp->hsa_asid != 0); + ASSERT3U(vcp->hsa_asid, <, hma_svm_max_asid); + + if (flush_by_asid) { + return (VMCB_FLUSH_ASID); + } + return (VMCB_FLUSH_ALL); + } else if (npt_flush) { + ASSERT(flush_by_asid); + return (VMCB_FLUSH_ASID); + } + return (VMCB_FLUSH_NOTHING); +} + +static int +hma_svm_cpu_activate(xc_arg_t arg1 __unused, xc_arg_t arg2 __unused, + xc_arg_t arg3 __unused) +{ + const processorid_t id = CPU->cpu_seqid; + const uintptr_t hsave_pa = hma_svm_hsave_pa[id]; + uint64_t efer; + + VERIFY(hsave_pa != 0); + + /* Enable SVM via EFER */ + efer = rdmsr(MSR_AMD_EFER); + efer |= AMD_EFER_SVME; + wrmsr(MSR_AMD_EFER, efer); + + /* Setup hsave area */ + wrmsr(MSR_AMD_VM_HSAVE_PA, hsave_pa); + + hma_cpu_status[id] = HCS_READY; + return (0); +} + +static int +hma_svm_cpu_setup(cpu_setup_t what, int id, void *arg __unused) +{ + ASSERT(MUTEX_HELD(&cpu_lock)); + ASSERT(id >= 0 && id < NCPU); + + switch (what) { + case CPU_CONFIG: + case CPU_ON: + case CPU_INIT: + break; + default: + /* + * Other events, such as CPU offlining, are of no interest. + * Letting the SVM state linger should not cause any harm. + * + * This logic assumes that any offlining activity is strictly + * administrative in nature and will not alter any existing + * configuration (such as EFER bits previously set). + */ + return (0); + } + + /* Perform initialization if it has not been previously attempted. */ + if (hma_cpu_status[id] != HCS_UNINITIALIZED) { + return ((hma_cpu_status[id] == HCS_READY) ? 0 : -1); + } + + /* Allocate the hsave page for this CPU */ + if (hma_svm_hsave_page[id] == NULL) { + caddr_t va; + pfn_t pfn; + + va = kmem_alloc(PAGESIZE, KM_SLEEP); + VERIFY0((uintptr_t)va & PAGEOFFSET); + hma_svm_hsave_page[id] = va; + + /* + * Cache the physical address of the hsave page rather than + * looking it up later when the potential blocking of + * hat_getpfnum would be less acceptable. + */ + pfn = hat_getpfnum(kas.a_hat, va); + hma_svm_hsave_pa[id] = (pfn << PAGESHIFT); + } else { + VERIFY(hma_svm_hsave_pa[id] != 0); + } + + kpreempt_disable(); + if (CPU->cpu_seqid == id) { + /* Perform svm setup directly if this CPU is the target */ + (void) hma_svm_cpu_activate(0, 0, 0); + kpreempt_enable(); + } else { + cpuset_t set; + + /* Use a cross-call if a remote CPU is the target */ + kpreempt_enable(); + cpuset_zero(&set); + cpuset_add(&set, id); + xc_call(0, 0, 0, CPUSET2BV(set), hma_svm_cpu_activate); + } + + return (hma_cpu_status[id] != HCS_READY); +} + +static int +hma_svm_init(void) +{ + uint64_t msr; + const char *msg = NULL; + struct cpuid_regs regs; + cpu_t *cp; + + if (!is_x86_feature(x86_featureset, X86FSET_SVM)) { + msg = "CPU does not support SVM"; + goto bail; + } + + msr = rdmsr(MSR_AMD_VM_CR); + if ((msr & AMD_VM_CR_SVMDIS) != 0) { + msg = "SVM disabled by BIOS"; + goto bail; + } + + regs.cp_eax = 0x8000000a; + (void) cpuid_insn(NULL, ®s); + const uint32_t nasid = regs.cp_ebx; + const uint32_t feat = regs.cp_edx; + + if (nasid == 0) { + msg = "Not enough ASIDs for guests"; + goto bail; + } + if ((feat & CPUID_AMD_EDX_NESTED_PAGING) == 0) { + msg = "CPU does not support nested paging"; + goto bail; + } + if ((feat & CPUID_AMD_EDX_NRIPS) == 0) { + msg = "CPU does not support NRIP save"; + goto bail; + } + + hma_svm_features = feat; + hma_svm_max_asid = nasid; + + mutex_enter(&cpu_lock); + /* Perform SVM configuration for already-online CPUs. 
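[editorial note] The allocation scheme used by hma_svm_asid_update() above is easiest to see in isolation: each host CPU tracks a (generation, most-recent-ASID) pair, a vCPU whose cached generation is stale receives the next ASID, and exhausting the ASID space advances the generation (skipping the 0 sentinel) so that every vCPU re-allocates on its next entry. A small user-space model of just that logic, assuming flush-by-ASID is unavailable so a full TLB flush is requested on every reallocation:

#include <stdio.h>
#include <stdint.h>

#define	MAX_ASID	8	/* deliberately tiny, to show wrap-around */

struct asid { uint64_t gen; uint32_t asid; };

static int
asid_update(struct asid *vcpu, struct asid *cpu)
{
	if (vcpu->gen == cpu->gen)
		return (0);		/* VMCB_FLUSH_NOTHING */

	if (++cpu->asid >= MAX_ASID) {
		cpu->asid = 1;		/* ASID 0 stays reserved */
		if (++cpu->gen == 0)
			cpu->gen = 1;	/* skip the 0 sentinel */
	}
	vcpu->gen = cpu->gen;
	vcpu->asid = cpu->asid;
	return (1);			/* VMCB_FLUSH_ALL */
}

int
main(void)
{
	struct asid cpu = { .gen = 1, .asid = 0 };
	struct asid vcpus[10] = { 0 };	/* gen 0 forces first allocation */

	for (int i = 0; i < 10; i++) {
		int flush = asid_update(&vcpus[i], &cpu);
		printf("vcpu %d: gen %llu asid %u flush %d\n", i,
		    (unsigned long long)vcpus[i].gen, vcpus[i].asid, flush);
	}
	return (0);
}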
*/ + cp = cpu_active; + do { + int err = hma_svm_cpu_setup(CPU_ON, cp->cpu_seqid, NULL); + if (err != 0) { + msg = "failure during SVM setup"; + mutex_exit(&cpu_lock); + goto bail; + } + } while ((cp = cp->cpu_next_onln) != cpu_active); + + /* + * Register callback for later-onlined CPUs and perform other remaining + * resource allocation. + */ + register_cpu_setup_func(hma_svm_cpu_setup, NULL); + mutex_exit(&cpu_lock); + + /* Initialize per-CPU ASID state. */ + for (uint_t i = 0; i < NCPU; i++) { + /* + * Skip past sentinel 0 value for generation. Doing so for + * ASID is unneeded, since it will be incremented during the + * first allocation. + */ + hma_svm_cpu_asid[i].hsa_gen = 1; + hma_svm_cpu_asid[i].hsa_asid = 0; + } + + hma_svm_ready = B_TRUE; + return (0); + +bail: + hma_svm_error = msg; + cmn_err(CE_NOTE, "hma_svm_init: %s", msg); + return (-1); +} diff --git a/usr/src/uts/i86pc/os/ibft.c b/usr/src/uts/i86pc/os/ibft.c index d9ed882705..fab1324787 100644 --- a/usr/src/uts/i86pc/os/ibft.c +++ b/usr/src/uts/i86pc/os/ibft.c @@ -39,6 +39,7 @@ #include <sys/kmem.h> #include <sys/psm.h> #include <sys/bootconf.h> +#include <sys/reboot.h> typedef enum ibft_structure_type { Reserved = 0, @@ -206,6 +207,7 @@ static ibft_status_t iscsi_parse_ibft_NIC(iscsi_ibft_nic_t *nicp); static ibft_status_t iscsi_parse_ibft_target(char *begin_of_ibft, iscsi_ibft_tgt_t *tgtp); +extern int boothowto; /* * Return value: @@ -759,7 +761,9 @@ ld_ib_prop() * 1) pass "-B ibft-noprobe=1" on kernel command line * 2) add line "set ibft_noprobe=1" in /etc/system */ - cmn_err(CE_NOTE, IBFT_NOPROBE_MSG); + if (boothowto & RB_VERBOSE) { + cmn_err(CE_NOTE, IBFT_NOPROBE_MSG); + } return; } diff --git a/usr/src/uts/i86pc/os/lgrpplat.c b/usr/src/uts/i86pc/os/lgrpplat.c index 29cea5dcbb..7e5bc36295 100644 --- a/usr/src/uts/i86pc/os/lgrpplat.c +++ b/usr/src/uts/i86pc/os/lgrpplat.c @@ -2799,7 +2799,11 @@ lgrp_plat_process_sli(uint32_t domain_id, uchar_t *sli_info, /* * Read ACPI System Resource Affinity Table (SRAT) to determine which CPUs * and memory are local to each other in the same NUMA node and return number - * of nodes + * of nodes. + * + * The SRAT table pointer is populated during bootup by + * build_firmware_properties() in fakebop.c. Several motherboard and BIOS + * manufacturers are guilty of not having a SRAT table. */ static int lgrp_plat_process_srat(ACPI_TABLE_SRAT *tp, ACPI_TABLE_MSCT *mp, @@ -2816,9 +2820,15 @@ lgrp_plat_process_srat(ACPI_TABLE_SRAT *tp, ACPI_TABLE_MSCT *mp, /* * Nothing to do when no SRAT or disabled */ - if (tp == NULL || !lgrp_plat_srat_enable) + if (!lgrp_plat_srat_enable) return (-1); + if (tp == NULL) { + cmn_err(CE_WARN, "Couldn't read ACPI SRAT table from BIOS. " + "lgrp support will be limited to one group.\n"); + return (-1); + } + /* * Try to get domain information from MSCT table. * ACPI4.0: OSPM will use information provided by the MSCT only diff --git a/usr/src/uts/i86pc/os/machdep.c b/usr/src/uts/i86pc/os/machdep.c index d2230dd0a0..45e4d41aad 100644 --- a/usr/src/uts/i86pc/os/machdep.c +++ b/usr/src/uts/i86pc/os/machdep.c @@ -190,6 +190,12 @@ extern void pm_cfb_rele(void); extern fastboot_info_t newkernel; /* + * Instructions to enable or disable SMAP, respectively. + */ +static const uint8_t clac_instr[3] = { 0x0f, 0x01, 0xca }; +static const uint8_t stac_instr[3] = { 0x0f, 0x01, 0xcb }; + +/* * Machine dependent code to reboot. * "mdep" is interpreted as a character pointer; if non-null, it is a pointer * to a string to be used as the argument string when rebooting. 
@@ -1455,3 +1461,45 @@ plat_dr_disable_capability(uint64_t features) { atomic_and_64(&plat_dr_options, ~features); } + +/* + * If SMAP is supported, look through hi_calls and inline + * calls to smap_enable() to clac and smap_disable() to stac. + */ +void +hotinline_smap(hotinline_desc_t *hid) +{ + if (is_x86_feature(x86_featureset, X86FSET_SMAP) == B_FALSE) + return; + +/* + * We should never hit this since SMAP feature detection is behind + * an AMD64 header guard. + */ +#if defined(__i386) + panic("illumos only suppports SMAP on the AMD64 architecture."); +#endif + + if (strcmp(hid->hid_symname, "smap_enable") == 0) { + bcopy(clac_instr, (void *)hid->hid_instr_offset, + sizeof (clac_instr)); + } else if (strcmp(hid->hid_symname, "smap_disable") == 0) { + bcopy(stac_instr, (void *)hid->hid_instr_offset, + sizeof (stac_instr)); + } +} + +/* + * Loop through hi_calls and hand off the inlining to + * the appropriate calls. + */ +void +do_hotinlines(struct module *mp) +{ + for (hotinline_desc_t *hid = mp->hi_calls; hid != NULL; + hid = hid->hid_next) { +#if !defined(__xpv) + hotinline_smap(hid); +#endif /* __xpv */ + } +} diff --git a/usr/src/uts/i86pc/os/mp_startup.c b/usr/src/uts/i86pc/os/mp_startup.c index 070c69d472..d6a8fc04cc 100644 --- a/usr/src/uts/i86pc/os/mp_startup.c +++ b/usr/src/uts/i86pc/os/mp_startup.c @@ -77,6 +77,8 @@ #include <sys/sysmacros.h> #if defined(__xpv) #include <sys/hypervisor.h> +#else +#include <sys/hma.h> #endif #include <sys/cpu_module.h> #include <sys/ontrap.h> @@ -1615,6 +1617,14 @@ done: workaround_errata_end(); cmi_post_mpstartup(); +#if !defined(__xpv) + /* + * Once other CPUs have completed startup procedures, perform + * initialization of hypervisor resources for HMA. + */ + hma_init(); +#endif + if (use_mp && ncpus != boot_max_ncpus) { cmn_err(CE_NOTE, "System detected %d cpus, but " @@ -2094,6 +2104,8 @@ cpu_sep_enable(void) ASSERT(curthread->t_preempt || getpil() >= LOCK_LEVEL); wrmsr(MSR_INTC_SEP_CS, (uint64_t)(uintptr_t)KCS_SEL); + + CPU->cpu_m.mcpu_fast_syscall_state |= FSS_SEP_ENABLED; } static void @@ -2107,6 +2119,8 @@ cpu_sep_disable(void) * the sysenter or sysexit instruction to trigger a #gp fault. */ wrmsr(MSR_INTC_SEP_CS, 0); + + CPU->cpu_m.mcpu_fast_syscall_state &= ~FSS_SEP_ENABLED; } static void @@ -2117,6 +2131,8 @@ cpu_asysc_enable(void) wrmsr(MSR_AMD_EFER, rdmsr(MSR_AMD_EFER) | (uint64_t)(uintptr_t)AMD_EFER_SCE); + + CPU->cpu_m.mcpu_fast_syscall_state |= FSS_ASYSC_ENABLED; } static void @@ -2131,4 +2147,6 @@ cpu_asysc_disable(void) */ wrmsr(MSR_AMD_EFER, rdmsr(MSR_AMD_EFER) & ~((uint64_t)(uintptr_t)AMD_EFER_SCE)); + + CPU->cpu_m.mcpu_fast_syscall_state &= ~FSS_ASYSC_ENABLED; } diff --git a/usr/src/uts/i86pc/os/pci_cfgspace.c b/usr/src/uts/i86pc/os/pci_cfgspace.c index 6865c26f39..c6d6c02ced 100644 --- a/usr/src/uts/i86pc/os/pci_cfgspace.c +++ b/usr/src/uts/i86pc/os/pci_cfgspace.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019 Joyent, Inc. */ /* @@ -289,7 +290,14 @@ pci_check_bios(void) pci_bios_mech = (ax & 0x3); pci_bios_vers = regs.ebx.word.bx; - pci_bios_maxbus = (regs.ecx.word.cx & 0xff); + + /* + * Several BIOS implementations have known problems where they don't end + * up correctly telling us to scan all PCI buses in the system. In + * particular, many on-die CPU PCI devices are on a last bus that is + * sometimes not enumerated. As such, do not trust the BIOS. + */ + pci_bios_maxbus = pci_max_nbus; switch (pci_bios_mech) { default: /* ?!? 
*/ diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c index e98f049391..64193f62d2 100644 --- a/usr/src/uts/i86pc/os/startup.c +++ b/usr/src/uts/i86pc/os/startup.c @@ -319,22 +319,16 @@ static struct seg *segmap = &kmapseg; /* easier to use name for in here */ struct seg *segkp = &kpseg; /* Pageable kernel virtual memory segment */ -#if defined(__amd64) struct seg kvseg_core; /* Segment used for the core heap */ struct seg kpmseg; /* Segment used for physical mapping */ struct seg *segkpm = &kpmseg; /* 64bit kernel physical mapping segment */ -#else -struct seg *segkpm = NULL; /* Unused on IA32 */ -#endif caddr_t segkp_base; /* Base address of segkp */ caddr_t segzio_base; /* Base address of segzio */ -#if defined(__amd64) pgcnt_t segkpsize = btop(SEGKPDEFSIZE); /* size of segkp segment in pages */ -#else -pgcnt_t segkpsize = 0; -#endif -pgcnt_t segziosize = 0; /* size of zio segment in pages */ +caddr_t segkvmm_base; +pgcnt_t segkvmmsize; +pgcnt_t segziosize; /* * A static DR page_t VA map is reserved that can map the page structures @@ -455,23 +449,32 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t); * 0xFFFFFFFF.C0000000 |-----------------------|- core_base / ekernelheap * | Kernel | * | heap | + * | | + * | | * 0xFFFFFXXX.XXX00000 |-----------------------|- kernelheap (floating) * | segmap | * 0xFFFFFXXX.XXX00000 |-----------------------|- segmap_start (floating) * | device mappings | * 0xFFFFFXXX.XXX00000 |-----------------------|- toxic_addr (floating) - * | segzio | + * | segzio | * 0xFFFFFXXX.XXX00000 |-----------------------|- segzio_base (floating) - * | segkp | - * --- |-----------------------|- segkp_base (floating) + * | segkvmm | + * | | + * | | + * | | + * 0xFFFFFXXX.XXX00000 |-----------------------|- segkvmm_base (floating) + * | segkp | + * |-----------------------|- segkp_base (floating) * | page_t structures | valloc_base + valloc_sz * | memsegs, memlists, | * | page hash, etc. | - * 0xFFFFFF00.00000000 |-----------------------|- valloc_base (lower if >256GB) + * 0xFFFFFE00.00000000 |-----------------------|- valloc_base (lower if >256GB) * | segkpm | - * 0xFFFFFE00.00000000 |-----------------------| + * | | + * 0xFFFFFD00.00000000 |-----------------------|- SEGKPM_BASE (lower if >256GB) * | Red Zone | - * 0xFFFFFD80.00000000 |-----------------------|- KERNELBASE (lower if >256GB) + * 0xFFFFFC80.00000000 |-----------------------|- KERNELBASE (lower if >256GB) + * 0xFFFFFC7F.FFE00000 |-----------------------|- USERLIMIT (lower if >256GB) * | User stack |- User space memory * | | * | shared objects, etc | (grows downwards) @@ -697,6 +700,7 @@ startup_smap(void) uint32_t inst; uint8_t *instp; char sym[128]; + struct modctl *modp; extern int _smap_enable_patch_count; extern int _smap_disable_patch_count; @@ -730,8 +734,15 @@ startup_smap(void) hot_patch_kernel_text((caddr_t)instp, inst, 4); } - hot_patch_kernel_text((caddr_t)smap_enable, SMAP_CLAC_INSTR, 4); - hot_patch_kernel_text((caddr_t)smap_disable, SMAP_STAC_INSTR, 4); + /* + * Hotinline calls to smap_enable and smap_disable within + * unix module. Hotinlines in other modules are done on + * mod_load(). + */ + modp = mod_hold_by_name("unix"); + do_hotinlines(modp->mod_mp); + mod_release_mod(modp); + setcr4(getcr4() | CR4_SMAP); smap_enable(); } @@ -1076,22 +1087,9 @@ startup_memlist(void) PRM_DEBUG(memblocks); /* - * Compute maximum physical address for memory DR operations. - * Memory DR operations are unsupported on xpv or 32bit OSes. + * We no longer support any form of memory DR. 
*/ -#ifdef __amd64 - if (plat_dr_support_memory()) { - if (plat_dr_physmax == 0) { - uint_t pabits = UINT_MAX; - - cpuid_get_addrsize(CPU, &pabits, NULL); - plat_dr_physmax = btop(1ULL << pabits); - } - if (plat_dr_physmax > PHYSMEM_MAX64) - plat_dr_physmax = PHYSMEM_MAX64; - } else -#endif - plat_dr_physmax = 0; + plat_dr_physmax = 0; /* * Examine the bios reserved memory to find out: @@ -1252,68 +1250,55 @@ startup_memlist(void) pse_table_alloc_size = pse_table_size * sizeof (pad_mutex_t); ADD_TO_ALLOCATIONS(pse_mutex, pse_table_alloc_size); -#if defined(__amd64) valloc_sz = ROUND_UP_LPAGE(valloc_sz); valloc_base = VALLOC_BASE; /* - * The default values of VALLOC_BASE and SEGKPM_BASE should work - * for values of physmax up to 256GB (1/4 TB). They need adjusting when - * memory is at addresses above 256GB. When adjusted, segkpm_base must - * be aligned on KERNEL_REDZONE_SIZE boundary (span of top level pte). + * The signicant memory-sized regions are roughly sized as follows in + * the default layout with max physmem: + * segkpm: 1x physmem allocated (but 1Tb room, below VALLOC_BASE) + * segzio: 1.5x physmem + * segkvmm: 4x physmem + * heap: whatever's left up to COREHEAP_BASE, at least 1.5x physmem * - * In the general case (>256GB), we use (4 * physmem) for the - * kernel's virtual addresses, which is divided approximately - * as follows: - * - 1 * physmem for segkpm - * - 1.5 * physmem for segzio - * - 1.5 * physmem for heap - * Total: 4.0 * physmem + * The idea is that we leave enough room to avoid fragmentation issues, + * so we would like the VA arenas to have some extra. * - * Note that the segzio and heap sizes are more than physmem so that - * VA fragmentation does not prevent either of them from being - * able to use nearly all of physmem. The value of 1.5x is determined - * experimentally and may need to change if the workload changes. + * Ignoring the loose change of segkp, valloc, and such, this means that + * as COREHEAP_BASE-VALLOC_BASE=2Tb, we can accommodate a physmem up to + * about (2Tb / 7.0), rounded down to 256Gb in the check below. + * + * Note that KPM lives below VALLOC_BASE, but we want to include it in + * adjustments, hence the 8 below. + * + * Beyond 256Gb, we push segkpm_base (and hence kernelbase and + * _userlimit) down to accommodate the VA requirements above. */ - if (physmax + 1 > mmu_btop(TERABYTE / 4) || - plat_dr_physmax > mmu_btop(TERABYTE / 4)) { - uint64_t kpm_resv_amount = mmu_ptob(physmax + 1); + if (physmax + 1 > mmu_btop(TERABYTE / 4)) { + uint64_t physmem_bytes = mmu_ptob(physmax + 1); + uint64_t adjustment = 8 * (physmem_bytes - (TERABYTE / 4)); - if (kpm_resv_amount < mmu_ptob(plat_dr_physmax)) { - kpm_resv_amount = mmu_ptob(plat_dr_physmax); - } + PRM_DEBUG(adjustment); /* - * This is what actually controls the KVA : UVA split. - * The kernel uses high VA, and this is lowering the - * boundary, thus increasing the amount of VA for the kernel. - * This gives the kernel 4 * (amount of physical memory) VA. - * - * The maximum VA is UINT64_MAX and we are using - * 64-bit 2's complement math, so e.g. if you have 512GB - * of memory, segkpm_base = -(4 * 512GB) == -2TB == - * UINT64_MAX - 2TB (approximately). So the kernel's - * VA is [UINT64_MAX-2TB to UINT64_MAX]. + * segkpm_base is always aligned on a L3 PTE boundary. 
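[editorial note] The sizing rules above lend themselves to a quick back-of-the-envelope check outside the kernel. The sketch below reproduces only the base-address adjustment: the 256 GiB threshold and the factor of 8 come from the code here, while the SEGKPM_BASE value is read off the layout diagram earlier in this file and the red-zone granularity is an assumption of the sketch (the kernel additionally clamps segkpm_base to stay above the VA hole, which is omitted):

#include <stdio.h>
#include <stdint.h>

#define	TERABYTE	(1ULL << 40)
#define	GIGABYTE	(1ULL << 30)
#define	SEGKPM_BASE	0xfffffd0000000000ULL	/* from the layout diagram */
#define	REDZONE		(512ULL * GIGABYTE)	/* assumed redzone alignment */

#define	P2ROUNDUP(x, a)	(-(-(x) & -(a)))	/* power-of-2 align only */

int
main(void)
{
	/* Try a machine with 1 TiB of physical memory. */
	uint64_t physmem_bytes = 1 * TERABYTE;
	uint64_t segkpm_base = SEGKPM_BASE;

	if (physmem_bytes > TERABYTE / 4) {
		uint64_t adjustment = 8 * (physmem_bytes - TERABYTE / 4);

		segkpm_base -= P2ROUNDUP(adjustment, REDZONE);
	}

	uint64_t valloc_base = segkpm_base +
	    P2ROUNDUP(physmem_bytes, GIGABYTE);

	printf("segkpm_base %#llx\nvalloc_base %#llx\n",
	    (unsigned long long)segkpm_base,
	    (unsigned long long)valloc_base);
	return (0);
}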
*/ - segkpm_base = -(P2ROUNDUP((4 * kpm_resv_amount), - KERNEL_REDZONE_SIZE)); + segkpm_base -= P2ROUNDUP(adjustment, KERNEL_REDZONE_SIZE); - /* make sure we leave some space for user apps above hole */ + /* + * But make sure we leave some space for user apps above hole. + */ segkpm_base = MAX(segkpm_base, AMD64_VA_HOLE_END + TERABYTE); - if (segkpm_base > SEGKPM_BASE) - segkpm_base = SEGKPM_BASE; - PRM_DEBUG(segkpm_base); - valloc_base = segkpm_base + P2ROUNDUP(kpm_resv_amount, ONE_GIG); + ASSERT(segkpm_base <= SEGKPM_BASE); + + valloc_base = segkpm_base + P2ROUNDUP(physmem_bytes, ONE_GIG); if (valloc_base < segkpm_base) panic("not enough kernel VA to support memory size"); - PRM_DEBUG(valloc_base); } -#else /* __i386 */ - valloc_base = (uintptr_t)(MISC_VA_BASE - valloc_sz); - valloc_base = P2ALIGN(valloc_base, mmu.level_size[1]); + + PRM_DEBUG(segkpm_base); PRM_DEBUG(valloc_base); -#endif /* __i386 */ /* * do all the initial allocations @@ -1901,73 +1886,70 @@ protect_boot_range(uintptr_t low, uintptr_t high, int setaside) } /* - * + * Establish the final size of the kernel's heap, size of segmap, segkp, etc. */ static void layout_kernel_va(void) { - PRM_POINT("layout_kernel_va() starting..."); - /* - * Establish the final size of the kernel's heap, size of segmap, - * segkp, etc. - */ + const size_t physmem_size = mmu_ptob(physmem); + size_t size; -#if defined(__amd64) + PRM_POINT("layout_kernel_va() starting..."); kpm_vbase = (caddr_t)segkpm_base; - if (physmax + 1 < plat_dr_physmax) { - kpm_size = ROUND_UP_LPAGE(mmu_ptob(plat_dr_physmax)); - } else { - kpm_size = ROUND_UP_LPAGE(mmu_ptob(physmax + 1)); - } + kpm_size = ROUND_UP_LPAGE(mmu_ptob(physmax + 1)); if ((uintptr_t)kpm_vbase + kpm_size > (uintptr_t)valloc_base) panic("not enough room for kpm!"); PRM_DEBUG(kpm_size); PRM_DEBUG(kpm_vbase); - /* - * By default we create a seg_kp in 64 bit kernels, it's a little - * faster to access than embedding it in the heap. - */ segkp_base = (caddr_t)valloc_base + valloc_sz; if (!segkp_fromheap) { - size_t sz = mmu_ptob(segkpsize); + size = mmu_ptob(segkpsize); /* * determine size of segkp */ - if (sz < SEGKPMINSIZE || sz > SEGKPMAXSIZE) { - sz = SEGKPDEFSIZE; + if (size < SEGKPMINSIZE || size > SEGKPMAXSIZE) { + size = SEGKPDEFSIZE; cmn_err(CE_WARN, "!Illegal value for segkpsize. " "segkpsize has been reset to %ld pages", - mmu_btop(sz)); + mmu_btop(size)); } - sz = MIN(sz, MAX(SEGKPMINSIZE, mmu_ptob(physmem))); + size = MIN(size, MAX(SEGKPMINSIZE, physmem_size)); - segkpsize = mmu_btop(ROUND_UP_LPAGE(sz)); + segkpsize = mmu_btop(ROUND_UP_LPAGE(size)); } PRM_DEBUG(segkp_base); PRM_DEBUG(segkpsize); /* - * segzio is used for ZFS cached data. It uses a distinct VA - * segment (from kernel heap) so that we can easily tell not to - * include it in kernel crash dumps on 64 bit kernels. The trick is - * to give it lots of VA, but not constrain the kernel heap. - * We can use 1.5x physmem for segzio, leaving approximately - * another 1.5x physmem for heap. See also the comment in - * startup_memlist(). + * segkvmm: backing for vmm guest memory. Like segzio, we have a + * separate segment for two reasons: it makes it easy to skip our pages + * on kernel crash dumps, and it helps avoid fragmentation. With this + * segment, we're expecting significantly-sized allocations only; we'll + * default to 4x the size of physmem. */ - segzio_base = segkp_base + mmu_ptob(segkpsize); + segkvmm_base = segkp_base + mmu_ptob(segkpsize); + size = segkvmmsize != 0 ? 
mmu_ptob(segkvmmsize) : (physmem_size * 4); + + size = MAX(size, SEGVMMMINSIZE); + segkvmmsize = mmu_btop(ROUND_UP_LPAGE(size)); + + PRM_DEBUG(segkvmmsize); + PRM_DEBUG(segkvmm_base); + + /* + * segzio is used for ZFS cached data. For segzio, we use 1.5x physmem. + */ + segzio_base = segkvmm_base + mmu_ptob(segkvmmsize); if (segzio_fromheap) { segziosize = 0; } else { - size_t physmem_size = mmu_ptob(physmem); - size_t size = (segziosize == 0) ? - physmem_size * 3 / 2 : mmu_ptob(segziosize); + size = (segziosize != 0) ? mmu_ptob(segziosize) : + (physmem_size * 3) / 2; - if (size < SEGZIOMINSIZE) - size = SEGZIOMINSIZE; + size = MAX(size, SEGZIOMINSIZE); segziosize = mmu_btop(ROUND_UP_LPAGE(size)); } PRM_DEBUG(segziosize); @@ -1981,10 +1963,6 @@ layout_kernel_va(void) ROUND_UP_LPAGE((uintptr_t)segzio_base + mmu_ptob(segziosize)); PRM_DEBUG(toxic_addr); segmap_start = ROUND_UP_LPAGE(toxic_addr + toxic_size); -#else /* __i386 */ - segmap_start = ROUND_UP_LPAGE(kernelbase); -#endif /* __i386 */ - PRM_DEBUG(segmap_start); /* * Users can change segmapsize through eeprom. If the variable @@ -1993,16 +1971,6 @@ layout_kernel_va(void) */ segmapsize = MAX(ROUND_UP_LPAGE(segmapsize), SEGMAPDEFAULT); -#if defined(__i386) - /* - * 32-bit systems don't have segkpm or segkp, so segmap appears at - * the bottom of the kernel's address range. Set aside space for a - * small red zone just below the start of segmap. - */ - segmap_start += KERNEL_REDZONE_SIZE; - segmapsize -= KERNEL_REDZONE_SIZE; -#endif - PRM_DEBUG(segmap_start); PRM_DEBUG(segmapsize); kernelheap = (caddr_t)ROUND_UP_LPAGE(segmap_start + segmapsize); @@ -2603,6 +2571,7 @@ add_physmem_cb(page_t *pp, pfn_t pnum) pp->p_mapping = NULL; pp->p_embed = 0; pp->p_share = 0; + pp->p_zoneid = ALL_ZONES; pp->p_mlentry = 0; } @@ -2790,11 +2759,16 @@ kvm_init(void) (void) segkmem_create(&kvseg_core); } + PRM_POINT("attaching segkvmm"); + (void) seg_attach(&kas, segkvmm_base, mmu_ptob(segkvmmsize), &kvmmseg); + (void) segkmem_create(&kvmmseg); + segkmem_kvmm_init(segkvmm_base, mmu_ptob(segkvmmsize)); + if (segziosize > 0) { PRM_POINT("attaching segzio"); (void) seg_attach(&kas, segzio_base, mmu_ptob(segziosize), &kzioseg); - (void) segkmem_zio_create(&kzioseg); + (void) segkmem_create(&kzioseg); /* create zio area covering new segment */ segkmem_zio_init(segzio_base, mmu_ptob(segziosize)); diff --git a/usr/src/uts/i86pc/os/timestamp.c b/usr/src/uts/i86pc/os/timestamp.c index 7344e1a492..bce08717dc 100644 --- a/usr/src/uts/i86pc/os/timestamp.c +++ b/usr/src/uts/i86pc/os/timestamp.c @@ -25,7 +25,7 @@ * * Copyright 2012 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, 2016 by Delphix. All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #include <sys/types.h> @@ -280,10 +280,64 @@ tsc_gethrtime_tick_delta(void) return (hrt); } +/* Calculate the hrtime while exposing the parameters of that calculation. 
*/ +hrtime_t +tsc_gethrtime_params(uint64_t *tscp, uint32_t *scalep, uint8_t *shiftp) +{ + uint32_t old_hres_lock, scale; + hrtime_t tsc, last, base; + + do { + old_hres_lock = hres_lock; + + if (gethrtimef == tsc_gethrtime_delta) { + ulong_t flags; + + flags = clear_int_flag(); + tsc = tsc_read() + tsc_sync_tick_delta[CPU->cpu_id]; + restore_int_flag(flags); + } else { + tsc = tsc_read(); + } + + last = tsc_last; + base = tsc_hrtime_base; + scale = nsec_scale; + + } while ((old_hres_lock & ~1) != hres_lock); + + /* See comments in tsc_gethrtime() above */ + if (tsc >= last) { + tsc -= last; + } else if (tsc >= last - 2 * tsc_max_delta) { + tsc = 0; + } else { + tsc = tsc_protect(tsc); + } + + TSC_CONVERT_AND_ADD(tsc, base, nsec_scale); + + if (tscp != NULL) { + /* + * Do not simply communicate the delta applied to the hrtime + * base, but rather the effective TSC measurement. + */ + *tscp = tsc + last; + } + if (scalep != NULL) { + *scalep = scale; + } + if (shiftp != NULL) { + *shiftp = NSEC_SHIFT; + } + + return (base); +} + /* - * This is similar to the above, but it cannot actually spin on hres_lock. - * As a result, it caches all of the variables it needs; if the variables - * don't change, it's done. + * This is similar to tsc_gethrtime_delta, but it cannot actually spin on + * hres_lock. As a result, it caches all of the variables it needs; if the + * variables don't change, it's done. */ hrtime_t dtrace_gethrtime(void) diff --git a/usr/src/uts/i86pc/os/trap.c b/usr/src/uts/i86pc/os/trap.c index f30bedd3f2..c97255fae7 100644 --- a/usr/src/uts/i86pc/os/trap.c +++ b/usr/src/uts/i86pc/os/trap.c @@ -101,6 +101,7 @@ #include <sys/hypervisor.h> #endif #include <sys/contract/process_impl.h> +#include <sys/brand.h> #define USER 0x10000 /* user-mode flag added to trap type */ @@ -861,6 +862,17 @@ trap(struct regs *rp, caddr_t addr, processorid_t cpuid) fault_type = F_INVAL; } + /* + * Allow the brand to interpose on invalid memory accesses + * prior to running the native pagefault handler. If this + * brand hook returns zero, it was able to handle the fault + * completely. Otherwise, drive on and call pagefault(). + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_pagefault != NULL && + BROP(p)->b_pagefault(p, lwp, addr, fault_type, rw) == 0) { + goto out; + } + res = pagefault(addr, fault_type, rw, 0); /* diff --git a/usr/src/uts/i86pc/ppt/Makefile b/usr/src/uts/i86pc/ppt/Makefile new file mode 100644 index 0000000000..f231dfddf6 --- /dev/null +++ b/usr/src/uts/i86pc/ppt/Makefile @@ -0,0 +1,86 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# Copyright 2019 Joyent, Inc. +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = ppt +OBJECTS = $(PPT_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(PPT_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/i86pc/io/vmm/io +MAPFILE = $(UTSBASE)/i86pc/io/vmm/io/ppt.mapfile + +# +# Include common rules. 
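[editorial note] A consumer of the tsc_gethrtime_params() interface added above (bhyve's time support is the intended one) receives a consistent snapshot: the hrtime returned, the effective TSC reading it corresponds to, and the scale/shift pair. The sketch below shows one way such a snapshot could be extrapolated forward; it assumes the usual nsec_scale fixed-point convention in this file, where a TSC delta converts roughly as (delta * scale) >> (32 - shift). Treat the arithmetic as an assumption and defer to TSC_CONVERT_AND_ADD for the authoritative form:

/*
 * Hedged sketch: turn a (base hrtime, TSC, scale, shift) snapshot plus a
 * later TSC reading into an hrtime estimate, splitting the 64x32 multiply
 * into high and low halves to limit 64-bit overflow.
 */
static hrtime_t
snapshot_to_hrtime(hrtime_t base, uint64_t snap_tsc, uint64_t cur_tsc,
    uint32_t scale, uint8_t shift)
{
	const uint64_t delta = cur_tsc - snap_tsc;
	const uint64_t hi = delta >> 32;
	const uint64_t lo = delta & 0xffffffffULL;

	return (base + ((hi * scale) << shift) +
	    ((lo * scale) >> (32 - shift)));
}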
+# +include $(UTSBASE)/i86pc/Makefile.i86pc + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +# +# Overrides and additions +# +ALL_BUILDS = $(ALL_BUILDSONLY64) +DEF_BUILDS = $(DEF_BUILDSONLY64) +PRE_INC_PATH = -I$(COMPAT)/freebsd -I$(COMPAT)/freebsd/amd64 \ + -I$(CONTRIB)/freebsd -I$(CONTRIB)/freebsd/amd64 +INC_PATH += -I$(UTSBASE)/i86pc/io/vmm -I$(UTSBASE)/i86pc/io/vmm/io +AS_INC_PATH += -I$(UTSBASE)/i86pc/io/vmm -I$(OBJS_DIR) + +LDFLAGS += -dy -N drv/vmm -N misc/pcie +LDFLAGS += -M $(MAPFILE) + +$(OBJS_DIR)/ppt.o := CERRWARN += -_gcc=-Wno-unused-variable + +# needs work +SMOFF += all_func_returns + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/i86pc/Makefile.targ diff --git a/usr/src/uts/i86pc/sys/Makefile b/usr/src/uts/i86pc/sys/Makefile index 292cd04c2b..3d8332a930 100644 --- a/usr/src/uts/i86pc/sys/Makefile +++ b/usr/src/uts/i86pc/sys/Makefile @@ -37,7 +37,7 @@ include ../Makefile.i86pc # FILEMODE = 644 -HDRS= \ +CHKHDRS= \ acpidev.h \ amd_iommu.h \ asm_misc.h \ @@ -46,6 +46,7 @@ HDRS= \ ddi_subrdefs.h \ debug_info.h \ fastboot.h \ + hma.h \ mach_mmu.h \ machclock.h \ machcpuvar.h \ @@ -68,6 +69,16 @@ HDRS= \ xc_levels.h \ xsvc.h +NOCHKHDRS= \ + vmm.h \ + vmm_dev.h \ + vmm_impl.h \ + vmm_instruction_emul.h + +HDRS= \ + $(CHKHDRS) \ + $(NOCHKHDRS) + ROOTHDRS= $(HDRS:%=$(USR_PSM_ISYS_DIR)/%) ROOTDIR= $(ROOT)/usr/share/src @@ -76,7 +87,7 @@ ROOTDIRS= $(ROOTDIR)/uts $(ROOTDIR)/uts/$(PLATFORM) ROOTLINK= $(ROOTDIR)/uts/$(PLATFORM)/sys LINKDEST= ../../../../platform/$(PLATFORM)/include/sys -CHECKHDRS= $(HDRS:%.h=%.check) +CHECKHDRS= $(CHKHDRS:%.h=%.check) .KEEP_STATE: diff --git a/usr/src/uts/i86pc/sys/apic.h b/usr/src/uts/i86pc/sys/apic.h index 26626ec5a4..f2528a632f 100644 --- a/usr/src/uts/i86pc/sys/apic.h +++ b/usr/src/uts/i86pc/sys/apic.h @@ -386,7 +386,7 @@ struct apic_io_intr { /* special or reserve vectors */ #define APIC_CHECK_RESERVE_VECTORS(v) \ (((v) == T_FASTTRAP) || ((v) == APIC_SPUR_INTR) || \ - ((v) == T_SYSCALLINT) || ((v) == T_DTRACE_RET)) + ((v) == T_SYSCALLINT) || ((v) == T_DTRACE_RET) || ((v) == 0x80)) /* cmos shutdown code for BIOS */ #define BIOS_SHUTDOWN 0x0a diff --git a/usr/src/uts/i86pc/sys/comm_page.h b/usr/src/uts/i86pc/sys/comm_page.h index 520ad9001d..ea19c856a8 100644 --- a/usr/src/uts/i86pc/sys/comm_page.h +++ b/usr/src/uts/i86pc/sys/comm_page.h @@ -27,6 +27,7 @@ extern "C" { #endif #define COMM_PAGE_SIZE PAGESIZE +#define COMM_PAGE_ALIGN 0x4000 #ifndef _ASM diff --git a/usr/src/uts/i86pc/sys/gipt.h b/usr/src/uts/i86pc/sys/gipt.h new file mode 100644 index 0000000000..4d7d523726 --- /dev/null +++ b/usr/src/uts/i86pc/sys/gipt.h @@ -0,0 +1,92 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. 
+ */ + +#ifndef _GIPT_H_ +#define _GIPT_H_ + +#include <sys/types.h> +#include <sys/mutex.h> +#include <sys/param.h> +#include <sys/list.h> + +struct gipt { + list_node_t gipt_node; + uint64_t gipt_vaddr; + uint64_t gipt_pfn; + uint16_t gipt_level; + uint16_t gipt_valid_cnt; + uint32_t _gipt_pad; + struct gipt *gipt_parent; + uint64_t *gipt_kva; + uint64_t _gipt_pad2; +}; +typedef struct gipt gipt_t; + +typedef enum { + PTET_EMPTY = 0, + PTET_PAGE = 1, + PTET_LINK = 2, +} gipt_pte_type_t; + +/* Given a PTE and its level, determine the type of that PTE */ +typedef gipt_pte_type_t (*gipt_pte_type_cb_t)(uint64_t, uint_t); +/* Given the PFN of a child table, emit a PTE that references it */ +typedef uint64_t (*gipt_pte_map_cb_t)(uint64_t); + +struct gipt_cbs { + gipt_pte_type_cb_t giptc_pte_type; + gipt_pte_map_cb_t giptc_pte_map; +}; + +struct gipt_map { + kmutex_t giptm_lock; + gipt_t *giptm_root; + list_t *giptm_hash; + struct gipt_cbs giptm_cbs; + size_t giptm_table_cnt; + uint_t giptm_levels; +}; +typedef struct gipt_map gipt_map_t; + +#define GIPT_HASH_SIZE_DEFAULT 0x2000 +#define GIPT_MAX_LEVELS 4 + +#define GIPT_VA2IDX(pt, va) \ + (((va) - (pt)->gipt_vaddr) >> \ + gipt_level_shift[(pt)->gipt_level]) + +#define GIPT_VA2PTE(pt, va) ((pt)->gipt_kva[GIPT_VA2IDX(pt, va)]) +#define GIPT_VA2PTEP(pt, va) (&(pt)->gipt_kva[GIPT_VA2IDX(pt, va)]) + +extern const uint_t gipt_level_shift[GIPT_MAX_LEVELS+1]; +extern const uint64_t gipt_level_mask[GIPT_MAX_LEVELS+1]; +extern const uint64_t gipt_level_size[GIPT_MAX_LEVELS+1]; +extern const uint64_t gipt_level_count[GIPT_MAX_LEVELS+1]; + +extern gipt_t *gipt_alloc(void); +extern void gipt_free(gipt_t *); +extern void gipt_map_init(gipt_map_t *, uint_t, uint_t, + const struct gipt_cbs *, gipt_t *); +extern void gipt_map_fini(gipt_map_t *); +extern gipt_t *gipt_map_lookup(gipt_map_t *, uint64_t, uint_t); +extern gipt_t *gipt_map_lookup_deepest(gipt_map_t *, uint64_t); +extern uint64_t gipt_map_next_page(gipt_map_t *, uint64_t, uint64_t, + gipt_t **); +extern void gipt_map_insert(gipt_map_t *, gipt_t *); +extern void gipt_map_remove(gipt_map_t *, gipt_t *); +extern gipt_t *gipt_map_create_parents(gipt_map_t *, uint64_t, uint_t); +extern void gipt_map_clean_parents(gipt_map_t *, gipt_t *); + +#endif /* _GIPT_H_ */ diff --git a/usr/src/uts/i86pc/sys/hma.h b/usr/src/uts/i86pc/sys/hma.h index 00009cf439..16ab708896 100644 --- a/usr/src/uts/i86pc/sys/hma.h +++ b/usr/src/uts/i86pc/sys/hma.h @@ -10,7 +10,7 @@ */ /* - * Copyright (c) 2018, Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #ifndef _SYS_HMA_H @@ -30,6 +30,40 @@ extern "C" { #endif + +/* + * Register a hypervisor with HMA. On success, a pointer to the opaque + * registration token will be returned, indicating that proper host setup has + * occurred for further hypervisor actions. + */ +typedef struct hma_reg hma_reg_t; +extern hma_reg_t *hma_register(const char *); +extern hma_reg_t *hma_register_exclusive(const char *); +extern void hma_unregister(hma_reg_t *); + +/* + * Allocate or free a VPID for use with VMX. + * + * This must not be performed by a hypervisor until it has successfully + * registered via hma_register(). + */ +extern uint16_t hma_vmx_vpid_alloc(void); +extern void hma_vmx_vpid_free(uint16_t); + +/* + * On all active CPUs, perform a single-context INVEPT on the given EPTP. 
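The registration and VPID interfaces above compose in a fixed order: register first, then allocate per-guest VPIDs, and release both on teardown. A minimal sketch, assuming ENXIO as the failure errno and that an exhausted allocator hands back VPID 0:

    static hma_reg_t *sample_reg;
    static uint16_t sample_vpid;

    static int
    sample_hyp_attach(void)
    {
        if ((sample_reg = hma_register("sample_hyp")) == NULL)
            return (ENXIO);
        sample_vpid = hma_vmx_vpid_alloc();     /* assumed: 0 means none free */
        return (0);
    }

    static void
    sample_hyp_detach(void)
    {
        if (sample_vpid != 0)
            hma_vmx_vpid_free(sample_vpid);
        hma_unregister(sample_reg);
    }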
+ */ +extern void hma_vmx_invept_allcpus(uintptr_t); + +struct hma_svm_asid { + uint64_t hsa_gen; + uint32_t hsa_asid; +}; +typedef struct hma_svm_asid hma_svm_asid_t; + +extern void hma_svm_asid_init(hma_svm_asid_t *); +extern uint8_t hma_svm_asid_update(hma_svm_asid_t *, boolean_t, boolean_t); + /* * FPU related management. These functions provide a set of APIs to manage the * FPU state and switch between host and guest management of this state. @@ -96,6 +130,9 @@ extern void hma_fpu_stop_guest(hma_fpu_t *); extern void hma_fpu_get_fxsave_state(const hma_fpu_t *, struct fxsave_state *); extern int hma_fpu_set_fxsave_state(hma_fpu_t *, const struct fxsave_state *); +/* Perform HMA initialization steps during boot-up. */ +extern void hma_init(void); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/i86pc/sys/machcpuvar.h b/usr/src/uts/i86pc/sys/machcpuvar.h index f4e38dec98..772f3112cb 100644 --- a/usr/src/uts/i86pc/sys/machcpuvar.h +++ b/usr/src/uts/i86pc/sys/machcpuvar.h @@ -81,6 +81,12 @@ struct xen_evt_data { ulong_t evt_affinity[sizeof (ulong_t) * 8]; /* service on cpu */ }; +enum fast_syscall_state { + FSS_DISABLED = 0, + FSS_ASYSC_ENABLED = (1 << 0), + FSS_SEP_ENABLED = (1 << 1) +}; + struct kpti_frame { uint64_t kf_lower_redzone; @@ -214,6 +220,8 @@ struct machcpu { uint16_t mcpu_idle_type; /* CPU next idle type */ uint16_t max_cstates; /* supported max cstates */ + enum fast_syscall_state mcpu_fast_syscall_state; + struct cpu_ucode_info *mcpu_ucode_info; void *mcpu_pm_mach_state; diff --git a/usr/src/uts/i86pc/sys/machparam.h b/usr/src/uts/i86pc/sys/machparam.h index 51d7559483..f79b582df4 100644 --- a/usr/src/uts/i86pc/sys/machparam.h +++ b/usr/src/uts/i86pc/sys/machparam.h @@ -31,14 +31,15 @@ #ifndef _SYS_MACHPARAM_H #define _SYS_MACHPARAM_H -#if !defined(_ASM) +#ifndef _ASM + #include <sys/types.h> #if defined(__xpv) #include <sys/xpv_impl.h> #endif -#endif +#endif /* !_ASM */ #ifdef __cplusplus extern "C" { @@ -54,17 +55,12 @@ extern "C" { * Machine dependent parameters and limits. */ -#if defined(__amd64) /* * If NCPU grows beyond 256, sizing for the x86 comm page will require * adjustment. */ #define NCPU 256 #define NCPU_LOG2 8 -#elif defined(__i386) -#define NCPU 32 -#define NCPU_LOG2 5 -#endif /* NCPU_P2 is NCPU rounded to a power of 2 */ #define NCPU_P2 (1 << NCPU_LOG2) @@ -116,11 +112,7 @@ extern "C" { /* * DEFAULT KERNEL THREAD stack size (in pages). */ -#if defined(__amd64) #define DEFAULTSTKSZ_NPGS 5 -#elif defined(__i386) -#define DEFAULTSTKSZ_NPGS 3 -#endif #if !defined(_ASM) #define DEFAULTSTKSZ (DEFAULTSTKSZ_NPGS * PAGESIZE) @@ -129,43 +121,42 @@ extern "C" { #endif /* !_ASM */ /* - * KERNELBASE is the virtual address at which the kernel segments start in - * all contexts. - * - * KERNELBASE is not fixed. The value of KERNELBASE can change with - * installed memory or on 32 bit systems the eprom variable 'eprom_kernelbase'. - * - * common/conf/param.c requires a compile time defined value for KERNELBASE. - * This value is save in the variable _kernelbase. _kernelbase may then be - * modified with to a different value in i86pc/os/startup.c. - * - * Most code should be using kernelbase, which resolves to a reference to - * _kernelbase. + * During intial boot we limit heap to the top 4Gig. 
*/ -#define KERNEL_TEXT_amd64 UINT64_C(0xfffffffffb800000) - -#ifdef __i386 - -#define KERNEL_TEXT_i386 ADDRESS_C(0xfe800000) +#define BOOT_KERNELHEAP_BASE ADDRESS_C(0xffffffff00000000) /* - * We don't use HYPERVISOR_VIRT_START, as we need both the PAE and non-PAE - * versions in our code. We always compile based on the lower PAE address. + * VMWare works best if we don't use the top 64Meg of memory for amd64. + * Set KERNEL_TEXT to top_o_memory - 64Meg - 8 Meg for 8Meg of nucleus pages. */ -#define KERNEL_TEXT_i386_xpv \ - (HYPERVISOR_VIRT_START_PAE - 3 * ADDRESS_C(0x400000)) - -#endif /* __i386 */ +#define PROMSTART ADDRESS_C(0xffc00000) -#if defined(__amd64) +/* + * Virtual address range available to the debugger + */ +#define SEGDEBUGBASE ADDRESS_C(0xffffffffff800000) +#define SEGDEBUGSIZE ADDRESS_C(0x400000) -#define KERNELBASE ADDRESS_C(0xfffffd8000000000) +#define KERNEL_TEXT UINT64_C(0xfffffffffb800000) /* - * Size of the unmapped "red zone" at the very bottom of the kernel's - * address space. Corresponds to 1 slot in the toplevel pagetable. + * Reserve pages just below KERNEL_TEXT for the GDT, IDT, LDT, TSS and debug + * info. + * + * For now, DEBUG_INFO_VA must be first in this list for "xm" initiated dumps + * of solaris domUs to be usable with mdb. Relying on a fixed VA is not viable + * long term, but it's the best we've got for now. */ -#define KERNEL_REDZONE_SIZE ((uintptr_t)1 << 39) +#if !defined(_ASM) +#define DEBUG_INFO_VA (KERNEL_TEXT - MMU_PAGESIZE) +#define GDT_VA (DEBUG_INFO_VA - MMU_PAGESIZE) +#define IDT_VA (GDT_VA - MMU_PAGESIZE) +#define LDT_VA (IDT_VA - (16 * MMU_PAGESIZE)) +#define KTSS_VA (LDT_VA - MMU_PAGESIZE) +#define DFTSS_VA (KTSS_VA - MMU_PAGESIZE) +#define MISC_VA_BASE (DFTSS_VA) +#define MISC_VA_SIZE (KERNEL_TEXT - MISC_VA_BASE) +#endif /* !_ASM */ /* * Base of 'core' heap area, which is used for kernel and module text/data @@ -174,52 +165,47 @@ extern "C" { #define COREHEAP_BASE ADDRESS_C(0xffffffffc0000000) /* - * Beginning of the segkpm window. A lower value than this is used if - * physical addresses exceed 1TB. See i86pc/os/startup.c - */ -#define SEGKPM_BASE ADDRESS_C(0xfffffe0000000000) - -/* * This is valloc_base, above seg_kpm, but below everything else. * A lower value than this may be used if SEGKPM_BASE is adjusted. * See i86pc/os/startup.c */ -#define VALLOC_BASE ADDRESS_C(0xffffff0000000000) +#define VALLOC_BASE ADDRESS_C(0xfffffe0000000000) + +#define SEGZIOMINSIZE (400L * 1024 * 1024L) /* 400M */ +#define SEGVMMMINSIZE (4096L * 1024 * 1024L) /* 4G */ -/* - * default and boundary sizes for segkp - */ #define SEGKPDEFSIZE (2L * 1024L * 1024L * 1024L) /* 2G */ #define SEGKPMAXSIZE (8L * 1024L * 1024L * 1024L) /* 8G */ #define SEGKPMINSIZE (200L * 1024 * 1024L) /* 200M */ -/* - * minimum size for segzio - */ -#define SEGZIOMINSIZE (400L * 1024 * 1024L) /* 400M */ - -/* - * During intial boot we limit heap to the top 4Gig. - */ -#define BOOT_KERNELHEAP_BASE ADDRESS_C(0xffffffff00000000) +#define SEGKPM_BASE ADDRESS_C(0xfffffd0000000000) /* - * VMWare works best if we don't use the top 64Meg of memory for amd64. - * Set KERNEL_TEXT to top_o_memory - 64Meg - 8 Meg for 8Meg of nucleus pages. + * KERNELBASE is the virtual address at which the kernel segments start in + * all contexts. + * + * KERNELBASE is not fixed. The value of KERNELBASE can change with + * installed memory size. + * + * common/conf/param.c requires a compile time defined value for KERNELBASE. + * This value is save in the variable _kernelbase. 
_kernelbase may then be + * modified with to a different value in i86pc/os/startup.c. + * + * Most code should be using kernelbase, which resolves to a reference to + * _kernelbase. */ -#define PROMSTART ADDRESS_C(0xffc00000) -#define KERNEL_TEXT KERNEL_TEXT_amd64 +#define KERNELBASE ADDRESS_C(0xfffffc8000000000) /* - * Virtual address range available to the debugger + * Size of the unmapped "red zone" at the very bottom of the kernel's + * address space. Corresponds to 1 slot in the toplevel pagetable. */ -#define SEGDEBUGBASE ADDRESS_C(0xffffffffff800000) -#define SEGDEBUGSIZE ADDRESS_C(0x400000) +#define KERNEL_REDZONE_SIZE ((uintptr_t)1 << 39) /* * Define upper limit on user address space * - * In amd64, the upper limit on a 64-bit user address space is 1 large page + * The upper limit on a 64-bit user address space is 1 large page * (2MB) below kernelbase. The upper limit for a 32-bit user address space * is 1 small page (4KB) below the top of the 32-bit range. The 64-bit * limit give dtrace the red zone it needs below kernelbase. The 32-bit @@ -232,7 +218,7 @@ extern "C" { #if defined(__xpv) #define USERLIMIT ADDRESS_C(0x00007fffffe00000) #else -#define USERLIMIT ADDRESS_C(0xfffffd7fffe00000) +#define USERLIMIT ADDRESS_C(0xfffffc7fffe00000) #endif #ifdef bug_5074717_is_fixed @@ -241,76 +227,6 @@ extern "C" { #define USERLIMIT32 ADDRESS_C(0xfefff000) #endif -#elif defined(__i386) - -#ifdef DEBUG -#define KERNELBASE ADDRESS_C(0xc8000000) -#else -#define KERNELBASE ADDRESS_C(0xd4000000) -#endif - -#define KERNELBASE_MAX ADDRESS_C(0xe0000000) - -/* - * The i386 ABI requires that the user address space be at least 3Gb - * in size. KERNELBASE_ABI_MIN is used as the default KERNELBASE for - * physical memory configurations > 4gb. - */ -#define KERNELBASE_ABI_MIN ADDRESS_C(0xc0000000) - -/* - * Size of the unmapped "red zone" at the very bottom of the kernel's - * address space. Since segmap start immediately above the red zone, this - * needs to be MAXBSIZE aligned. - */ -#define KERNEL_REDZONE_SIZE MAXBSIZE - -/* - * This is the last 4MB of the 4G address space. Some psm modules - * need this region of virtual address space mapped 1-1 - * The top 64MB of the address space is reserved for the hypervisor. - */ -#define PROMSTART ADDRESS_C(0xffc00000) -#ifdef __xpv -#define KERNEL_TEXT KERNEL_TEXT_i386_xpv -#else -#define KERNEL_TEXT KERNEL_TEXT_i386 -#endif - -/* - * Virtual address range available to the debugger - * We place it just above the kernel text (4M) and kernel data (4M). - */ -#define SEGDEBUGBASE (KERNEL_TEXT + ADDRESS_C(0x800000)) -#define SEGDEBUGSIZE ADDRESS_C(0x400000) - -/* - * Define upper limit on user address space - */ -#define USERLIMIT KERNELBASE -#define USERLIMIT32 USERLIMIT - -#endif /* __i386 */ - -/* - * Reserve pages just below KERNEL_TEXT for the GDT, IDT, LDT, TSS and debug - * info. - * - * For now, DEBUG_INFO_VA must be first in this list for "xm" initiated dumps - * of solaris domUs to be usable with mdb. Relying on a fixed VA is not viable - * long term, but it's the best we've got for now. 
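Taken together, the machparam.h constants above shift the whole 64-bit kernel layout downward to carve out room for the VMM arena. The relative ordering they imply can be sanity-checked with compile-time assertions; a sketch, not part of the change, using CTASSERT from <sys/debug.h>:

    CTASSERT(USERLIMIT < KERNELBASE);       /* user ceiling sits below the kernel */
    CTASSERT(KERNELBASE < SEGKPM_BASE);
    CTASSERT(SEGKPM_BASE < VALLOC_BASE);
    CTASSERT(VALLOC_BASE < BOOT_KERNELHEAP_BASE);
    CTASSERT(COREHEAP_BASE < KERNEL_TEXT);
    CTASSERT(KERNEL_TEXT < SEGDEBUGBASE);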
- */ -#if !defined(_ASM) -#define DEBUG_INFO_VA (KERNEL_TEXT - MMU_PAGESIZE) -#define GDT_VA (DEBUG_INFO_VA - MMU_PAGESIZE) -#define IDT_VA (GDT_VA - MMU_PAGESIZE) -#define LDT_VA (IDT_VA - (16 * MMU_PAGESIZE)) -#define KTSS_VA (LDT_VA - MMU_PAGESIZE) -#define DFTSS_VA (KTSS_VA - MMU_PAGESIZE) -#define MISC_VA_BASE (DFTSS_VA) -#define MISC_VA_SIZE (KERNEL_TEXT - MISC_VA_BASE) -#endif /* !_ASM */ - #if !defined(_ASM) && !defined(_KMDB) extern uintptr_t kernelbase, segmap_start, segmapsize; #endif diff --git a/usr/src/uts/i86pc/sys/machsystm.h b/usr/src/uts/i86pc/sys/machsystm.h index 7409c5af4a..5f286ca4c6 100644 --- a/usr/src/uts/i86pc/sys/machsystm.h +++ b/usr/src/uts/i86pc/sys/machsystm.h @@ -25,6 +25,7 @@ /* * Copyright (c) 2010, Intel Corporation. * All rights reserved. + * Copyright 2018 Joyent, Inc. */ #ifndef _SYS_MACHSYSTM_H @@ -231,6 +232,7 @@ extern page_t *page_get_high_mfn(mfn_t); #endif extern hrtime_t tsc_gethrtime_tick_delta(void); +extern hrtime_t tsc_gethrtime_params(uint64_t *, uint32_t *, uint8_t *); #endif /* _KERNEL */ diff --git a/usr/src/uts/i86pc/sys/ppt_dev.h b/usr/src/uts/i86pc/sys/ppt_dev.h new file mode 100644 index 0000000000..e25f941f14 --- /dev/null +++ b/usr/src/uts/i86pc/sys/ppt_dev.h @@ -0,0 +1,56 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc + */ + +#ifndef _PPT_DEV_H +#define _PPT_DEV_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define PPT_IOC (('P' << 16)|('T' << 8)) + +#define PPT_CFG_READ (PPT_IOC | 0x01) +#define PPT_CFG_WRITE (PPT_IOC | 0x02) +#define PPT_BAR_QUERY (PPT_IOC | 0x03) +#define PPT_BAR_READ (PPT_IOC | 0x04) +#define PPT_BAR_WRITE (PPT_IOC | 0x05) + +#define PPT_MAXNAMELEN 32 + +struct ppt_cfg_io { + uint64_t pci_off; + uint32_t pci_width; + uint32_t pci_data; +}; +struct ppt_bar_io { + uint32_t pbi_bar; + uint32_t pbi_off; + uint32_t pbi_width; + uint32_t pbi_data; +}; + +struct ppt_bar_query { + uint32_t pbq_baridx; + uint32_t pbq_type; + uint64_t pbq_base; + uint64_t pbq_size; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _PPT_DEV_H */ diff --git a/usr/src/uts/i86pc/sys/viona_io.h b/usr/src/uts/i86pc/sys/viona_io.h new file mode 100644 index 0000000000..46cc72eb06 --- /dev/null +++ b/usr/src/uts/i86pc/sys/viona_io.h @@ -0,0 +1,63 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
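The ppt_dev.h ioctls above give userspace raw access to a passed-through device's config space and BARs. A hedged userspace sketch (device and header paths are assumptions, error handling trimmed):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/ppt_dev.h>        /* include path is illustrative */

    int
    sample_ppt_probe(const char *pptpath)
    {
        int fd = open(pptpath, O_RDWR);
        struct ppt_bar_query bq = { .pbq_baridx = 0 };
        struct ppt_cfg_io cfg = { .pci_off = 0, .pci_width = 4 };

        if (fd < 0 || ioctl(fd, PPT_BAR_QUERY, &bq) != 0 ||
            ioctl(fd, PPT_CFG_READ, &cfg) != 0)
            return (-1);
        (void) printf("BAR0 base 0x%lx size 0x%lx, cfg dword 0 = 0x%x\n",
            (unsigned long)bq.pbq_base, (unsigned long)bq.pbq_size,
            cfg.pci_data);
        (void) close(fd);
        return (0);
    }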
+ */ + +#ifndef _VIONA_IO_H_ +#define _VIONA_IO_H_ + +#define VNA_IOC (('V' << 16)|('C' << 8)) +#define VNA_IOC_CREATE (VNA_IOC | 0x01) +#define VNA_IOC_DELETE (VNA_IOC | 0x02) + +#define VNA_IOC_RING_INIT (VNA_IOC | 0x10) +#define VNA_IOC_RING_RESET (VNA_IOC | 0x11) +#define VNA_IOC_RING_KICK (VNA_IOC | 0x12) +#define VNA_IOC_RING_SET_MSI (VNA_IOC | 0x13) +#define VNA_IOC_RING_INTR_CLR (VNA_IOC | 0x14) + +#define VNA_IOC_INTR_POLL (VNA_IOC | 0x20) +#define VNA_IOC_SET_FEATURES (VNA_IOC | 0x21) +#define VNA_IOC_GET_FEATURES (VNA_IOC | 0x22) +#define VNA_IOC_SET_NOTIFY_IOP (VNA_IOC | 0x23) + +typedef struct vioc_create { + datalink_id_t c_linkid; + int c_vmfd; +} vioc_create_t; + +typedef struct vioc_ring_init { + uint16_t ri_index; + uint16_t ri_qsize; + uint64_t ri_qaddr; +} vioc_ring_init_t; + +typedef struct vioc_ring_msi { + uint16_t rm_index; + uint64_t rm_addr; + uint64_t rm_msg; +} vioc_ring_msi_t; + +enum viona_vq_id { + VIONA_VQ_RX = 0, + VIONA_VQ_TX = 1, + VIONA_VQ_MAX = 2 +}; + +typedef struct vioc_intr_poll { + uint32_t vip_status[VIONA_VQ_MAX]; +} vioc_intr_poll_t; + + +#endif /* _VIONA_IO_H_ */ diff --git a/usr/src/uts/i86pc/sys/vm_machparam.h b/usr/src/uts/i86pc/sys/vm_machparam.h index 90a5245217..fde81e59ed 100644 --- a/usr/src/uts/i86pc/sys/vm_machparam.h +++ b/usr/src/uts/i86pc/sys/vm_machparam.h @@ -23,6 +23,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. */ #ifndef _SYS_VM_MACHPARAM_H @@ -133,7 +134,8 @@ extern "C" { /* * The maximum value for handspreadpages which is the the distance - * between the two clock hands in pages. + * between the two clock hands in pages. This is only used when the page + * scanner is first started. */ #define MAXHANDSPREADPAGES ((64 * 1024 * 1024) / PAGESIZE) diff --git a/usr/src/uts/i86pc/sys/vmm.h b/usr/src/uts/i86pc/sys/vmm.h new file mode 100644 index 0000000000..ac8f14b042 --- /dev/null +++ b/usr/src/uts/i86pc/sys/vmm.h @@ -0,0 +1,748 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
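viona_io.h above implies a simple control sequence: open the control node, tie a new link to a vmm instance with VNA_IOC_CREATE, then size and place each ring with VNA_IOC_RING_INIT. A sketch under those assumptions (the /dev path, include paths, and call ordering are not spelled out here):

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <sys/dld.h>            /* datalink_id_t; include path assumed */
    #include <sys/viona_io.h>       /* include path assumed */

    int
    sample_viona_create(datalink_id_t linkid, int vmfd, uint16_t qsize,
        uint64_t rx_qaddr)
    {
        int fd = open("/dev/viona", O_RDWR);    /* path is an assumption */
        vioc_create_t vc = { .c_linkid = linkid, .c_vmfd = vmfd };
        vioc_ring_init_t ri = {
            .ri_index = VIONA_VQ_RX,
            .ri_qsize = qsize,
            .ri_qaddr = rx_qaddr,   /* guest-physical address of the ring */
        };

        if (fd < 0 || ioctl(fd, VNA_IOC_CREATE, &vc) != 0 ||
            ioctl(fd, VNA_IOC_RING_INIT, &ri) != 0)
            return (-1);
        return (fd);
    }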
+ * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _VMM_H_ +#define _VMM_H_ + +#include <sys/sdt.h> +#include <x86/segments.h> + +#ifdef _KERNEL +SDT_PROVIDER_DECLARE(vmm); +#endif + +enum vm_suspend_how { + VM_SUSPEND_NONE, + VM_SUSPEND_RESET, + VM_SUSPEND_POWEROFF, + VM_SUSPEND_HALT, + VM_SUSPEND_TRIPLEFAULT, + VM_SUSPEND_LAST +}; + +/* + * Identifiers for architecturally defined registers. + */ +enum vm_reg_name { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_RBP, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15, + VM_REG_GUEST_CR0, + VM_REG_GUEST_CR3, + VM_REG_GUEST_CR4, + VM_REG_GUEST_DR7, + VM_REG_GUEST_RSP, + VM_REG_GUEST_RIP, + VM_REG_GUEST_RFLAGS, + VM_REG_GUEST_ES, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS, + VM_REG_GUEST_LDTR, + VM_REG_GUEST_TR, + VM_REG_GUEST_IDTR, + VM_REG_GUEST_GDTR, + VM_REG_GUEST_EFER, + VM_REG_GUEST_CR2, + VM_REG_GUEST_PDPTE0, + VM_REG_GUEST_PDPTE1, + VM_REG_GUEST_PDPTE2, + VM_REG_GUEST_PDPTE3, + VM_REG_GUEST_INTR_SHADOW, + VM_REG_GUEST_DR0, + VM_REG_GUEST_DR1, + VM_REG_GUEST_DR2, + VM_REG_GUEST_DR3, + VM_REG_GUEST_DR6, + VM_REG_LAST +}; + +enum x2apic_state { + X2APIC_DISABLED, + X2APIC_ENABLED, + X2APIC_STATE_LAST +}; + +#define VM_INTINFO_VECTOR(info) ((info) & 0xff) +#define VM_INTINFO_DEL_ERRCODE 0x800 +#define VM_INTINFO_RSVD 0x7ffff000 +#define VM_INTINFO_VALID 0x80000000 +#define VM_INTINFO_TYPE 0x700 +#define VM_INTINFO_HWINTR (0 << 8) +#define VM_INTINFO_NMI (2 << 8) +#define VM_INTINFO_HWEXCEPTION (3 << 8) +#define VM_INTINFO_SWINTR (4 << 8) + + +#define VM_MAX_NAMELEN 32 + +#ifdef _KERNEL + +struct vm; +struct vm_exception; +struct seg_desc; +struct vm_exit; +struct vm_run; +struct vhpet; +struct vioapic; +struct vlapic; +struct vmspace; +struct vm_object; +struct vm_guest_paging; +struct pmap; + +struct vm_eventinfo { + u_int *rptr; /* runblock cookie */ + int *sptr; /* suspend cookie */ + int *iptr; /* reqidle cookie */ +}; + +typedef int (*vmm_init_func_t)(int ipinum); +typedef int (*vmm_cleanup_func_t)(void); +typedef void (*vmm_resume_func_t)(void); +typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); +typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, + struct pmap *pmap, struct vm_eventinfo *info); +typedef void (*vmi_cleanup_func_t)(void *vmi); +typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, + uint64_t *retval); +typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, + uint64_t val); +typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num, + struct seg_desc *desc); +typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, + struct seg_desc *desc); +typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); +typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); +typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, 
vm_offset_t max); +typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); +typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu); +typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic); +#ifndef __FreeBSD__ +typedef void (*vmi_savectx)(void *vmi, int vcpu); +typedef void (*vmi_restorectx)(void *vmi, int vcpu); +#endif + +struct vmm_ops { + vmm_init_func_t init; /* module wide initialization */ + vmm_cleanup_func_t cleanup; + vmm_resume_func_t resume; + + vmi_init_func_t vminit; /* vm-specific initialization */ + vmi_run_func_t vmrun; + vmi_cleanup_func_t vmcleanup; + vmi_get_register_t vmgetreg; + vmi_set_register_t vmsetreg; + vmi_get_desc_t vmgetdesc; + vmi_set_desc_t vmsetdesc; + vmi_get_cap_t vmgetcap; + vmi_set_cap_t vmsetcap; + vmi_vmspace_alloc vmspace_alloc; + vmi_vmspace_free vmspace_free; + vmi_vlapic_init vlapic_init; + vmi_vlapic_cleanup vlapic_cleanup; + +#ifndef __FreeBSD__ + vmi_savectx vmsavectx; + vmi_restorectx vmrestorectx; +#endif +}; + +extern struct vmm_ops vmm_ops_intel; +extern struct vmm_ops vmm_ops_amd; + +int vm_create(const char *name, struct vm **retvm); +void vm_destroy(struct vm *vm); +int vm_reinit(struct vm *vm); +const char *vm_name(struct vm *vm); +uint16_t vm_get_maxcpus(struct vm *vm); +void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, + uint16_t *threads, uint16_t *maxcpus); +int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, + uint16_t threads, uint16_t maxcpus); + +/* + * APIs that modify the guest memory map require all vcpus to be frozen. + */ +int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off, + size_t len, int prot, int flags); +int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem); +void vm_free_memseg(struct vm *vm, int ident); +int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); +#ifdef __FreeBSD__ +int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func); +int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func); +#else +int vm_assign_pptdev(struct vm *vm, int pptfd); +int vm_unassign_pptdev(struct vm *vm, int pptfd); +#endif /* __FreeBSD__ */ + +/* + * APIs that inspect the guest memory map require only a *single* vcpu to + * be frozen. This acts like a read lock on the guest memory map since any + * modification requires *all* vcpus to be frozen. 
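The creation and memory-map entry points above are meant to be used together: create the VM, allocate a system-memory segment, then map it into guest-physical space. A minimal in-kernel sketch (the names, the 1 GiB size, and PROT_* availability are assumptions):

    static int
    sample_vm_setup(struct vm **vmp)
    {
        struct vm *vm;
        const size_t len = 1024UL * 1024 * 1024;    /* 1 GiB */
        int err;

        if ((err = vm_create("sample", &vm)) != 0)
            return (err);
        /* segment 0: sysmem backing, mapped at guest-physical 0 */
        if ((err = vm_alloc_memseg(vm, 0, len, true)) == 0) {
            err = vm_mmap_memseg(vm, 0, 0, 0, len,
                PROT_READ | PROT_WRITE | PROT_EXEC, 0);
        }
        if (err != 0) {
            vm_destroy(vm);
            return (err);
        }
        *vmp = vm;
        return (0);
    }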
+ */ +int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); +int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, + struct vm_object **objptr); +vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm); +void *vm_gpa_hold(struct vm *, int vcpuid, vm_paddr_t gpa, size_t len, + int prot, void **cookie); +void vm_gpa_release(void *cookie); +bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa); + +int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); +int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); +int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *ret_desc); +int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc); +int vm_run(struct vm *vm, struct vm_run *vmrun); +int vm_suspend(struct vm *vm, enum vm_suspend_how how); +int vm_inject_nmi(struct vm *vm, int vcpu); +int vm_nmi_pending(struct vm *vm, int vcpuid); +void vm_nmi_clear(struct vm *vm, int vcpuid); +int vm_inject_extint(struct vm *vm, int vcpu); +int vm_extint_pending(struct vm *vm, int vcpuid); +void vm_extint_clear(struct vm *vm, int vcpuid); +struct vlapic *vm_lapic(struct vm *vm, int cpu); +struct vioapic *vm_ioapic(struct vm *vm); +struct vhpet *vm_hpet(struct vm *vm); +int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); +int vm_set_capability(struct vm *vm, int vcpu, int type, int val); +int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state); +int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state); +int vm_apicid2vcpuid(struct vm *vm, int apicid); +int vm_activate_cpu(struct vm *vm, int vcpu); +int vm_suspend_cpu(struct vm *vm, int vcpu); +int vm_resume_cpu(struct vm *vm, int vcpu); +struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); +void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip); + +#ifdef _SYS__CPUSET_H_ +cpuset_t vm_active_cpus(struct vm *vm); +cpuset_t vm_debug_cpus(struct vm *vm); +cpuset_t vm_suspended_cpus(struct vm *vm); +#endif /* _SYS__CPUSET_H_ */ + +static __inline int +vcpu_runblocked(struct vm_eventinfo *info) +{ + + return (*info->rptr != 0); +} + +static __inline int +vcpu_suspended(struct vm_eventinfo *info) +{ + + return (*info->sptr); +} + +static __inline int +vcpu_reqidle(struct vm_eventinfo *info) +{ + + return (*info->iptr); +} + +int vcpu_debugged(struct vm *vm, int vcpuid); + +/* + * Return 1 if device indicated by bus/slot/func is supposed to be a + * pci passthrough device. + * + * Return 0 otherwise. 
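vm_run() above is the heart of the per-vcpu loop: call it repeatedly and dispatch on the exitcode it fills in (struct vm_run's fields appear in vmm_dev.h further down). A sketch reduced to the HLT case only:

    static int
    sample_vcpu_loop(struct vm *vm, int vcpuid)
    {
        struct vm_run vmrun;
        int err;

        bzero(&vmrun, sizeof (vmrun));
        vmrun.cpuid = vcpuid;
        for (;;) {
            if ((err = vm_run(vm, &vmrun)) != 0)
                return (err);
            if (vmrun.vm_exit.exitcode == VM_EXITCODE_HLT)
                return (0);
            /* other exitcodes would be handled here or bounced to userspace */
        }
    }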
+ */ +int vmm_is_pptdev(int bus, int slot, int func); + +void *vm_iommu_domain(struct vm *vm); + +enum vcpu_state { + VCPU_IDLE, + VCPU_FROZEN, + VCPU_RUNNING, + VCPU_SLEEPING, +}; + +int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state, + bool from_idle); +enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu); +void vcpu_block_run(struct vm *, int); +void vcpu_unblock_run(struct vm *, int); + +#ifndef __FreeBSD__ +uint64_t vcpu_tsc_offset(struct vm *vm, int vcpuid); +#endif + +static __inline int +vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) +{ + return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING); +} + +#ifdef _SYS_THREAD_H +static __inline int +vcpu_should_yield(struct vm *vm, int vcpu) +{ + + if (curthread->t_astflag) + return (1); + else if (CPU->cpu_runrun) + return (1); + else + return (0); +} +#endif /* _SYS_THREAD_H */ + +void *vcpu_stats(struct vm *vm, int vcpu); +void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); +struct vmspace *vm_get_vmspace(struct vm *vm); +struct vatpic *vm_atpic(struct vm *vm); +struct vatpit *vm_atpit(struct vm *vm); +struct vpmtmr *vm_pmtmr(struct vm *vm); +struct vrtc *vm_rtc(struct vm *vm); + +/* + * Inject exception 'vector' into the guest vcpu. This function returns 0 on + * success and non-zero on failure. + * + * Wrapper functions like 'vm_inject_gp()' should be preferred to calling + * this function directly because they enforce the trap-like or fault-like + * behavior of an exception. + * + * This function should only be called in the context of the thread that is + * executing this vcpu. + */ +int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int err_valid, + uint32_t errcode, int restart_instruction); + +/* + * This function is called after a VM-exit that occurred during exception or + * interrupt delivery through the IDT. The format of 'intinfo' is described + * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2. + * + * If a VM-exit handler completes the event delivery successfully then it + * should call vm_exit_intinfo() to extinguish the pending event. For e.g., + * if the task switch emulation is triggered via a task gate then it should + * call this function with 'intinfo=0' to indicate that the external event + * is not pending anymore. + * + * Return value is 0 on success and non-zero on failure. + */ +int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo); + +/* + * This function is called before every VM-entry to retrieve a pending + * event that should be injected into the guest. This function combines + * nested events into a double or triple fault. + * + * Returns 0 if there are no events that need to be injected into the guest + * and non-zero otherwise. + */ +int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info); + +int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2); + +enum vm_reg_name vm_segment_name(int seg_encoding); + +struct vm_copyinfo { + uint64_t gpa; + size_t len; + void *hva; + void *cookie; +}; + +/* + * Set up 'copyinfo[]' to copy to/from guest linear address space starting + * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for + * a copyin or PROT_WRITE for a copyout. + * + * retval is_fault Interpretation + * 0 0 Success + * 0 1 An exception was injected into the guest + * EFAULT N/A Unrecoverable error + * + * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if + * the return value is 0. 
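The copy-setup protocol described in the surrounding comment reads as a three-step pattern: set up, copy, tear down, with the (retval, is_fault) pair deciding which outcome applies. A sketch of a guest-linear copyin along those lines:

    static int
    sample_copyin(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
        uint64_t gla, void *buf, size_t len)
    {
        struct vm_copyinfo ci[2];   /* enough for a page-straddling region */
        int fault, err;

        err = vm_copy_setup(vm, vcpuid, paging, gla, len, PROT_READ,
            ci, 2, &fault);
        if (err != 0)
            return (EFAULT);        /* unrecoverable */
        if (fault)
            return (0);             /* exception already injected into guest */
        vm_copyin(vm, vcpuid, ci, buf, len);
        vm_copy_teardown(vm, vcpuid, ci, 2);
        return (0);
    }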
The 'copyinfo[]' resources should be freed by calling + * 'vm_copy_teardown()' after the copy is done. + */ +int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, + int num_copyinfo, int *is_fault); +void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, + int num_copyinfo); +void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, + void *kaddr, size_t len); +void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, + struct vm_copyinfo *copyinfo, size_t len); + +int vcpu_trace_exceptions(struct vm *vm, int vcpuid); +#endif /* KERNEL */ + +#define VM_MAXCPU 32 /* maximum virtual cpus */ + +/* + * Identifiers for optional vmm capabilities + */ +enum vm_cap_type { + VM_CAP_HALT_EXIT, + VM_CAP_MTRAP_EXIT, + VM_CAP_PAUSE_EXIT, + VM_CAP_UNRESTRICTED_GUEST, + VM_CAP_ENABLE_INVPCID, + VM_CAP_MAX +}; + +enum vm_intr_trigger { + EDGE_TRIGGER, + LEVEL_TRIGGER +}; + +/* + * The 'access' field has the format specified in Table 21-2 of the Intel + * Architecture Manual vol 3b. + * + * XXX The contents of the 'access' field are architecturally defined except + * bit 16 - Segment Unusable. + */ +struct seg_desc { + uint64_t base; + uint32_t limit; + uint32_t access; +}; +#define SEG_DESC_TYPE(access) ((access) & 0x001f) +#define SEG_DESC_DPL(access) (((access) >> 5) & 0x3) +#define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0) +#define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0) +#define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0) +#define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0) + +enum vm_cpu_mode { + CPU_MODE_REAL, + CPU_MODE_PROTECTED, + CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */ + CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */ +}; + +enum vm_paging_mode { + PAGING_MODE_FLAT, + PAGING_MODE_32, + PAGING_MODE_PAE, + PAGING_MODE_64, +}; + +struct vm_guest_paging { + uint64_t cr3; + int cpl; + enum vm_cpu_mode cpu_mode; + enum vm_paging_mode paging_mode; +}; + +/* + * The data structures 'vie' and 'vie_op' are meant to be opaque to the + * consumers of instruction decoding. The only reason why their contents + * need to be exposed is because they are part of the 'vm_exit' structure. + */ +struct vie_op { + uint8_t op_byte; /* actual opcode byte */ + uint8_t op_type; /* type of operation (e.g. 
MOV) */ + uint16_t op_flags; +}; + +#define VIE_INST_SIZE 15 +struct vie { + uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ + uint8_t num_valid; /* size of the instruction */ + uint8_t num_processed; + + uint8_t addrsize:4, opsize:4; /* address and operand sizes */ + uint8_t rex_w:1, /* REX prefix */ + rex_r:1, + rex_x:1, + rex_b:1, + rex_present:1, + repz_present:1, /* REP/REPE/REPZ prefix */ + repnz_present:1, /* REPNE/REPNZ prefix */ + opsize_override:1, /* Operand size override */ + addrsize_override:1, /* Address size override */ + segment_override:1; /* Segment override */ + + uint8_t mod:2, /* ModRM byte */ + reg:4, + rm:4; + + uint8_t ss:2, /* SIB byte */ + index:4, + base:4; + + uint8_t disp_bytes; + uint8_t imm_bytes; + + uint8_t scale; + int base_register; /* VM_REG_GUEST_xyz */ + int index_register; /* VM_REG_GUEST_xyz */ + int segment_register; /* VM_REG_GUEST_xyz */ + + int64_t displacement; /* optional addr displacement */ + int64_t immediate; /* optional immediate operand */ + + uint8_t decoded; /* set to 1 if successfully decoded */ + + struct vie_op op; /* opcode description */ +}; + +enum vm_exitcode { + VM_EXITCODE_INOUT, + VM_EXITCODE_VMX, + VM_EXITCODE_BOGUS, + VM_EXITCODE_RDMSR, + VM_EXITCODE_WRMSR, + VM_EXITCODE_HLT, + VM_EXITCODE_MTRAP, + VM_EXITCODE_PAUSE, + VM_EXITCODE_PAGING, + VM_EXITCODE_INST_EMUL, + VM_EXITCODE_SPINUP_AP, + VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */ + VM_EXITCODE_RUNBLOCK, + VM_EXITCODE_IOAPIC_EOI, + VM_EXITCODE_SUSPENDED, + VM_EXITCODE_INOUT_STR, + VM_EXITCODE_TASK_SWITCH, + VM_EXITCODE_MONITOR, + VM_EXITCODE_MWAIT, + VM_EXITCODE_SVM, + VM_EXITCODE_REQIDLE, + VM_EXITCODE_DEBUG, + VM_EXITCODE_VMINSN, +#ifndef __FreeBSD__ + VM_EXITCODE_HT, +#endif + VM_EXITCODE_MAX +}; + +struct vm_inout { + uint16_t bytes:3; /* 1 or 2 or 4 */ + uint16_t in:1; + uint16_t string:1; + uint16_t rep:1; + uint16_t port; + uint32_t eax; /* valid for out */ +}; + +struct vm_inout_str { + struct vm_inout inout; /* must be the first element */ + struct vm_guest_paging paging; + uint64_t rflags; + uint64_t cr0; + uint64_t index; + uint64_t count; /* rep=1 (%rcx), rep=0 (1) */ + int addrsize; + enum vm_reg_name seg_name; + struct seg_desc seg_desc; +}; + +enum task_switch_reason { + TSR_CALL, + TSR_IRET, + TSR_JMP, + TSR_IDT_GATE, /* task gate in IDT */ +}; + +struct vm_task_switch { + uint16_t tsssel; /* new TSS selector */ + int ext; /* task switch due to external event */ + uint32_t errcode; + int errcode_valid; /* push 'errcode' on the new stack */ + enum task_switch_reason reason; + struct vm_guest_paging paging; +}; + +struct vm_exit { + enum vm_exitcode exitcode; + int inst_length; /* 0 means unknown */ + uint64_t rip; + union { + struct vm_inout inout; + struct vm_inout_str inout_str; + struct { + uint64_t gpa; + int fault_type; + } paging; + struct { + uint64_t gpa; + uint64_t gla; + uint64_t cs_base; + int cs_d; /* CS.D */ + struct vm_guest_paging paging; + struct vie vie; + } inst_emul; + /* + * VMX specific payload. Used when there is no "better" + * exitcode to represent the VM-exit. + */ + struct { + int status; /* vmx inst status */ + /* + * 'exit_reason' and 'exit_qualification' are valid + * only if 'status' is zero. + */ + uint32_t exit_reason; + uint64_t exit_qualification; + /* + * 'inst_error' and 'inst_type' are valid + * only if 'status' is non-zero. + */ + int inst_type; + int inst_error; + } vmx; + /* + * SVM specific payload. 
+ */ + struct { + uint64_t exitcode; + uint64_t exitinfo1; + uint64_t exitinfo2; + } svm; + struct { + uint32_t code; /* ecx value */ + uint64_t wval; + } msr; + struct { + int vcpu; + uint64_t rip; + } spinup_ap; + struct { + uint64_t rflags; + uint64_t intr_status; + } hlt; + struct { + int vector; + } ioapic_eoi; + struct { + enum vm_suspend_how how; + } suspended; + struct vm_task_switch task_switch; + } u; +}; + +/* APIs to inject faults into the guest */ +void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid, + int errcode); + +static __inline void +vm_inject_ud(void *vm, int vcpuid) +{ + vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0); +} + +static __inline void +vm_inject_gp(void *vm, int vcpuid) +{ + vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0); +} + +static __inline void +vm_inject_ac(void *vm, int vcpuid, int errcode) +{ + vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode); +} + +static __inline void +vm_inject_ss(void *vm, int vcpuid, int errcode) +{ + vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode); +} + +void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2); + +int vm_restart_instruction(void *vm, int vcpuid); + +#ifndef __FreeBSD__ +#ifdef _KERNEL + +void vmm_sol_glue_init(void); +void vmm_sol_glue_cleanup(void); + +int vmm_mod_load(void); +int vmm_mod_unload(void); + +void vmm_call_trap(uint64_t); + +/* + * Because of tangled headers, these are mirrored by vmm_drv.h to present the + * interface to driver consumers. + */ +typedef int (*vmm_rmem_cb_t)(void *, uintptr_t, uint_t, uint64_t *); +typedef int (*vmm_wmem_cb_t)(void *, uintptr_t, uint_t, uint64_t); + +int vm_ioport_hook(struct vm *, uint_t, vmm_rmem_cb_t, vmm_wmem_cb_t, void *, + void **); +void vm_ioport_unhook(struct vm *, void **); +int vm_ioport_handle_hook(struct vm *, int, bool, int, int, uint32_t *); + +#endif /* _KERNEL */ +#endif /* __FreeBSD */ + +#endif /* _VMM_H_ */ diff --git a/usr/src/uts/i86pc/sys/vmm_dev.h b/usr/src/uts/i86pc/sys/vmm_dev.h new file mode 100644 index 0000000000..dd87dcb0a6 --- /dev/null +++ b/usr/src/uts/i86pc/sys/vmm_dev.h @@ -0,0 +1,520 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
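The ioport hook interface declared near the end of vmm.h above takes a read callback and a write callback keyed by port. A sketch of wiring one up; the meaning of the callback arguments (port, access width, value) is an assumption based only on their types:

    static int
    sample_ioport_read(void *arg, uintptr_t port, uint_t bytes, uint64_t *valp)
    {
        *valp = 0;          /* pretend the device reads as zero */
        return (0);
    }

    static int
    sample_ioport_write(void *arg, uintptr_t port, uint_t bytes, uint64_t val)
    {
        return (0);         /* discard writes */
    }

    static int
    sample_ioport_attach(struct vm *vm, void *sc, void **cookiep)
    {
        /* 0x505 is an arbitrary example port */
        return (vm_ioport_hook(vm, 0x505, sample_ioport_read,
            sample_ioport_write, sc, cookiep));
    }

The cookie handed back is later passed to vm_ioport_unhook() to undo the hook.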
+ * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _VMM_DEV_H_ +#define _VMM_DEV_H_ + +#include <machine/vmm.h> + +struct vm_memmap { + vm_paddr_t gpa; + int segid; /* memory segment */ + vm_ooffset_t segoff; /* offset into memory segment */ + size_t len; /* mmap length */ + int prot; /* RWX */ + int flags; +}; +#define VM_MEMMAP_F_WIRED 0x01 +#define VM_MEMMAP_F_IOMMU 0x02 + +#define VM_MEMSEG_NAME(m) ((m)->name[0] != '\0' ? (m)->name : NULL) +struct vm_memseg { + int segid; + size_t len; + char name[SPECNAMELEN + 1]; +}; + +struct vm_register { + int cpuid; + int regnum; /* enum vm_reg_name */ + uint64_t regval; +}; + +struct vm_seg_desc { /* data or code segment */ + int cpuid; + int regnum; /* enum vm_reg_name */ + struct seg_desc desc; +}; + +struct vm_register_set { + int cpuid; + unsigned int count; + const int *regnums; /* enum vm_reg_name */ + uint64_t *regvals; +}; + +struct vm_run { + int cpuid; + struct vm_exit vm_exit; +}; + +struct vm_exception { + int cpuid; + int vector; + uint32_t error_code; + int error_code_valid; + int restart_instruction; +}; + +struct vm_lapic_msi { + uint64_t msg; + uint64_t addr; +}; + +struct vm_lapic_irq { + int cpuid; + int vector; +}; + +struct vm_ioapic_irq { + int irq; +}; + +struct vm_isa_irq { + int atpic_irq; + int ioapic_irq; +}; + +struct vm_isa_irq_trigger { + int atpic_irq; + enum vm_intr_trigger trigger; +}; + +struct vm_capability { + int cpuid; + enum vm_cap_type captype; + int capval; + int allcpus; +}; + +#ifdef __FreeBSD__ +struct vm_pptdev { + int bus; + int slot; + int func; +}; + +struct vm_pptdev_mmio { + int bus; + int slot; + int func; + vm_paddr_t gpa; + vm_paddr_t hpa; + size_t len; +}; + +struct vm_pptdev_msi { + int vcpu; + int bus; + int slot; + int func; + int numvec; /* 0 means disabled */ + uint64_t msg; + uint64_t addr; +}; + +struct vm_pptdev_msix { + int vcpu; + int bus; + int slot; + int func; + int idx; + uint64_t msg; + uint32_t vector_control; + uint64_t addr; +}; + +struct vm_pptdev_limits { + int bus; + int slot; + int func; + int msi_limit; + int msix_limit; +}; +#else /* __FreeBSD__ */ +struct vm_pptdev { + int pptfd; +}; + +struct vm_pptdev_mmio { + int pptfd; + vm_paddr_t gpa; + vm_paddr_t hpa; + size_t len; +}; + +struct vm_pptdev_msi { + int vcpu; + int pptfd; + int numvec; /* 0 means disabled */ + uint64_t msg; + uint64_t addr; +}; + +struct vm_pptdev_msix { + int vcpu; + int pptfd; + int idx; + uint64_t msg; + uint32_t vector_control; + uint64_t addr; +}; + +struct vm_pptdev_limits { + int pptfd; + int msi_limit; + int msix_limit; +}; +#endif /* __FreeBSD__ */ + +struct vm_nmi { + int cpuid; +}; + +#ifdef __FreeBSD__ +#define MAX_VM_STATS 64 +#else +#define MAX_VM_STATS (64 + VM_MAXCPU) +#endif + +struct vm_stats { + int cpuid; /* in */ + int num_entries; /* out */ + struct timeval tv; + uint64_t statbuf[MAX_VM_STATS]; +}; + +struct vm_stat_desc { + int index; /* in */ + char desc[128]; /* out */ +}; + +struct vm_x2apic { + int cpuid; + enum x2apic_state state; +}; + +struct vm_gpa_pte { + uint64_t gpa; /* in */ + uint64_t 
pte[4]; /* out */ + int ptenum; +}; + +struct vm_hpet_cap { + uint32_t capabilities; /* lower 32 bits of HPET capabilities */ +}; + +struct vm_suspend { + enum vm_suspend_how how; +}; + +struct vm_gla2gpa { + int vcpuid; /* inputs */ + int prot; /* PROT_READ or PROT_WRITE */ + uint64_t gla; + struct vm_guest_paging paging; + int fault; /* outputs */ + uint64_t gpa; +}; + +struct vm_activate_cpu { + int vcpuid; +}; + +struct vm_cpuset { + int which; + int cpusetsize; +#ifndef _KERNEL + cpuset_t *cpus; +#else + void *cpus; +#endif +}; +#define VM_ACTIVE_CPUS 0 +#define VM_SUSPENDED_CPUS 1 +#define VM_DEBUG_CPUS 2 + +struct vm_intinfo { + int vcpuid; + uint64_t info1; + uint64_t info2; +}; + +struct vm_rtc_time { + time_t secs; +}; + +struct vm_rtc_data { + int offset; + uint8_t value; +}; + +#ifndef __FreeBSD__ +struct vm_devmem_offset { + int segid; + off_t offset; +}; +#endif + +struct vm_cpu_topology { + uint16_t sockets; + uint16_t cores; + uint16_t threads; + uint16_t maxcpus; +}; + +enum { + /* general routines */ + IOCNUM_ABIVERS = 0, + IOCNUM_RUN = 1, + IOCNUM_SET_CAPABILITY = 2, + IOCNUM_GET_CAPABILITY = 3, + IOCNUM_SUSPEND = 4, + IOCNUM_REINIT = 5, + + /* memory apis */ + IOCNUM_MAP_MEMORY = 10, /* deprecated */ + IOCNUM_GET_MEMORY_SEG = 11, /* deprecated */ + IOCNUM_GET_GPA_PMAP = 12, + IOCNUM_GLA2GPA = 13, + IOCNUM_ALLOC_MEMSEG = 14, + IOCNUM_GET_MEMSEG = 15, + IOCNUM_MMAP_MEMSEG = 16, + IOCNUM_MMAP_GETNEXT = 17, + IOCNUM_GLA2GPA_NOFAULT = 18, + + /* register/state accessors */ + IOCNUM_SET_REGISTER = 20, + IOCNUM_GET_REGISTER = 21, + IOCNUM_SET_SEGMENT_DESCRIPTOR = 22, + IOCNUM_GET_SEGMENT_DESCRIPTOR = 23, + IOCNUM_SET_REGISTER_SET = 24, + IOCNUM_GET_REGISTER_SET = 25, + + /* interrupt injection */ + IOCNUM_GET_INTINFO = 28, + IOCNUM_SET_INTINFO = 29, + IOCNUM_INJECT_EXCEPTION = 30, + IOCNUM_LAPIC_IRQ = 31, + IOCNUM_INJECT_NMI = 32, + IOCNUM_IOAPIC_ASSERT_IRQ = 33, + IOCNUM_IOAPIC_DEASSERT_IRQ = 34, + IOCNUM_IOAPIC_PULSE_IRQ = 35, + IOCNUM_LAPIC_MSI = 36, + IOCNUM_LAPIC_LOCAL_IRQ = 37, + IOCNUM_IOAPIC_PINCOUNT = 38, + IOCNUM_RESTART_INSTRUCTION = 39, + + /* PCI pass-thru */ + IOCNUM_BIND_PPTDEV = 40, + IOCNUM_UNBIND_PPTDEV = 41, + IOCNUM_MAP_PPTDEV_MMIO = 42, + IOCNUM_PPTDEV_MSI = 43, + IOCNUM_PPTDEV_MSIX = 44, + IOCNUM_GET_PPTDEV_LIMITS = 45, + + /* statistics */ + IOCNUM_VM_STATS = 50, + IOCNUM_VM_STAT_DESC = 51, + + /* kernel device state */ + IOCNUM_SET_X2APIC_STATE = 60, + IOCNUM_GET_X2APIC_STATE = 61, + IOCNUM_GET_HPET_CAPABILITIES = 62, + + /* CPU Topology */ + IOCNUM_SET_TOPOLOGY = 63, + IOCNUM_GET_TOPOLOGY = 64, + + /* legacy interrupt injection */ + IOCNUM_ISA_ASSERT_IRQ = 80, + IOCNUM_ISA_DEASSERT_IRQ = 81, + IOCNUM_ISA_PULSE_IRQ = 82, + IOCNUM_ISA_SET_IRQ_TRIGGER = 83, + + /* vm_cpuset */ + IOCNUM_ACTIVATE_CPU = 90, + IOCNUM_GET_CPUSET = 91, + IOCNUM_SUSPEND_CPU = 92, + IOCNUM_RESUME_CPU = 93, + + /* RTC */ + IOCNUM_RTC_READ = 100, + IOCNUM_RTC_WRITE = 101, + IOCNUM_RTC_SETTIME = 102, + IOCNUM_RTC_GETTIME = 103, + +#ifndef __FreeBSD__ + /* illumos-custom ioctls */ + IOCNUM_DEVMEM_GETOFFSET = 256, + IOCNUM_WRLOCK_CYCLE = 257, +#endif +}; + +#define VM_RUN \ + _IOWR('v', IOCNUM_RUN, struct vm_run) +#define VM_SUSPEND \ + _IOW('v', IOCNUM_SUSPEND, struct vm_suspend) +#define VM_REINIT \ + _IO('v', IOCNUM_REINIT) +#define VM_ALLOC_MEMSEG \ + _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg) +#define VM_GET_MEMSEG \ + _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg) +#define VM_MMAP_MEMSEG \ + _IOW('v', IOCNUM_MMAP_MEMSEG, struct vm_memmap) +#define VM_MMAP_GETNEXT \ + 
_IOWR('v', IOCNUM_MMAP_GETNEXT, struct vm_memmap) +#define VM_SET_REGISTER \ + _IOW('v', IOCNUM_SET_REGISTER, struct vm_register) +#define VM_GET_REGISTER \ + _IOWR('v', IOCNUM_GET_REGISTER, struct vm_register) +#define VM_SET_SEGMENT_DESCRIPTOR \ + _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) +#define VM_GET_SEGMENT_DESCRIPTOR \ + _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) +#define VM_SET_REGISTER_SET \ + _IOW('v', IOCNUM_SET_REGISTER_SET, struct vm_register_set) +#define VM_GET_REGISTER_SET \ + _IOWR('v', IOCNUM_GET_REGISTER_SET, struct vm_register_set) +#define VM_INJECT_EXCEPTION \ + _IOW('v', IOCNUM_INJECT_EXCEPTION, struct vm_exception) +#define VM_LAPIC_IRQ \ + _IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq) +#define VM_LAPIC_LOCAL_IRQ \ + _IOW('v', IOCNUM_LAPIC_LOCAL_IRQ, struct vm_lapic_irq) +#define VM_LAPIC_MSI \ + _IOW('v', IOCNUM_LAPIC_MSI, struct vm_lapic_msi) +#define VM_IOAPIC_ASSERT_IRQ \ + _IOW('v', IOCNUM_IOAPIC_ASSERT_IRQ, struct vm_ioapic_irq) +#define VM_IOAPIC_DEASSERT_IRQ \ + _IOW('v', IOCNUM_IOAPIC_DEASSERT_IRQ, struct vm_ioapic_irq) +#define VM_IOAPIC_PULSE_IRQ \ + _IOW('v', IOCNUM_IOAPIC_PULSE_IRQ, struct vm_ioapic_irq) +#define VM_IOAPIC_PINCOUNT \ + _IOR('v', IOCNUM_IOAPIC_PINCOUNT, int) +#define VM_ISA_ASSERT_IRQ \ + _IOW('v', IOCNUM_ISA_ASSERT_IRQ, struct vm_isa_irq) +#define VM_ISA_DEASSERT_IRQ \ + _IOW('v', IOCNUM_ISA_DEASSERT_IRQ, struct vm_isa_irq) +#define VM_ISA_PULSE_IRQ \ + _IOW('v', IOCNUM_ISA_PULSE_IRQ, struct vm_isa_irq) +#define VM_ISA_SET_IRQ_TRIGGER \ + _IOW('v', IOCNUM_ISA_SET_IRQ_TRIGGER, struct vm_isa_irq_trigger) +#define VM_SET_CAPABILITY \ + _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability) +#define VM_GET_CAPABILITY \ + _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability) +#define VM_BIND_PPTDEV \ + _IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev) +#define VM_UNBIND_PPTDEV \ + _IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev) +#define VM_MAP_PPTDEV_MMIO \ + _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio) +#define VM_PPTDEV_MSI \ + _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi) +#define VM_PPTDEV_MSIX \ + _IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix) +#define VM_GET_PPTDEV_LIMITS \ + _IOR('v', IOCNUM_GET_PPTDEV_LIMITS, struct vm_pptdev_limits) +#define VM_INJECT_NMI \ + _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi) +#define VM_STATS_IOC \ + _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) +#define VM_STAT_DESC \ + _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc) +#define VM_SET_X2APIC_STATE \ + _IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic) +#define VM_GET_X2APIC_STATE \ + _IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic) +#define VM_GET_HPET_CAPABILITIES \ + _IOR('v', IOCNUM_GET_HPET_CAPABILITIES, struct vm_hpet_cap) +#define VM_SET_TOPOLOGY \ + _IOW('v', IOCNUM_SET_TOPOLOGY, struct vm_cpu_topology) +#define VM_GET_TOPOLOGY \ + _IOR('v', IOCNUM_GET_TOPOLOGY, struct vm_cpu_topology) +#define VM_GET_GPA_PMAP \ + _IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte) +#define VM_GLA2GPA \ + _IOWR('v', IOCNUM_GLA2GPA, struct vm_gla2gpa) +#define VM_GLA2GPA_NOFAULT \ + _IOWR('v', IOCNUM_GLA2GPA_NOFAULT, struct vm_gla2gpa) +#define VM_ACTIVATE_CPU \ + _IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu) +#define VM_GET_CPUS \ + _IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset) +#define VM_SUSPEND_CPU \ + _IOW('v', IOCNUM_SUSPEND_CPU, struct vm_activate_cpu) +#define VM_RESUME_CPU \ + _IOW('v', IOCNUM_RESUME_CPU, struct vm_activate_cpu) +#define VM_SET_INTINFO \ + _IOW('v', 
IOCNUM_SET_INTINFO, struct vm_intinfo) +#define VM_GET_INTINFO \ + _IOWR('v', IOCNUM_GET_INTINFO, struct vm_intinfo) +#define VM_RTC_WRITE \ + _IOW('v', IOCNUM_RTC_WRITE, struct vm_rtc_data) +#define VM_RTC_READ \ + _IOWR('v', IOCNUM_RTC_READ, struct vm_rtc_data) +#define VM_RTC_SETTIME \ + _IOW('v', IOCNUM_RTC_SETTIME, struct vm_rtc_time) +#define VM_RTC_GETTIME \ + _IOR('v', IOCNUM_RTC_GETTIME, struct vm_rtc_time) +#define VM_RESTART_INSTRUCTION \ + _IOW('v', IOCNUM_RESTART_INSTRUCTION, int) + +#ifndef __FreeBSD__ +#define VM_DEVMEM_GETOFFSET \ + _IOW('v', IOCNUM_DEVMEM_GETOFFSET, struct vm_devmem_offset) +#define VM_WRLOCK_CYCLE _IO('v', IOCNUM_WRLOCK_CYCLE) + +/* ioctls used against ctl device for vm create/destroy */ +#define VMM_IOC_BASE (('V' << 16) | ('M' << 8)) +#define VMM_CREATE_VM (VMM_IOC_BASE | 0x01) +#define VMM_DESTROY_VM (VMM_IOC_BASE | 0x02) +#define VMM_VM_SUPPORTED (VMM_IOC_BASE | 0x03) + +#define VMM_CTL_DEV "/dev/vmmctl" + +#endif + +#endif diff --git a/usr/src/uts/i86pc/sys/vmm_drv.h b/usr/src/uts/i86pc/sys/vmm_drv.h new file mode 100644 index 0000000000..856b75e5cc --- /dev/null +++ b/usr/src/uts/i86pc/sys/vmm_drv.h @@ -0,0 +1,53 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _VMM_DRV_H_ +#define _VMM_DRV_H_ + +#ifdef _KERNEL + +#include <sys/file.h> + +struct vmm_hold; +typedef struct vmm_hold vmm_hold_t; + +struct vmm_lease; +typedef struct vmm_lease vmm_lease_t; + +/* + * Because of tangled headers, these definitions mirror their vmm_[rw]mem_cb_t + * counterparts in vmm.h. + */ +typedef int (*vmm_drv_rmem_cb_t)(void *, uintptr_t, uint_t, uint64_t *); +typedef int (*vmm_drv_wmem_cb_t)(void *, uintptr_t, uint_t, uint64_t); + +extern int vmm_drv_hold(file_t *, cred_t *, vmm_hold_t **); +extern void vmm_drv_rele(vmm_hold_t *); +extern boolean_t vmm_drv_release_reqd(vmm_hold_t *); + +extern vmm_lease_t *vmm_drv_lease_sign(vmm_hold_t *, boolean_t (*)(void *), + void *); +extern void vmm_drv_lease_break(vmm_hold_t *, vmm_lease_t *); +extern boolean_t vmm_drv_lease_expired(vmm_lease_t *); + +extern void *vmm_drv_gpa2kva(vmm_lease_t *, uintptr_t, size_t); +extern int vmm_drv_msi(vmm_lease_t *, uint64_t, uint64_t); + +extern int vmm_drv_ioport_hook(vmm_hold_t *, uint_t, vmm_drv_rmem_cb_t, + vmm_drv_wmem_cb_t, void *, void **); +extern void vmm_drv_ioport_unhook(vmm_hold_t *, void **); +#endif /* _KERNEL */ + +#endif /* _VMM_DRV_H_ */ diff --git a/usr/src/uts/i86pc/sys/vmm_impl.h b/usr/src/uts/i86pc/sys/vmm_impl.h new file mode 100644 index 0000000000..cdc56cc464 --- /dev/null +++ b/usr/src/uts/i86pc/sys/vmm_impl.h @@ -0,0 +1,89 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. 
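From userspace, the vmm_dev.h ioctls above are driven against a per-VM device node, with /dev/vmmctl reserved for create/destroy and capability checks. A hedged sketch (the /dev/vmm/<name> path, header paths, and the argument-free VMM_VM_SUPPORTED check are assumptions; creation via VMM_CREATE_VM is omitted since its argument layout is not shown here):

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <sys/vmm.h>            /* include paths are illustrative */
    #include <sys/vmm_dev.h>

    int
    sample_run_vcpu0(const char *vmpath)    /* e.g. "/dev/vmm/guest0" */
    {
        int ctl = open(VMM_CTL_DEV, O_RDONLY);
        int vmfd = open(vmpath, O_RDWR);
        struct vm_run vmrun = { .cpuid = 0 };

        if (ctl < 0 || ioctl(ctl, VMM_VM_SUPPORTED) != 0 || vmfd < 0)
            return (-1);
        while (ioctl(vmfd, VM_RUN, &vmrun) == 0) {
            if (vmrun.vm_exit.exitcode == VM_EXITCODE_HLT)
                return (0);
            /* handle the other exitcodes ... */
        }
        return (-1);
    }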
+ */ + +#ifndef _VMM_IMPL_H_ +#define _VMM_IMPL_H_ + +#include <sys/mutex.h> +#include <sys/queue.h> +#include <sys/varargs.h> +#include <sys/zone.h> + +#ifdef _KERNEL + +#define VMM_CTL_MINOR 0 + +/* + * Rather than creating whole character devices for devmem mappings, they are + * available by mmap(2)ing the vmm handle at a specific offset. These offsets + * begin just above the maximum allowed guest physical address. + */ +#include <vm/vm_param.h> +#define VM_DEVMEM_START (VM_MAXUSER_ADDRESS + 1) + +struct vmm_devmem_entry { + list_node_t vde_node; + int vde_segid; + char vde_name[SPECNAMELEN + 1]; + size_t vde_len; + off_t vde_off; +}; +typedef struct vmm_devmem_entry vmm_devmem_entry_t; + +typedef struct vmm_zsd vmm_zsd_t; + +enum vmm_softc_state { + VMM_HELD = 1, /* external driver(s) possess hold on the VM */ + VMM_CLEANUP = 2, /* request that holds are released */ + VMM_PURGED = 4, /* all holds have been released */ + VMM_BLOCK_HOOK = 8, /* mem hook install temporarily blocked */ + VMM_DESTROY = 16 /* VM is destroyed, softc still around */ +}; + +struct vmm_softc { + list_node_t vmm_node; + struct vm *vmm_vm; + minor_t vmm_minor; + char vmm_name[VM_MAX_NAMELEN]; + list_t vmm_devmem_list; + + kcondvar_t vmm_cv; + list_t vmm_holds; + uint_t vmm_flags; + boolean_t vmm_is_open; + + kmutex_t vmm_lease_lock; + list_t vmm_lease_list; + uint_t vmm_lease_blocker; + kcondvar_t vmm_lease_cv; + krwlock_t vmm_rwlock; + + /* For zone specific data */ + list_node_t vmm_zsd_linkage; + zone_t *vmm_zone; + vmm_zsd_t *vmm_zsd; +}; +typedef struct vmm_softc vmm_softc_t; + +void vmm_zsd_init(void); +void vmm_zsd_fini(void); +int vmm_zsd_add_vm(vmm_softc_t *sc); +void vmm_zsd_rem_vm(vmm_softc_t *sc); +int vmm_do_vm_destroy(vmm_softc_t *, boolean_t); + +#endif /* _KERNEL */ + +#endif /* _VMM_IMPL_H_ */ diff --git a/usr/src/uts/i86pc/sys/vmm_instruction_emul.h b/usr/src/uts/i86pc/sys/vmm_instruction_emul.h new file mode 100644 index 0000000000..f10f407164 --- /dev/null +++ b/usr/src/uts/i86pc/sys/vmm_instruction_emul.h @@ -0,0 +1,137 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE.
+ * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + */ + +#ifndef _VMM_INSTRUCTION_EMUL_H_ +#define _VMM_INSTRUCTION_EMUL_H_ + +#include <sys/mman.h> + +/* + * Callback functions to read and write memory regions. + */ +typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa, + uint64_t *rval, int rsize, void *arg); + +typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa, + uint64_t wval, int wsize, void *arg); + +/* + * Emulate the decoded 'vie' instruction. + * + * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region + * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the + * callback functions. + * + * 'void *vm' should be 'struct vm *' when called from kernel context and + * 'struct vmctx *' when called from user context. + * s + */ +int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t mrr, + mem_region_write_t mrw, void *mrarg); + +int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, + uint64_t val, int size); + +/* + * Returns 1 if an alignment check exception should be injected and 0 otherwise. + */ +int vie_alignment_check(int cpl, int operand_size, uint64_t cr0, + uint64_t rflags, uint64_t gla); + +/* Returns 1 if the 'gla' is not canonical and 0 otherwise. */ +int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla); + +uint64_t vie_size2mask(int size); + +int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, + struct seg_desc *desc, uint64_t off, int length, int addrsize, int prot, + uint64_t *gla); + +#ifdef _KERNEL +/* + * APIs to fetch and decode the instruction from nested page fault handler. + * + * 'vie' must be initialized before calling 'vmm_fetch_instruction()' + */ +int vmm_fetch_instruction(struct vm *vm, int cpuid, + struct vm_guest_paging *guest_paging, + uint64_t rip, int inst_length, struct vie *vie, + int *is_fault); + +/* + * Translate the guest linear address 'gla' to a guest physical address. + * + * retval is_fault Interpretation + * 0 0 'gpa' contains result of the translation + * 0 1 An exception was injected into the guest + * EFAULT N/A An unrecoverable hypervisor error occurred + */ +int vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *is_fault); + +/* + * Like vm_gla2gpa, but no exceptions are injected into the guest and + * PTEs are not changed. + */ +int vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *is_fault); + +void vie_init(struct vie *vie, const char *inst_bytes, int inst_length); + +/* + * Decode the instruction fetched into 'vie' so it can be emulated. + * + * 'gla' is the guest linear address provided by the hardware assist + * that caused the nested page table fault. It is used to verify that + * the software instruction decoding is in agreement with the hardware. + * + * Some hardware assists do not provide the 'gla' to the hypervisor. 
+ * To skip the 'gla' verification for this or any other reason pass + * in VIE_INVALID_GLA instead. + */ +#define VIE_INVALID_GLA (1UL << 63) /* a non-canonical address */ +int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, + enum vm_cpu_mode cpu_mode, int csd, struct vie *vie); +#endif /* _KERNEL */ + +#endif /* _VMM_INSTRUCTION_EMUL_H_ */ diff --git a/usr/src/uts/i86pc/unix/Makefile b/usr/src/uts/i86pc/unix/Makefile index 933b36a938..188d72c6f8 100644 --- a/usr/src/uts/i86pc/unix/Makefile +++ b/usr/src/uts/i86pc/unix/Makefile @@ -167,8 +167,8 @@ clobber: $(CLOBBER_DEPS) install: $(INSTALL_DEPS) -MAPFILE_32 = $(MAPFILE) -MAPFILE_64 = $(MAPFILE).amd64 +MAPFILE_32 = $(UNIX_MAPFILE) +MAPFILE_64 = $(UNIX_MAPFILE).amd64 MAPFILE_NAME = $(MAPFILE_$(CLASS)) diff --git a/usr/src/uts/i86pc/viona/Makefile b/usr/src/uts/i86pc/viona/Makefile new file mode 100644 index 0000000000..dac59c9a45 --- /dev/null +++ b/usr/src/uts/i86pc/viona/Makefile @@ -0,0 +1,88 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# Copyright 2019 Joyent, Inc. +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = viona +OBJECTS = $(VIONA_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(VIONA_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/i86pc/io/viona +MAPFILE = $(UTSBASE)/i86pc/io/viona/viona.mapfile + +# +# Include common rules. +# +include $(UTSBASE)/i86pc/Makefile.i86pc + +# +# Define targets +# +ALL_TARGET = $(BINARY) $(SRC_CONFILE) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +# +# Overrides +# +LINTTAGS += -erroff=E_ASSIGN_NARROW_CONV +LINTTAGS += -erroff=E_FUNC_ARG_UNUSED +LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN +LINTTAGS += -erroff=E_FUNC_RET_MAYBE_IGNORED2 +LINTTAGS += -erroff=E_FUNC_RET_ALWAYS_IGNOR2 + +# needs work +SMOFF += all_func_returns + +ALL_BUILDS = $(ALL_BUILDSONLY64) +DEF_BUILDS = $(DEF_BUILDSONLY64) + +CFLAGS += $(CCVERBOSE) +LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Nmisc/dls -Ndrv/vmm -Nmisc/neti +LDFLAGS += -Nmisc/hook +LDFLAGS += -M $(MAPFILE) + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/i86pc/Makefile.targ diff --git a/usr/src/uts/i86pc/vm/hat_i86.c b/usr/src/uts/i86pc/vm/hat_i86.c index 6f296841ea..2bac383b9c 100644 --- a/usr/src/uts/i86pc/vm/hat_i86.c +++ b/usr/src/uts/i86pc/vm/hat_i86.c @@ -3917,7 +3917,7 @@ hat_page_getattr(struct page *pp, uint_t flag) /* - * common code used by hat_pageunload() and hment_steal() + * common code used by hat_page_inval() and hment_steal() */ hment_t * hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry) @@ -3973,15 +3973,13 @@ hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry) extern int vpm_enable; /* - * Unload all translations to a page. 
If the page is a subpage of a large + * Unload translations to a page. If the page is a subpage of a large * page, the large page mappings are also removed. - * - * The forceflags are unused. + * If curhat is not NULL, then we only unload the translation + * for the given process, otherwise all translations are unloaded. */ - -/*ARGSUSED*/ -static int -hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag) +void +hat_page_inval(struct page *pp, uint_t pg_szcd, struct hat *curhat) { page_t *cur_pp = pp; hment_t *hm; @@ -3989,15 +3987,10 @@ hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag) htable_t *ht; uint_t entry; level_t level; + ulong_t cnt; XPV_DISALLOW_MIGRATE(); - /* - * prevent recursion due to kmem_free() - */ - ++curthread->t_hatdepth; - ASSERT(curthread->t_hatdepth < 16); - #if defined(__amd64) /* * clear the vpm ref. @@ -4010,6 +4003,8 @@ hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag) * The loop with next_size handles pages with multiple pagesize mappings */ next_size: + if (curhat != NULL) + cnt = hat_page_getshare(cur_pp); for (;;) { /* @@ -4021,14 +4016,13 @@ next_size: if (hm == NULL) { x86_hm_exit(cur_pp); +curproc_done: /* * If not part of a larger page, we're done. */ if (cur_pp->p_szc <= pg_szcd) { - ASSERT(curthread->t_hatdepth > 0); - --curthread->t_hatdepth; XPV_ALLOW_MIGRATE(); - return (0); + return; } /* @@ -4047,8 +4041,20 @@ next_size: * If this mapping size matches, remove it. */ level = ht->ht_level; - if (level == pg_szcd) - break; + if (level == pg_szcd) { + if (curhat == NULL || ht->ht_hat == curhat) + break; + /* + * Unloading only the given process but it's + * not the hat for the current process. Leave + * entry in place. Also do a safety check to + * ensure we don't get in an infinite loop + */ + if (cnt-- == 0) { + x86_hm_exit(cur_pp); + goto curproc_done; + } + } } /* @@ -4058,14 +4064,44 @@ next_size: hm = hati_page_unmap(cur_pp, ht, entry); if (hm != NULL) hment_free(hm); + + /* Perform check above for being part of a larger page. */ + if (curhat != NULL) + goto curproc_done; } } +/* + * Unload translations to a page. If unloadflag is HAT_CURPROC_PGUNLOAD, then + * we only unload the translation for the current process, otherwise all + * translations are unloaded. + */ +static int +hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t unloadflag) +{ + struct hat *curhat = NULL; + + /* + * prevent recursion due to kmem_free() + */ + ++curthread->t_hatdepth; + ASSERT(curthread->t_hatdepth < 16); + + if (unloadflag == HAT_CURPROC_PGUNLOAD) + curhat = curthread->t_procp->p_as->a_hat; + + hat_page_inval(pp, pg_szcd, curhat); + + ASSERT(curthread->t_hatdepth > 0); + --curthread->t_hatdepth; + return (0); +} + int -hat_pageunload(struct page *pp, uint_t forceflag) +hat_pageunload(struct page *pp, uint_t unloadflag) { ASSERT(PAGE_EXCL(pp)); - return (hati_pageunload(pp, 0, forceflag)); + return (hati_pageunload(pp, 0, unloadflag)); } /* diff --git a/usr/src/uts/i86pc/vm/hment.c b/usr/src/uts/i86pc/vm/hment.c index d00d756828..079f64e92e 100644 --- a/usr/src/uts/i86pc/vm/hment.c +++ b/usr/src/uts/i86pc/vm/hment.c @@ -21,10 +21,9 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/sysmacros.h> #include <sys/kmem.h> @@ -37,6 +36,7 @@ #include <vm/hat_i86.h> #include <sys/cmn_err.h> #include <sys/avl.h> +#include <sys/zone.h> /* @@ -323,6 +323,8 @@ hment_insert(hment_t *hm, page_t *pp) ((hment_t *)pp->p_mapping)->hm_prev = hm; pp->p_mapping = hm; + zone_add_page(pp); + /* * Add the hment to the system-wide hash table. */ @@ -464,6 +466,7 @@ hment_assign(htable_t *htable, uint_t entry, page_t *pp, hment_t *hm) pp->p_embed = 1; pp->p_mapping = htable; pp->p_mlentry = entry; + zone_add_page(pp); return; } @@ -545,6 +548,7 @@ hment_remove(page_t *pp, htable_t *ht, uint_t entry) pp->p_mapping = NULL; pp->p_mlentry = 0; pp->p_embed = 0; + zone_rm_page(pp); return (NULL); } @@ -580,6 +584,7 @@ hment_remove(page_t *pp, htable_t *ht, uint_t entry) hm->hm_hashlink = null_avl_link; hm->hm_next = NULL; hm->hm_prev = NULL; + zone_rm_page(pp); return (hm); } diff --git a/usr/src/uts/i86pc/vm/seg_vmm.c b/usr/src/uts/i86pc/vm/seg_vmm.c new file mode 100644 index 0000000000..beb5e81d53 --- /dev/null +++ b/usr/src/uts/i86pc/vm/seg_vmm.c @@ -0,0 +1,471 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * segvmm - Virtual-Machine-Memory segment + * + * The vmm segment driver was designed for mapping regions of kernel memory + * allocated to an HVM instance into userspace for manipulation there. It + * draws direct lineage from the umap segment driver, but meant for larger + * mappings with fewer restrictions. + * + * seg*k*vmm, in contrast, has mappings for every VMM into kas. We use its + * mappings here only to find the relevant PFNs in segvmm_fault_in(). 
+ */ + + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/cred.h> +#include <sys/kmem.h> +#include <sys/lgrp.h> +#include <sys/mman.h> + +#include <vm/hat.h> +#include <vm/hat_pte.h> +#include <vm/htable.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/seg_kmem.h> +#include <vm/seg_vmm.h> + + +static int segvmm_dup(struct seg *, struct seg *); +static int segvmm_unmap(struct seg *, caddr_t, size_t); +static void segvmm_free(struct seg *); +static faultcode_t segvmm_fault(struct hat *, struct seg *, caddr_t, size_t, + enum fault_type, enum seg_rw); +static faultcode_t segvmm_faulta(struct seg *, caddr_t); +static int segvmm_setprot(struct seg *, caddr_t, size_t, uint_t); +static int segvmm_checkprot(struct seg *, caddr_t, size_t, uint_t); +static int segvmm_sync(struct seg *, caddr_t, size_t, int, uint_t); +static size_t segvmm_incore(struct seg *, caddr_t, size_t, char *); +static int segvmm_lockop(struct seg *, caddr_t, size_t, int, int, ulong_t *, + size_t); +static int segvmm_getprot(struct seg *, caddr_t, size_t, uint_t *); +static u_offset_t segvmm_getoffset(struct seg *, caddr_t); +static int segvmm_gettype(struct seg *, caddr_t); +static int segvmm_getvp(struct seg *, caddr_t, struct vnode **); +static int segvmm_advise(struct seg *, caddr_t, size_t, uint_t); +static void segvmm_dump(struct seg *); +static int segvmm_pagelock(struct seg *, caddr_t, size_t, struct page ***, + enum lock_type, enum seg_rw); +static int segvmm_setpagesize(struct seg *, caddr_t, size_t, uint_t); +static int segvmm_getmemid(struct seg *, caddr_t, memid_t *); +static int segvmm_capable(struct seg *, segcapability_t); + +static struct seg_ops segvmm_ops = { + .dup = segvmm_dup, + .unmap = segvmm_unmap, + .free = segvmm_free, + .fault = segvmm_fault, + .faulta = segvmm_faulta, + .setprot = segvmm_setprot, + .checkprot = segvmm_checkprot, + .kluster = NULL, + .swapout = NULL, + .sync = segvmm_sync, + .incore = segvmm_incore, + .lockop = segvmm_lockop, + .getprot = segvmm_getprot, + .getoffset = segvmm_getoffset, + .gettype = segvmm_gettype, + .getvp = segvmm_getvp, + .advise = segvmm_advise, + .dump = segvmm_dump, + .pagelock = segvmm_pagelock, + .setpagesize = segvmm_setpagesize, + .getmemid = segvmm_getmemid, + .getpolicy = NULL, + .capable = segvmm_capable, + .inherit = seg_inherit_notsup +}; + + +/* + * Create a kernel/user-mapped segment. ->kaddr is the segkvmm mapping. 
+ */ +int +segvmm_create(struct seg **segpp, void *argsp) +{ + struct seg *seg = *segpp; + segvmm_crargs_t *cra = argsp; + segvmm_data_t *data; + + /* + * Check several aspects of the mapping request to ensure validity: + * - kernel pages must reside entirely in kernel space + * - target protection must be user-accessible + * - kernel address must be page-aligned + */ + if ((uintptr_t)cra->kaddr <= _userlimit || + ((uintptr_t)cra->kaddr + seg->s_size) < (uintptr_t)cra->kaddr || + (cra->prot & PROT_USER) == 0 || + ((uintptr_t)cra->kaddr & PAGEOFFSET) != 0) { + return (EINVAL); + } + + data = kmem_zalloc(sizeof (*data), KM_SLEEP); + rw_init(&data->svmd_lock, NULL, RW_DEFAULT, NULL); + data->svmd_kaddr = (uintptr_t)cra->kaddr; + data->svmd_prot = cra->prot; + data->svmd_cookie = cra->cookie; + data->svmd_hold = cra->hold; + data->svmd_rele = cra->rele; + + /* Since initial checks have passed, grab a reference on the cookie */ + if (data->svmd_hold != NULL) { + data->svmd_hold(data->svmd_cookie); + } + + seg->s_ops = &segvmm_ops; + seg->s_data = data; + return (0); +} + +static int +segvmm_dup(struct seg *seg, struct seg *newseg) +{ + segvmm_data_t *svmd = seg->s_data; + segvmm_data_t *newsvmd; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); + + newsvmd = kmem_zalloc(sizeof (segvmm_data_t), KM_SLEEP); + rw_init(&newsvmd->svmd_lock, NULL, RW_DEFAULT, NULL); + newsvmd->svmd_kaddr = svmd->svmd_kaddr; + newsvmd->svmd_prot = svmd->svmd_prot; + newsvmd->svmd_cookie = svmd->svmd_cookie; + newsvmd->svmd_hold = svmd->svmd_hold; + newsvmd->svmd_rele = svmd->svmd_rele; + + /* Grab another hold for the duplicate segment */ + if (svmd->svmd_hold != NULL) { + newsvmd->svmd_hold(newsvmd->svmd_cookie); + } + + newseg->s_ops = seg->s_ops; + newseg->s_data = newsvmd; + return (0); +} + +static int +segvmm_unmap(struct seg *seg, caddr_t addr, size_t len) +{ + segvmm_data_t *svmd = seg->s_data; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); + + /* Only allow unmap of entire segment */ + if (addr != seg->s_base || len != seg->s_size) { + return (EINVAL); + } + if (svmd->svmd_softlockcnt != 0) { + return (EAGAIN); + } + + /* Unconditionally unload the entire segment range. */ + hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP); + + /* Release the hold this segment possessed */ + if (svmd->svmd_rele != NULL) { + svmd->svmd_rele(svmd->svmd_cookie); + } + + seg_free(seg); + return (0); +} + +static void +segvmm_free(struct seg *seg) +{ + segvmm_data_t *data = seg->s_data; + + ASSERT(data != NULL); + + rw_destroy(&data->svmd_lock); + VERIFY(data->svmd_softlockcnt == 0); + kmem_free(data, sizeof (*data)); + seg->s_data = NULL; +} + +static int +segvmm_fault_in(struct hat *hat, struct seg *seg, uintptr_t va, size_t len) +{ + segvmm_data_t *svmd = seg->s_data; + const uintptr_t koff = svmd->svmd_kaddr - (uintptr_t)seg->s_base; + const uintptr_t end = va + len; + const uintptr_t prot = svmd->svmd_prot; + + /* Stick to the simple non-large-page case for now */ + va &= PAGEMASK; + + do { + htable_t *ht; + uint_t entry, lvl; + size_t psz; + pfn_t pfn; + const uintptr_t kaddr = va + koff; + + ASSERT(kaddr >= (uintptr_t)svmd->svmd_kaddr); + ASSERT(kaddr < ((uintptr_t)svmd->svmd_kaddr + seg->s_size)); + + ht = htable_getpage(kas.a_hat, kaddr, &entry); + if (ht == NULL) { + return (-1); + } + lvl = ht->ht_level; + pfn = PTE2PFN(x86pte_get(ht, entry), lvl); + htable_release(ht); + if (pfn == PFN_INVALID) { + return (-1); + } + + /* For the time being, handling for large pages is absent. 
*/ + psz = PAGESIZE; + pfn += mmu_btop(kaddr & LEVEL_OFFSET(lvl)); + + hat_devload(hat, (caddr_t)va, psz, pfn, prot, HAT_LOAD); + + va = va + psz; + } while (va < end); + + return (0); +} + +/* ARGSUSED */ +static faultcode_t +segvmm_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, + enum fault_type type, enum seg_rw tw) +{ + segvmm_data_t *svmd = seg->s_data; + int err = 0; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + if (type == F_PROT) { + /* + * Since protection on the segment is fixed, there is nothing + * to do but report an error for protection faults. + */ + return (FC_PROT); + } else if (type == F_SOFTUNLOCK) { + size_t plen = btop(len); + + rw_enter(&svmd->svmd_lock, RW_WRITER); + VERIFY(svmd->svmd_softlockcnt >= plen); + svmd->svmd_softlockcnt -= plen; + rw_exit(&svmd->svmd_lock); + return (0); + } + + VERIFY(type == F_INVAL || type == F_SOFTLOCK); + rw_enter(&svmd->svmd_lock, RW_WRITER); + + err = segvmm_fault_in(hat, seg, (uintptr_t)addr, len); + if (type == F_SOFTLOCK && err == 0) { + size_t nval = svmd->svmd_softlockcnt + btop(len); + + if (svmd->svmd_softlockcnt >= nval) { + rw_exit(&svmd->svmd_lock); + return (FC_MAKE_ERR(EOVERFLOW)); + } + svmd->svmd_softlockcnt = nval; + } + + rw_exit(&svmd->svmd_lock); + return (err); +} + +/* ARGSUSED */ +static faultcode_t +segvmm_faulta(struct seg *seg, caddr_t addr) +{ + /* Do nothing since asynch pagefault should not load translation. */ + return (0); +} + +/* ARGSUSED */ +static int +segvmm_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) +{ + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + /* The seg_vmm driver does not yet allow protection to be changed. */ + return (EACCES); +} + +/* ARGSUSED */ +static int +segvmm_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) +{ + segvmm_data_t *svmd = seg->s_data; + int error = 0; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + rw_enter(&svmd->svmd_lock, RW_READER); + if ((svmd->svmd_prot & prot) != prot) { + error = EACCES; + } + rw_exit(&svmd->svmd_lock); + return (error); +} + +/* ARGSUSED */ +static int +segvmm_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) +{ + /* Always succeed since there are no backing store to sync */ + return (0); +} + +/* ARGSUSED */ +static size_t +segvmm_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) +{ + size_t sz = 0; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + len = (len + PAGEOFFSET) & PAGEMASK; + while (len > 0) { + *vec = 1; + sz += PAGESIZE; + vec++; + len -= PAGESIZE; + } + return (sz); +} + +/* ARGSUSED */ +static int +segvmm_lockop(struct seg *seg, caddr_t addr, size_t len, int attr, int op, + ulong_t *lockmap, size_t pos) +{ + /* Report success since kernel pages are always in memory. */ + return (0); +} + +static int +segvmm_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) +{ + segvmm_data_t *svmd = seg->s_data; + size_t pgno; + uint_t prot; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + rw_enter(&svmd->svmd_lock, RW_READER); + prot = svmd->svmd_prot; + rw_exit(&svmd->svmd_lock); + + /* + * Reporting protection is simple since it is not tracked per-page. + */ + pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; + while (pgno > 0) { + protv[--pgno] = prot; + } + return (0); +} + +/* ARGSUSED */ +static u_offset_t +segvmm_getoffset(struct seg *seg, caddr_t addr) +{ + /* + * To avoid leaking information about the layout of the kernel address + * space, always report '0' as the offset. 
+ */ + return (0); +} + +/* ARGSUSED */ +static int +segvmm_gettype(struct seg *seg, caddr_t addr) +{ + /* + * Since already-existing kernel pages are being mapped into userspace, + * always report the segment type as shared. + */ + return (MAP_SHARED); +} + +/* ARGSUSED */ +static int +segvmm_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) +{ + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + *vpp = NULL; + return (0); +} + +/* ARGSUSED */ +static int +segvmm_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) +{ + if (behav == MADV_PURGE) { + /* Purge does not make sense for this mapping */ + return (EINVAL); + } + /* Indicate success for everything else. */ + return (0); +} + +/* ARGSUSED */ +static void +segvmm_dump(struct seg *seg) +{ + /* + * Since this is a mapping to share kernel data with userspace, nothing + * additional should be dumped. + */ +} + +/* ARGSUSED */ +static int +segvmm_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, + enum lock_type type, enum seg_rw rw) +{ + return (ENOTSUP); +} + +/* ARGSUSED */ +static int +segvmm_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) +{ + return (ENOTSUP); +} + +static int +segvmm_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) +{ + segvmm_data_t *svmd = seg->s_data; + + memidp->val[0] = (uintptr_t)svmd->svmd_kaddr; + memidp->val[1] = (uintptr_t)(addr - seg->s_base); + return (0); +} + +/* ARGSUSED */ +static int +segvmm_capable(struct seg *seg, segcapability_t capability) +{ + /* no special capablities */ + return (0); +} diff --git a/usr/src/uts/i86pc/vm/seg_vmm.h b/usr/src/uts/i86pc/vm/seg_vmm.h new file mode 100644 index 0000000000..f5b95c6a27 --- /dev/null +++ b/usr/src/uts/i86pc/vm/seg_vmm.h @@ -0,0 +1,50 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _VM_SEG_VMM_H +#define _VM_SEG_VMM_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct segvmm_crargs { + caddr_t kaddr; + uchar_t prot; /* protection */ + void *cookie; /* opaque resource backing memory */ + void (*hold)(void *); /* add reference to cookie */ + void (*rele)(void *); /* release reference to cookie */ +} segvmm_crargs_t; + +typedef void (*segvmm_holdfn_t)(void *); +typedef void (*segvmm_relefn_t)(void *); + +typedef struct segvmm_data { + krwlock_t svmd_lock; + uintptr_t svmd_kaddr; + uchar_t svmd_prot; + void *svmd_cookie; + segvmm_holdfn_t svmd_hold; + segvmm_relefn_t svmd_rele; + size_t svmd_softlockcnt; +} segvmm_data_t; + +extern int segvmm_create(struct seg **, void *); + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_VMM_H */ diff --git a/usr/src/uts/i86pc/vm/vm_machdep.c b/usr/src/uts/i86pc/vm/vm_machdep.c index c1326a6643..9cba487a0a 100644 --- a/usr/src/uts/i86pc/vm/vm_machdep.c +++ b/usr/src/uts/i86pc/vm/vm_machdep.c @@ -711,10 +711,8 @@ void map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags) { struct proc *p = curproc; - caddr_t userlimit = (flags & _MAP_LOW32) ? 
- (caddr_t)_userlimit32 : p->p_as->a_userlimit; - - map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags); + map_addr_proc(addrp, len, off, vacalign, + map_userlimit(p, p->p_as, flags), curproc, flags); } /*ARGSUSED*/ @@ -3579,7 +3577,7 @@ page_create_io( if (nscan < desscan && freemem < minfree) { TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, "pageout_cv_signal:freemem %ld", freemem); - cv_signal(&proc_pageout->p_cv); + WAKE_PAGEOUT_SCANNER(); } if (flags & PG_PHYSCONTIG) { diff --git a/usr/src/uts/i86pc/vmm/Makefile b/usr/src/uts/i86pc/vmm/Makefile new file mode 100644 index 0000000000..018a05ab92 --- /dev/null +++ b/usr/src/uts/i86pc/vmm/Makefile @@ -0,0 +1,155 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# Copyright 2019 Joyent, Inc. +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = vmm +OBJECTS = $(VMM_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(VMM_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/i86pc/io/vmm +MAPFILE = $(UTSBASE)/i86pc/io/vmm/vmm.mapfile + +# +# Include common rules. +# +include $(UTSBASE)/i86pc/Makefile.i86pc + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +# +# Overrides and additions +# +LINTTAGS += -erroff=E_EMPTY_DECLARATION +LINTTAGS += -erroff=E_OPERANDS_INCOMPATIBLE_TYPES +LINTTAGS += -erroff=E_VOID_CANT_RETURN_VALUE +LINTTAGS += -erroff=E_YACC_ERROR +LINTTAGS += -erroff=E_STATIC_UNUSED +LINTTAGS += -erroff=E_FUNC_RET_MAYBE_IGNORED2 +LINTTAGS += -erroff=E_FUNC_RET_ALWAYS_IGNOR2 +LINTTAGS += -erroff=E_BAD_FORMAT_ARG_TYPE2 +LINTTAGS += -erroff=E_FUNC_ARG_UNUSED +LINTTAGS += -erroff=E_FUNC_SET_NOT_USED +LINTTAGS += -erroff=E_ASSIGN_NARROW_CONV +LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN +LINTTAGS += -erroff=E_CONSTANT_CONDITION +LINTTAGS += -erroff=E_PTR_TO_VOID_IN_ARITHMETIC +LINTTAGS += -erroff=E_CONST_TRUNCATED_BY_ASSIGN +LINTTAGS += -erroff=E_NOP_ELSE_STMT +LINTTAGS += -erroff=E_FUNC_NO_RET_VAL +LINTTAGS += -erroff=E_OLD_STYLE_DECL_OR_BAD_TYPE +LINTTAGS += -erroff=E_VAR_USED_BEFORE_SET +LINTTAGS += -erroff=E_INTEGER_OVERFLOW_DETECTED +LINTTAGS += -erroff=E_STMT_NOT_REACHED +LINTTAGS += -erroff=E_FUNC_NO_RET_VAL +LINTTAGS += -erroff=E_USELESS_DECLARATION +LINTTAGS += -erroff=E_EXPR_NULL_EFFECT +LINTTAGS += -erroff=E_CASE_FALLTHRU +LINTTAGS += -erroff=E_FUNC_DECL_VAR_ARG2 +LINTTAGS += -erroff=E_ASM_IMPOSSIBLE_CONSTRAINT +LINTTAGS += -erroff=E_ASM_UNUSED_PARAM +LINTTAGS += -erroff=E_NOP_IF_STMT +LINTTAGS += -erroff=E_ZERO_OR_NEGATIVE_SUBSCRIPT + +CERRWARN += -_gcc=-Wno-empty-body + +# 3rd party code +SMOFF += all_func_returns + +# needs work +$(OBJS_DIR)/vmm_sol_dev.o := SMOFF += signed_integer_overflow_check + +# a can't happen: vmx_setcap() warn: variable dereferenced before check 'pptr' +$(OBJS_DIR)/vmx.o := SMOFF += deref_check + +# These sources only compile with gcc. Workaround a confluence of cruft +# regarding dmake and shadow compilation by neutering the sun compiler. 
+#amd64_CC = $(ONBLD_TOOLS)/bin/$(MACH)/cw -_gcc +#CFLAGS += -_cc=-xdryrun + +ALL_BUILDS = $(ALL_BUILDSONLY64) +DEF_BUILDS = $(DEF_BUILDSONLY64) +PRE_INC_PATH = -I$(COMPAT)/freebsd -I$(COMPAT)/freebsd/amd64 \ + -I$(CONTRIB)/freebsd -I$(CONTRIB)/freebsd/amd64 +INC_PATH += -I$(UTSBASE)/i86pc/io/vmm -I$(UTSBASE)/i86pc/io/vmm/io +AS_INC_PATH += -I$(UTSBASE)/i86pc/io/vmm -I$(OBJS_DIR) + +CFLAGS += -_gcc=-Wimplicit-function-declaration +# The FreeBSD %# notation makes gcc gripe +CFLAGS += -_gcc=-Wno-format +# enable collection of VMM statistics +CFLAGS += -DVMM_KEEP_STATS + +$(OBJS_DIR)/vmm.o := CERRWARN += -_gcc=-Wno-pointer-sign -_gcc=-Wno-type-limits +$(OBJS_DIR)/svm.o := CERRWARN += -_gcc=-Wno-pointer-sign -_gcc=-Wno-type-limits +$(OBJS_DIR)/vmx.o := CERRWARN += -_gcc=-Wno-unused-variable +$(OBJS_DIR)/iommu.o := CERRWARN += -_gcc=-Wno-unused-variable + +LDFLAGS += -N misc/acpica -N misc/pcie -N fs/dev -z type=kmod +LDFLAGS += -M $(MAPFILE) + +OFFSETS_VMX = $(CONF_SRCDIR)/intel/offsets.in +OFFSETS_SVM = $(CONF_SRCDIR)/amd/offsets.in +ASSYM_VMX = $(OBJS_DIR)/vmx_assym.h +ASSYM_SVM = $(OBJS_DIR)/svm_assym.h +ASSYM_H = $(ASSYM_VMX) $(ASSYM_SVM) + +CLEANFILES += $(ASSYM_H) + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/i86pc/Makefile.targ + +$(ASSYM_VMX): $(OFFSETS_VMX) $(GENASSYM) + $(OFFSETS_CREATE) -I../../i86pc/io/vmm < $(OFFSETS_VMX) >$@ +$(ASSYM_SVM): $(OFFSETS_SVM) $(GENASSYM) + $(OFFSETS_CREATE) -I../../i86pc/io/vmm < $(OFFSETS_SVM) >$@ + +$(OBJS_DIR)/vmx_support.o: $(ASSYM_VMX) +$(OBJS_DIR)/svm_support.o: $(ASSYM_SVM) |
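The vmmctl ioctls introduced in sys/vmm_dev.h above (VMM_CREATE_VM, VMM_DESTROY_VM, VMM_VM_SUPPORTED) are numbered by hand from VMM_IOC_BASE = ('V' << 16) | ('M' << 8) rather than with the _IO* macros used for the per-VM ioctls. A minimal userland sketch of probing for hypervisor support follows; the read-only open and the argument-less VMM_VM_SUPPORTED call are assumptions for illustration, not guarantees made by the header.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>

/* Mirrors the non-FreeBSD block of sys/vmm_dev.h shown above. */
#define	VMM_CTL_DEV	"/dev/vmmctl"
#define	VMM_IOC_BASE	(('V' << 16) | ('M' << 8))
#define	VMM_VM_SUPPORTED	(VMM_IOC_BASE | 0x03)

int
main(void)
{
	int fd = open(VMM_CTL_DEV, O_RDONLY);

	if (fd < 0) {
		perror("open " VMM_CTL_DEV);
		return (1);
	}
	/* Assumption: VMM_VM_SUPPORTED takes no ioctl argument. */
	if (ioctl(fd, VMM_VM_SUPPORTED) == 0) {
		(void) printf("bhyve appears usable on this host\n");
	}
	(void) close(fd);
	return (0);
}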
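vmm_impl.h above explains that devmem segments are not separate character devices; they are reached by mmap(2)ing the per-VM vmm node at offsets beginning at VM_DEVMEM_START, with VM_DEVMEM_GETOFFSET (from sys/vmm_dev.h) translating a segment ID into that offset. A hedged sketch of the expected consumer pattern follows; the member names of struct vm_devmem_offset (segid in, offset out) are assumed here, since the structure body is not part of this hunk.

#include <sys/types.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/vmm_dev.h>	/* VM_DEVMEM_GETOFFSET, struct vm_devmem_offset */

/*
 * Sketch: map 'len' bytes of one of a VM's devmem segments into this
 * process.  'vmfd' is an open descriptor for the per-VM vmm node.
 */
static void *
demo_map_devmem(int vmfd, int segid, size_t len)
{
	struct vm_devmem_offset vdo;

	vdo.segid = segid;	/* assumed member name */
	if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) != 0)
		return (MAP_FAILED);

	/* 'offset' is also an assumed member name: the mapping offset out */
	return (mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
	    vmfd, (off_t)vdo.offset));
}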
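The mem_region_read_t and mem_region_write_t callback types declared in vmm_instruction_emul.h above are the hooks through which vmm_emulate_instruction() completes an MMIO access. A sketch of a backing pair for an invented flat register window (the device structure and names here are illustrative only and match nothing in this changeset):

#include <sys/types.h>
#include <sys/errno.h>

/* Hypothetical MMIO device state; not part of the headers above. */
typedef struct demo_dev {
	uint64_t	dd_base;	/* guest-physical base of the window */
	uint64_t	dd_regs[16];	/* sixteen 8-byte registers */
} demo_dev_t;

static int
demo_mmio_read(void *vm, int cpuid, uint64_t gpa, uint64_t *rval,
    int rsize, void *arg)
{
	demo_dev_t *dd = arg;
	uint64_t off = gpa - dd->dd_base;

	if (off + rsize > sizeof (dd->dd_regs))
		return (EINVAL);
	/* Sub-8-byte accesses would need masking; elided for brevity. */
	*rval = dd->dd_regs[off / 8];
	return (0);
}

static int
demo_mmio_write(void *vm, int cpuid, uint64_t gpa, uint64_t wval,
    int wsize, void *arg)
{
	demo_dev_t *dd = arg;
	uint64_t off = gpa - dd->dd_base;

	if (off + wsize > sizeof (dd->dd_regs))
		return (EINVAL);
	dd->dd_regs[off / 8] = wval;
	return (0);
}

A decoded instruction would then be completed with vmm_emulate_instruction(vm, cpuid, gpa, vie, paging, demo_mmio_read, demo_mmio_write, dd), matching the mrr/mrw/mrarg parameters documented above.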
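The comment block above vm_gla2gpa() encodes a three-way result: the return value reports hypervisor errors, while *is_fault reports that an exception was injected into the guest. A short kernel-context sketch of how a caller is expected to tell the cases apart, using only the signature declared above:

#include <sys/mman.h>
#include <sys/vmm_instruction_emul.h>

/*
 * Sketch: translate a guest linear address for a read access and classify
 * the outcome per the table in vmm_instruction_emul.h.  'vm', 'vcpuid' and
 * 'paging' are assumed to come from the surrounding exit-handling context.
 */
static int
demo_translate_read(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, uint64_t *gpa)
{
	int error, fault;

	error = vm_gla2gpa(vm, vcpuid, paging, gla, PROT_READ, gpa, &fault);
	if (error != 0)
		return (error);		/* unrecoverable hypervisor error */
	if (fault != 0)
		return (-1);		/* exception already injected */
	return (0);			/* *gpa holds the translation */
}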
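The checks in segvmm_create() above assume that a caller has already chosen a user address range and passes a filled-in segvmm_crargs_t through as_map(); the segment driver is never invoked directly. A hedged sketch of that caller side (the as_rangelock()/as_map() pattern is the conventional one for segment drivers and is assumed here rather than taken from this diff; the hold/rele callbacks and error handling are simplified):

#include <vm/as.h>
#include <vm/seg_vmm.h>

/*
 * Sketch: map 'len' bytes of kernel memory at 'kaddr' into address space
 * 'as' at '*addrp'.  'cookie', 'hold' and 'rele' manage the lifetime of
 * the backing object, as described in seg_vmm.h above.  Per the validity
 * checks in segvmm_create(), 'prot' must include PROT_USER and 'kaddr'
 * must be page-aligned and entirely in kernel space.
 */
static int
demo_segvmm_map(struct as *as, caddr_t *addrp, size_t len, caddr_t kaddr,
    uchar_t prot, void *cookie, void (*hold)(void *), void (*rele)(void *))
{
	segvmm_crargs_t vmm_args;
	int error;

	vmm_args.kaddr = kaddr;
	vmm_args.prot = prot;
	vmm_args.cookie = cookie;
	vmm_args.hold = hold;
	vmm_args.rele = rele;

	as_rangelock(as);
	error = as_map(as, *addrp, len, segvmm_create, &vmm_args);
	as_rangeunlock(as);
	return (error);
}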
