author     Patrick Mooney <pmooney@pfmooney.com>    2021-09-05 01:38:39 +0000
committer  Patrick Mooney <pmooney@oxide.computer>  2021-11-19 23:00:59 +0000
commit     0153d828c132fdb1a17c11b99386a3d1b87994cf (patch)
tree       c670df2f1d9cfceb92709c3cb2862fdd1f97f90a
parent     d8f839f91e21bea2f5200f95df55608cbecdeeb9 (diff)
download   illumos-joyent-0153d828c132fdb1a17c11b99386a3d1b87994cf.tar.gz
13896 bhyve VM interfaces should be better fit
13981 bhyve emulation should set dirty bits
Reviewed by: Dan Cross <cross@oxidecomputer.com>
Reviewed by: Joshua M. Clulow <josh@sysmgr.org>
Approved by: Dan McDonald <danmcd@joyent.com>
-rw-r--r--  usr/src/compat/bhyve/amd64/machine/md_var.h | 2
-rw-r--r--  usr/src/compat/bhyve/amd64/machine/pmap.h | 489
-rw-r--r--  usr/src/compat/bhyve/amd64/machine/smp.h | 30
-rw-r--r--  usr/src/compat/bhyve/sys/smp.h | 26
-rw-r--r--  usr/src/contrib/bhyve/amd64/machine/vm.h | 45
-rw-r--r--  usr/src/uts/i86pc/Makefile.files | 5
-rw-r--r--  usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c | 2
-rw-r--r--  usr/src/uts/i86pc/io/vmm/amd/npt.c | 77
-rw-r--r--  usr/src/uts/i86pc/io/vmm/amd/npt.h | 38
-rw-r--r--  usr/src/uts/i86pc/io/vmm/amd/svm.c | 44
-rw-r--r--  usr/src/uts/i86pc/io/vmm/amd/svm_softc.h | 4
-rw-r--r--  usr/src/uts/i86pc/io/vmm/intel/ept.c | 170
-rw-r--r--  usr/src/uts/i86pc/io/vmm/intel/ept.h | 41
-rw-r--r--  usr/src/uts/i86pc/io/vmm/intel/offsets.in | 13
-rw-r--r--  usr/src/uts/i86pc/io/vmm/intel/vmx.c | 153
-rw-r--r--  usr/src/uts/i86pc/io/vmm/intel/vmx.h | 12
-rw-r--r--  usr/src/uts/i86pc/io/vmm/intel/vmx_support.s | 50
-rw-r--r--  usr/src/uts/i86pc/io/vmm/intel/vtd.c | 2
-rw-r--r--  usr/src/uts/i86pc/io/vmm/io/iommu.c | 7
-rw-r--r--  usr/src/uts/i86pc/io/vmm/io/ppt.c | 1
-rw-r--r--  usr/src/uts/i86pc/io/vmm/io/vlapic.c | 7
-rw-r--r--  usr/src/uts/i86pc/io/vmm/io/vlapic.h | 4
-rw-r--r--  usr/src/uts/i86pc/io/vmm/seg_vmm.c | 193
-rw-r--r--  usr/src/uts/i86pc/io/vmm/sys/seg_vmm.h | 3
-rw-r--r--  usr/src/uts/i86pc/io/vmm/sys/vmm_gpt.h | 31
-rw-r--r--  usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h | 23
-rw-r--r--  usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h | 213
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm.c | 271
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_gpt.c | 147
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c | 126
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_lapic.c | 2
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_mem.c | 113
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_mem.h | 54
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c | 86
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c | 119
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c | 17
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c | 112
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c | 932
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_vm.c | 1430
-rw-r--r--  usr/src/uts/intel/sys/x86_archext.h | 18
40 files changed, 2152 insertions, 2960 deletions
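
The hunks below share one theme: the FreeBSD pmap_t plumbing is removed from the per-backend (SVM/VMX) entry points in favor of the illumos vmspace/vm_client interfaces (vmspace_table_root(), vmspace_table_gen(), vmc_table_enter()/vmc_table_exit()). As a rough, reconstructed sketch of the narrowed backend contract (field names and signatures are taken from the svm.c and vmx.c hunks below; the authoritative definition lives in the vmm_kernel.h change, which is not shown in this excerpt):

struct vmm_ops {
	int	(*init)(void);		/* formerly took an ipinum argument */
	int	(*cleanup)(void);
	void	(*resume)(void);

	void	*(*vminit)(struct vm *);	/* pmap_t argument dropped */
	int	(*vmrun)(void *, int vcpu, uint64_t rip);	/* no pmap_t */
	void	(*vmcleanup)(void *);
	/* register/descriptor/capability and vlapic ops are unchanged */
	/* .vmspace_alloc and .vmspace_free are removed entirely */
};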
diff --git a/usr/src/compat/bhyve/amd64/machine/md_var.h b/usr/src/compat/bhyve/amd64/machine/md_var.h
index ed57a8bebc..ca3d68ef95 100644
--- a/usr/src/compat/bhyve/amd64/machine/md_var.h
+++ b/usr/src/compat/bhyve/amd64/machine/md_var.h
@@ -23,6 +23,4 @@ extern char cpu_vendor[]; /* CPU Origin code */
#include <sys/systm.h>
-#define Maxmem (physmax + 1)
-
#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_MD_VAR_H_ */
diff --git a/usr/src/compat/bhyve/amd64/machine/pmap.h b/usr/src/compat/bhyve/amd64/machine/pmap.h
deleted file mode 100644
index 3b94d1b1a9..0000000000
--- a/usr/src/compat/bhyve/amd64/machine/pmap.h
+++ /dev/null
@@ -1,489 +0,0 @@
-/*
- * All rights reserved. This copyright notice is Copyright Management
- * Information under 17 USC 1202 and is included to protect this work and
- * deter copyright infringement. Removal or alteration of this Copyright
- * Management Information without the express written permission from
- * Pluribus Networks Inc is prohibited, and any such unauthorized removal
- * or alteration will be a violation of federal law.
- *
- * Copyright (c) 2003 Peter Wemm.
- * Copyright (c) 1991 Regents of the University of California.
- * All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * the Systems Programming Group of the University of Utah Computer
- * Science Department and William Jolitz of UUNET Technologies Inc.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * Derived from hp300 version by Mike Hibler, this version by William
- * Jolitz uses a recursive map [a pde points to the page directory] to
- * map the page tables using the pagetables themselves. This is done to
- * reduce the impact on kernel virtual memory for lots of sparse address
- * space, and to reduce the cost of memory to each process.
- *
- * from: hp300: @(#)pmap.h 7.2 (Berkeley) 12/16/90
- * from: @(#)pmap.h 7.4 (Berkeley) 5/12/91
- * $FreeBSD$
- */
-
-/*
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- */
-
-/*
- * Copyright 2014 Pluribus Networks Inc.
- */
-
-
-#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_
-#define _COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_
-
-/*
- * Page-directory and page-table entries follow this format, with a few
- * of the fields not present here and there, depending on a lot of things.
- */
- /* ---- Intel Nomenclature ---- */
-#define X86_PG_V 0x001 /* P Valid */
-#define X86_PG_RW 0x002 /* R/W Read/Write */
-#define X86_PG_U 0x004 /* U/S User/Supervisor */
-#define X86_PG_NC_PWT 0x008 /* PWT Write through */
-#define X86_PG_NC_PCD 0x010 /* PCD Cache disable */
-#define X86_PG_A 0x020 /* A Accessed */
-#define X86_PG_M 0x040 /* D Dirty */
-#define X86_PG_PS 0x080 /* PS Page size (0=4k,1=2M) */
-#define X86_PG_PTE_PAT 0x080 /* PAT PAT index */
-#define X86_PG_G 0x100 /* G Global */
-#define X86_PG_AVAIL1 0x200 /* / Available for system */
-#define X86_PG_AVAIL2 0x400 /* < programmers use */
-#define X86_PG_AVAIL3 0x800 /* \ */
-#define X86_PG_PDE_PAT 0x1000 /* PAT PAT index */
-#define X86_PG_NX (1ul<<63) /* No-execute */
-#define X86_PG_AVAIL(x) (1ul << (x))
-
-/* Page level cache control fields used to determine the PAT type */
-#define X86_PG_PDE_CACHE (X86_PG_PDE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD)
-#define X86_PG_PTE_CACHE (X86_PG_PTE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD)
-
-/*
- * Intel extended page table (EPT) bit definitions.
- */
-#define EPT_PG_READ 0x001 /* R Read */
-#define EPT_PG_WRITE 0x002 /* W Write */
-#define EPT_PG_EXECUTE 0x004 /* X Execute */
-#define EPT_PG_IGNORE_PAT 0x040 /* IPAT Ignore PAT */
-#define EPT_PG_PS 0x080 /* PS Page size */
-#define EPT_PG_A 0x100 /* A Accessed */
-#define EPT_PG_M 0x200 /* D Dirty */
-#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) /* MT Memory Type */
-
-/*
- * Define the PG_xx macros in terms of the bits on x86 PTEs.
- */
-#define PG_V X86_PG_V
-#define PG_RW X86_PG_RW
-#define PG_U X86_PG_U
-#define PG_NC_PWT X86_PG_NC_PWT
-#define PG_NC_PCD X86_PG_NC_PCD
-#define PG_A X86_PG_A
-#define PG_M X86_PG_M
-#define PG_PS X86_PG_PS
-#define PG_PTE_PAT X86_PG_PTE_PAT
-#define PG_G X86_PG_G
-#define PG_AVAIL1 X86_PG_AVAIL1
-#define PG_AVAIL2 X86_PG_AVAIL2
-#define PG_AVAIL3 X86_PG_AVAIL3
-#define PG_PDE_PAT X86_PG_PDE_PAT
-#define PG_NX X86_PG_NX
-#define PG_PDE_CACHE X86_PG_PDE_CACHE
-#define PG_PTE_CACHE X86_PG_PTE_CACHE
-
-/* Our various interpretations of the above */
-#define PG_W X86_PG_AVAIL3 /* "Wired" pseudoflag */
-#define PG_MANAGED X86_PG_AVAIL2
-#define EPT_PG_EMUL_V X86_PG_AVAIL(52)
-#define EPT_PG_EMUL_RW X86_PG_AVAIL(53)
-#define PG_PROMOTED X86_PG_AVAIL(54) /* PDE only */
-#define PG_FRAME (0x000ffffffffff000ul)
-#define PG_PS_FRAME (0x000fffffffe00000ul)
-
-/*
- * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB
- * (PTE) page mappings have identical settings for the following fields:
- */
-#define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_CACHE | \
- PG_M | PG_A | PG_U | PG_RW | PG_V)
-
-/*
- * Page Protection Exception bits
- */
-
-#define PGEX_P 0x01 /* Protection violation vs. not present */
-#define PGEX_W 0x02 /* during a Write cycle */
-#define PGEX_U 0x04 /* access from User mode (UPL) */
-#define PGEX_RSV 0x08 /* reserved PTE field is non-zero */
-#define PGEX_I 0x10 /* during an instruction fetch */
-
-/*
- * undef the PG_xx macros that define bits in the regular x86 PTEs that
- * have a different position in nested PTEs. This is done when compiling
- * code that needs to be aware of the differences between regular x86 and
- * nested PTEs.
- *
- * The appropriate bitmask will be calculated at runtime based on the pmap
- * type.
- */
-#ifdef AMD64_NPT_AWARE
-#undef PG_AVAIL1 /* X86_PG_AVAIL1 aliases with EPT_PG_M */
-#undef PG_G
-#undef PG_A
-#undef PG_M
-#undef PG_PDE_PAT
-#undef PG_PDE_CACHE
-#undef PG_PTE_PAT
-#undef PG_PTE_CACHE
-#undef PG_RW
-#undef PG_V
-#endif
-
-/*
- * Pte related macros. This is complicated by having to deal with
- * the sign extension of the 48th bit.
- */
-#define KVADDR(l4, l3, l2, l1) ( \
- ((unsigned long)-1 << 47) | \
- ((unsigned long)(l4) << PML4SHIFT) | \
- ((unsigned long)(l3) << PDPSHIFT) | \
- ((unsigned long)(l2) << PDRSHIFT) | \
- ((unsigned long)(l1) << PAGE_SHIFT))
-
-#define UVADDR(l4, l3, l2, l1) ( \
- ((unsigned long)(l4) << PML4SHIFT) | \
- ((unsigned long)(l3) << PDPSHIFT) | \
- ((unsigned long)(l2) << PDRSHIFT) | \
- ((unsigned long)(l1) << PAGE_SHIFT))
-
-/*
- * Number of kernel PML4 slots. Can be anywhere from 1 to 64 or so,
- * but setting it larger than NDMPML4E makes no sense.
- *
- * Each slot provides .5 TB of kernel virtual space.
- */
-#define NKPML4E 4
-
-#define NUPML4E (NPML4EPG/2) /* number of userland PML4 pages */
-#define NUPDPE (NUPML4E*NPDPEPG)/* number of userland PDP pages */
-#define NUPDE (NUPDPE*NPDEPG) /* number of userland PD entries */
-
-/*
- * NDMPML4E is the maximum number of PML4 entries that will be
- * used to implement the direct map. It must be a power of two,
- * and should generally exceed NKPML4E. The maximum possible
- * value is 64; using 128 will make the direct map intrude into
- * the recursive page table map.
- */
-#define NDMPML4E 8
-
-/*
- * These values control the layout of virtual memory. The starting address
- * of the direct map, which is controlled by DMPML4I, must be a multiple of
- * its size. (See the PHYS_TO_DMAP() and DMAP_TO_PHYS() macros.)
- *
- * Note: KPML4I is the index of the (single) level 4 page that maps
- * the KVA that holds KERNBASE, while KPML4BASE is the index of the
- * first level 4 page that maps VM_MIN_KERNEL_ADDRESS. If NKPML4E
- * is 1, these are the same, otherwise KPML4BASE < KPML4I and extra
- * level 4 PDEs are needed to map from VM_MIN_KERNEL_ADDRESS up to
- * KERNBASE.
- *
- * (KPML4I combines with KPDPI to choose where KERNBASE starts.
- * Or, in other words, KPML4I provides bits 39..47 of KERNBASE,
- * and KPDPI provides bits 30..38.)
- */
-#define PML4PML4I (NPML4EPG/2) /* Index of recursive pml4 mapping */
-
-#define KPML4BASE (NPML4EPG-NKPML4E) /* KVM at highest addresses */
-#define DMPML4I rounddown(KPML4BASE-NDMPML4E, NDMPML4E) /* Below KVM */
-
-#define KPML4I (NPML4EPG-1)
-#define KPDPI (NPDPEPG-2) /* kernbase at -2GB */
-
-/*
- * XXX doesn't really belong here I guess...
- */
-#define ISA_HOLE_START 0xa0000
-#define ISA_HOLE_LENGTH (0x100000-ISA_HOLE_START)
-
-#define PMAP_PCID_NONE 0xffffffff
-#define PMAP_PCID_KERN 0
-#define PMAP_PCID_OVERMAX 0x1000
-
-#ifndef LOCORE
-
-#ifdef __FreeBSD__
-#include <sys/queue.h>
-#include <sys/_cpuset.h>
-#include <sys/_lock.h>
-#include <sys/_mutex.h>
-
-#include <vm/_vm_radix.h>
-#endif /* __FreeBSD__ */
-
-typedef u_int64_t pd_entry_t;
-typedef u_int64_t pt_entry_t;
-typedef u_int64_t pdp_entry_t;
-typedef u_int64_t pml4_entry_t;
-
-/*
- * Address of current address space page table maps and directories.
- */
-#ifdef _KERNEL
-#define addr_PTmap (KVADDR(PML4PML4I, 0, 0, 0))
-#define addr_PDmap (KVADDR(PML4PML4I, PML4PML4I, 0, 0))
-#define addr_PDPmap (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, 0))
-#define addr_PML4map (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I))
-#define addr_PML4pml4e (addr_PML4map + (PML4PML4I * sizeof(pml4_entry_t)))
-#define PTmap ((pt_entry_t *)(addr_PTmap))
-#define PDmap ((pd_entry_t *)(addr_PDmap))
-#define PDPmap ((pd_entry_t *)(addr_PDPmap))
-#define PML4map ((pd_entry_t *)(addr_PML4map))
-#define PML4pml4e ((pd_entry_t *)(addr_PML4pml4e))
-
-extern int nkpt; /* Initial number of kernel page tables */
-extern u_int64_t KPDPphys; /* physical address of kernel level 3 */
-extern u_int64_t KPML4phys; /* physical address of kernel level 4 */
-
-/*
- * virtual address to page table entry and
- * to physical address.
- * Note: these work recursively, thus vtopte of a pte will give
- * the corresponding pde that in turn maps it.
- */
-pt_entry_t *vtopte(vm_offset_t);
-#define vtophys(va) pmap_kextract(((vm_offset_t) (va)))
-#ifndef __FreeBSD__
-extern vm_paddr_t pmap_kextract(vm_offset_t);
-#endif
-
-#define pte_load_store(ptep, pte) atomic_swap_long(ptep, pte)
-#define pte_load_clear(ptep) atomic_swap_long(ptep, 0)
-#define pte_store(ptep, pte) do { \
- *(u_long *)(ptep) = (u_long)(pte); \
-} while (0)
-#define pte_clear(ptep) pte_store(ptep, 0)
-
-#define pde_store(pdep, pde) pte_store(pdep, pde)
-
-extern pt_entry_t pg_nx;
-
-#endif /* _KERNEL */
-
-#ifdef __FreeBSD__
-/*
- * Pmap stuff
- */
-struct pv_entry;
-struct pv_chunk;
-
-/*
- * Locks
- * (p) PV list lock
- */
-struct md_page {
- TAILQ_HEAD(, pv_entry) pv_list; /* (p) */
- int pv_gen; /* (p) */
- int pat_mode;
-};
-#endif /* __FreeBSD__ */
-
-enum pmap_type {
- PT_X86, /* regular x86 page tables */
- PT_EPT, /* Intel's nested page tables */
- PT_RVI, /* AMD's nested page tables */
-};
-
-#ifdef __FreeBSD__
-struct pmap_pcids {
- uint32_t pm_pcid;
- uint32_t pm_gen;
-};
-
-/*
- * The kernel virtual address (KVA) of the level 4 page table page is always
- * within the direct map (DMAP) region.
- */
-struct pmap {
- struct mtx pm_mtx;
- pml4_entry_t *pm_pml4; /* KVA of level 4 page table */
- uint64_t pm_cr3;
- TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */
- cpuset_t pm_active; /* active on cpus */
- enum pmap_type pm_type; /* regular or nested tables */
- struct pmap_statistics pm_stats; /* pmap statistics */
- struct vm_radix pm_root; /* spare page table pages */
- long pm_eptgen; /* EPT pmap generation id */
- int pm_flags;
- struct pmap_pcids pm_pcids[MAXCPU];
-};
-#endif /* __FreeBSD__ */
-
-/* flags */
-#define PMAP_NESTED_IPIMASK 0xff
-#define PMAP_PDE_SUPERPAGE (1 << 8) /* supports 2MB superpages */
-#define PMAP_EMULATE_AD_BITS (1 << 9) /* needs A/D bits emulation */
-#define PMAP_SUPPORTS_EXEC_ONLY (1 << 10) /* execute only mappings ok */
-
-typedef struct pmap *pmap_t;
-
-#ifdef _KERNEL
-extern struct pmap kernel_pmap_store;
-#define kernel_pmap (&kernel_pmap_store)
-
-#define PMAP_LOCK(pmap) mtx_lock(&(pmap)->pm_mtx)
-#define PMAP_LOCK_ASSERT(pmap, type) \
- mtx_assert(&(pmap)->pm_mtx, (type))
-#define PMAP_LOCK_DESTROY(pmap) mtx_destroy(&(pmap)->pm_mtx)
-#define PMAP_LOCK_INIT(pmap) mtx_init(&(pmap)->pm_mtx, "pmap", \
- NULL, MTX_DEF | MTX_DUPOK)
-#define PMAP_LOCKED(pmap) mtx_owned(&(pmap)->pm_mtx)
-#define PMAP_MTX(pmap) (&(pmap)->pm_mtx)
-#define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx)
-#define PMAP_UNLOCK(pmap) mtx_unlock(&(pmap)->pm_mtx)
-
-int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags);
-int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype);
-#endif
-
-#ifdef __FreeBSD__
-/*
- * For each vm_page_t, there is a list of all currently valid virtual
- * mappings of that page. An entry is a pv_entry_t, the list is pv_list.
- */
-typedef struct pv_entry {
- vm_offset_t pv_va; /* virtual address for mapping */
- TAILQ_ENTRY(pv_entry) pv_next;
-} *pv_entry_t;
-
-/*
- * pv_entries are allocated in chunks per-process. This avoids the
- * need to track per-pmap assignments.
- */
-#define _NPCM 3
-#define _NPCPV 168
-struct pv_chunk {
- pmap_t pc_pmap;
- TAILQ_ENTRY(pv_chunk) pc_list;
- uint64_t pc_map[_NPCM]; /* bitmap; 1 = free */
- TAILQ_ENTRY(pv_chunk) pc_lru;
- struct pv_entry pc_pventry[_NPCPV];
-};
-
-#ifdef _KERNEL
-
-extern caddr_t CADDR1;
-extern pt_entry_t *CMAP1;
-extern vm_paddr_t phys_avail[];
-extern vm_paddr_t dump_avail[];
-extern vm_offset_t virtual_avail;
-extern vm_offset_t virtual_end;
-extern vm_paddr_t dmaplimit;
-extern int pmap_pcid_enabled;
-extern int invpcid_works;
-
-#define pmap_page_get_memattr(m) ((vm_memattr_t)(m)->md.pat_mode)
-#define pmap_page_is_write_mapped(m) (((m)->aflags & PGA_WRITEABLE) != 0)
-#define pmap_unmapbios(va, sz) pmap_unmapdev((va), (sz))
-
-struct thread;
-
-void pmap_activate_sw(struct thread *);
-void pmap_bootstrap(vm_paddr_t *);
-int pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde);
-int pmap_change_attr(vm_offset_t, vm_size_t, int);
-void pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate);
-void pmap_init_pat(void);
-void pmap_kenter(vm_offset_t va, vm_paddr_t pa);
-void *pmap_kenter_temporary(vm_paddr_t pa, int i);
-vm_paddr_t pmap_kextract(vm_offset_t);
-void pmap_kremove(vm_offset_t);
-void *pmap_mapbios(vm_paddr_t, vm_size_t);
-void *pmap_mapdev(vm_paddr_t, vm_size_t);
-void *pmap_mapdev_attr(vm_paddr_t, vm_size_t, int);
-boolean_t pmap_page_is_mapped(vm_page_t m);
-void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma);
-void pmap_pinit_pml4(vm_page_t);
-void pmap_unmapdev(vm_offset_t, vm_size_t);
-void pmap_invalidate_page(pmap_t, vm_offset_t);
-void pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t);
-void pmap_invalidate_all(pmap_t);
-void pmap_invalidate_cache(void);
-void pmap_invalidate_cache_pages(vm_page_t *pages, int count);
-void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva,
- boolean_t force);
-void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num);
-boolean_t pmap_map_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t);
-void pmap_unmap_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t);
-#endif /* _KERNEL */
-
-/* Return various clipped indexes for a given VA */
-static __inline vm_pindex_t
-pmap_pte_index(vm_offset_t va)
-{
-
- return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
-}
-
-static __inline vm_pindex_t
-pmap_pde_index(vm_offset_t va)
-{
-
- return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
-}
-
-static __inline vm_pindex_t
-pmap_pdpe_index(vm_offset_t va)
-{
-
- return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
-}
-
-static __inline vm_pindex_t
-pmap_pml4e_index(vm_offset_t va)
-{
-
- return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
-}
-
-#endif /* __FreeBSD__ */
-#endif /* !LOCORE */
-
-#endif /* !_COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_ */
diff --git a/usr/src/compat/bhyve/amd64/machine/smp.h b/usr/src/compat/bhyve/amd64/machine/smp.h
deleted file mode 100644
index 9c4f2d111b..0000000000
--- a/usr/src/compat/bhyve/amd64/machine/smp.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- */
-
-/*
- * Copyright 2013 Pluribus Networks Inc.
- * Copyright 2018 Joyent, Inc.
- */
-
-#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_SMP_H_
-#define _COMPAT_FREEBSD_AMD64_MACHINE_SMP_H_
-
-#ifdef _KERNEL
-
-/*
- * APIC-related functions are replaced with native calls rather than shims
- * which attempt to replicate the FreeBSD interfaces. This is empty, but will
- * remain present to appease sources which wish to include the path.
- */
-
-#endif /* _KERNEL */
-
-#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_SMP_H_ */
diff --git a/usr/src/compat/bhyve/sys/smp.h b/usr/src/compat/bhyve/sys/smp.h
deleted file mode 100644
index 3d6413ce16..0000000000
--- a/usr/src/compat/bhyve/sys/smp.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- */
-
-/*
- * Copyright 2014 Pluribus Networks Inc.
- * Copyright 2017 Joyent, Inc.
- */
-
-#ifndef _COMPAT_FREEBSD_SYS_SMP_H_
-#define _COMPAT_FREEBSD_SYS_SMP_H_
-
-#include <sys/cpuset.h>
-
-#define IPI_AST 0
-
-void ipi_cpu(int cpu, u_int ipi);
-
-#endif /* _COMPAT_FREEBSD_SYS_SMP_H_ */
diff --git a/usr/src/contrib/bhyve/amd64/machine/vm.h b/usr/src/contrib/bhyve/amd64/machine/vm.h
deleted file mode 100644
index 885c1607ea..0000000000
--- a/usr/src/contrib/bhyve/amd64/machine/vm.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*-
- * Copyright (c) 2009 Advanced Computing Technologies LLC
- * Written by: John H. Baldwin <jhb@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD: head/sys/amd64/include/vm.h 233671 2012-03-29 16:51:22Z jhb $
- */
-
-#ifndef _MACHINE_VM_H_
-#define _MACHINE_VM_H_
-
-#include <machine/specialreg.h>
-
-/* Memory attributes. */
-#define VM_MEMATTR_UNCACHEABLE ((vm_memattr_t)PAT_UNCACHEABLE)
-#define VM_MEMATTR_WRITE_COMBINING ((vm_memattr_t)PAT_WRITE_COMBINING)
-#define VM_MEMATTR_WRITE_THROUGH ((vm_memattr_t)PAT_WRITE_THROUGH)
-#define VM_MEMATTR_WRITE_PROTECTED ((vm_memattr_t)PAT_WRITE_PROTECTED)
-#define VM_MEMATTR_WRITE_BACK ((vm_memattr_t)PAT_WRITE_BACK)
-#define VM_MEMATTR_WEAK_UNCACHEABLE ((vm_memattr_t)PAT_UNCACHED)
-
-#define VM_MEMATTR_DEFAULT VM_MEMATTR_WRITE_BACK
-
-#endif /* !_MACHINE_VM_H_ */
diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files
index caa660725c..9b83a780a5 100644
--- a/usr/src/uts/i86pc/Makefile.files
+++ b/usr/src/uts/i86pc/Makefile.files
@@ -247,7 +247,6 @@ VMM_OBJS += vmm.o \
vmm_instruction_emul.o \
vmm_ioport.o \
vmm_lapic.o \
- vmm_mem.o \
vmm_stat.o \
vmm_util.o \
x86.o \
@@ -259,7 +258,6 @@ VMM_OBJS += vmm.o \
vlapic.o \
vrtc.o \
vpmtmr.o \
- ept.o \
vmcs.o \
vmx_msr.o \
vmx.o \
@@ -268,18 +266,17 @@ VMM_OBJS += vmm.o \
vtd_sol.o \
svm.o \
svm_msr.o \
- npt.o \
vmcb.o \
svm_support.o \
amdv.o \
vmm_gpt.o \
seg_vmm.o \
vmm_reservoir.o \
- vmm_sol_vm.o \
vmm_sol_glue.o \
vmm_sol_ept.o \
vmm_sol_rvi.o \
vmm_support.o \
+ vmm_vm.o \
vmm_zsd.o
VIONA_OBJS += viona_main.o \
diff --git a/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c b/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c
index c7b43b85ef..c381e350ed 100644
--- a/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c
+++ b/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c
@@ -37,7 +37,6 @@ __FBSDID("$FreeBSD$");
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/rman.h>
-#include <sys/smp.h>
#include <sys/sysctl.h>
#include <dev/pci/pcivar.h>
@@ -45,7 +44,6 @@ __FBSDID("$FreeBSD$");
#include <machine/resource.h>
#include <machine/vmm.h>
-#include <machine/pmap.h>
#include <machine/vmparam.h>
#include <machine/pci_cfgreg.h>
diff --git a/usr/src/uts/i86pc/io/vmm/amd/npt.c b/usr/src/uts/i86pc/io/vmm/amd/npt.c
deleted file mode 100644
index 6fc6825242..0000000000
--- a/usr/src/uts/i86pc/io/vmm/amd/npt.c
+++ /dev/null
@@ -1,77 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
- *
- * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com)
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice unmodified, this list of conditions, and the following
- * disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/kernel.h>
-#include <sys/systm.h>
-#include <sys/sysctl.h>
-
-#include <sys/vmm_vm.h>
-
-#include "npt.h"
-
-static int npt_flags;
-
-#define NPT_IPIMASK 0xFF
-
-/*
- * AMD nested page table init.
- */
-int
-svm_npt_init(int ipinum)
-{
- int enable_superpage = 1;
-
- npt_flags = ipinum & NPT_IPIMASK;
- TUNABLE_INT_FETCH("hw.vmm.npt.enable_superpage", &enable_superpage);
- if (enable_superpage)
- npt_flags |= PMAP_PDE_SUPERPAGE;
-
- return (0);
-}
-
-static int
-npt_pinit(pmap_t pmap)
-{
- return (pmap_pinit_type(pmap, PT_RVI, npt_flags));
-}
-
-struct vmspace *
-svm_npt_alloc(vm_offset_t min, vm_offset_t max)
-{
- return (vmspace_alloc(min, max, npt_pinit));
-}
-
-void
-svm_npt_free(struct vmspace *vmspace)
-{
- vmspace_free(vmspace);
-}
diff --git a/usr/src/uts/i86pc/io/vmm/amd/npt.h b/usr/src/uts/i86pc/io/vmm/amd/npt.h
deleted file mode 100644
index 95f3fbab9e..0000000000
--- a/usr/src/uts/i86pc/io/vmm/amd/npt.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
- *
- * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com)
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice unmodified, this list of conditions, and the following
- * disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _SVM_NPT_H_
-#define _SVM_NPT_H_
-
-int svm_npt_init(int ipinum);
-struct vmspace *svm_npt_alloc(vm_offset_t min, vm_offset_t max);
-void svm_npt_free(struct vmspace *vmspace);
-
-#endif /* _SVM_NPT_H_ */
diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm.c b/usr/src/uts/i86pc/io/vmm/amd/svm.c
index 65fc4c3d0f..8ffc1c6557 100644
--- a/usr/src/uts/i86pc/io/vmm/amd/svm.c
+++ b/usr/src/uts/i86pc/io/vmm/amd/svm.c
@@ -45,7 +45,6 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/smp.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
@@ -60,7 +59,6 @@ __FBSDID("$FreeBSD$");
#include <machine/md_var.h>
#include <machine/reg.h>
#include <machine/specialreg.h>
-#include <machine/smp.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <sys/vmm_instruction_emul.h>
@@ -79,7 +77,6 @@ __FBSDID("$FreeBSD$");
#include "svm.h"
#include "svm_softc.h"
#include "svm_msr.h"
-#include "npt.h"
SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
@@ -151,12 +148,11 @@ svm_cleanup(void)
}
static int
-svm_init(int ipinum)
+svm_init(void)
{
vmcb_clean &= VMCB_CACHE_DEFAULT;
svm_msr_init();
- svm_npt_init(ipinum);
return (0);
}
@@ -425,7 +421,7 @@ vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
* Initialize a virtual machine.
*/
static void *
-svm_vminit(struct vm *vm, pmap_t pmap)
+svm_vminit(struct vm *vm)
{
struct svm_softc *svm_sc;
struct svm_vcpu *vcpu;
@@ -447,7 +443,7 @@ svm_vminit(struct vm *vm, pmap_t pmap)
panic("contigmalloc of SVM IO bitmap failed");
svm_sc->vm = vm;
- svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4);
+ svm_sc->nptp = vmspace_table_root(vm_get_vmspace(vm));
/*
* Intercept read and write accesses to all MSRs.
@@ -1776,23 +1772,21 @@ svm_inject_recheck(struct svm_softc *sc, int vcpu,
static void
-check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, uint_t thiscpu)
+check_asid(struct svm_softc *sc, int vcpuid, uint_t thiscpu, uint64_t nptgen)
{
struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid);
struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
- long eptgen;
uint8_t flush;
- eptgen = pmap->pm_eptgen;
flush = hma_svm_asid_update(&vcpustate->hma_asid, flush_by_asid(),
- vcpustate->eptgen != eptgen);
+ vcpustate->nptgen != nptgen);
if (flush != VMCB_TLB_FLUSH_NOTHING) {
ctrl->asid = vcpustate->hma_asid.hsa_asid;
svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
}
ctrl->tlb_ctrl = flush;
- vcpustate->eptgen = eptgen;
+ vcpustate->nptgen = nptgen;
}
static void
@@ -1810,8 +1804,8 @@ flush_asid(struct svm_softc *sc, int vcpuid)
ctrl->tlb_ctrl = flush;
svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
/*
- * A potential future optimization: We could choose to update the eptgen
- * associated with the vCPU, since any pending eptgen change requiring a
+ * A potential future optimization: We could choose to update the nptgen
+ * associated with the vCPU, since any pending nptgen change requiring a
* flush will be satisfied by the one which has just now been queued.
*/
}
@@ -1899,7 +1893,7 @@ svm_apply_tsc_adjust(struct svm_softc *svm_sc, int vcpuid)
* Start vcpu with specified RIP.
*/
static int
-svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap)
+svm_vmrun(void *arg, int vcpu, uint64_t rip)
{
struct svm_regctx *gctx;
struct svm_softc *svm_sc;
@@ -1908,6 +1902,7 @@ svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap)
struct vmcb_ctrl *ctrl;
struct vm_exit *vmexit;
struct vlapic *vlapic;
+ vm_client_t *vmc;
struct vm *vm;
uint64_t vmcb_pa;
int handled;
@@ -1921,6 +1916,7 @@ svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap)
ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
vmexit = vm_exitinfo(vm, vcpu);
vlapic = vm_lapic(vm, vcpu);
+ vmc = vm_get_vmclient(vm, vcpu);
gctx = svm_get_guest_regctx(svm_sc, vcpu);
vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa;
@@ -1962,6 +1958,7 @@ svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap)
do {
enum event_inject_state inject_state;
+ uint64_t nptgen;
/*
* Initial event injection is complex and may involve mutex
@@ -2021,14 +2018,12 @@ svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap)
*/
ldt_sel = sldt();
- /* Activate the nested pmap on 'curcpu' */
- CPU_SET_ATOMIC_ACQ(curcpu, &pmap->pm_active);
-
/*
- * Check the pmap generation and the ASID generation to
- * ensure that the vcpu does not use stale TLB mappings.
+ * Check the vmspace and ASID generations to ensure that the
+ * vcpu does not use stale TLB mappings.
*/
- check_asid(svm_sc, vcpu, pmap, curcpu);
+ nptgen = vmc_table_enter(vmc);
+ check_asid(svm_sc, vcpu, curcpu, nptgen);
ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty;
vcpustate->dirty = 0;
@@ -2042,14 +2037,14 @@ svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap)
svm_dr_leave_guest(gctx);
vcpu_ustate_change(vm, vcpu, VU_EMU_KERN);
- CPU_CLR_ATOMIC(curcpu, &pmap->pm_active);
-
/* Restore host LDTR. */
lldt(ldt_sel);
/* #VMEXIT disables interrupts so re-enable them here. */
enable_gintr();
+ vmc_table_exit(vmc);
+
/* Update 'nextrip' */
vcpustate->nextrip = state->rip;
@@ -2477,6 +2472,7 @@ struct vmm_ops vmm_ops_amd = {
.init = svm_init,
.cleanup = svm_cleanup,
.resume = svm_restore,
+
.vminit = svm_vminit,
.vmrun = svm_vmrun,
.vmcleanup = svm_vmcleanup,
@@ -2486,8 +2482,6 @@ struct vmm_ops vmm_ops_amd = {
.vmsetdesc = svm_setdesc,
.vmgetcap = svm_getcap,
.vmsetcap = svm_setcap,
- .vmspace_alloc = svm_npt_alloc,
- .vmspace_free = svm_npt_free,
.vlapic_init = svm_vlapic_init,
.vlapic_cleanup = svm_vlapic_cleanup,
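
Condensing the SVM-side flow from the svm.c hunks above: svm_vmrun() no longer activates a pmap; it brackets guest entry with vmc_table_enter()/vmc_table_exit() and feeds the returned nested-page-table generation into check_asid(), which requests a TLB flush whenever the cached generation is stale. A simplified, illustrative recombination of those hunks (run-loop and exit handling elided):

	uint64_t nptgen;
	uint8_t flush;

	/* Pin the vmspace tables for this entry and fetch their generation. */
	nptgen = vmc_table_enter(vmc);

	/* Ask the ASID logic for a flush if the cached generation is stale. */
	flush = hma_svm_asid_update(&vcpustate->hma_asid, flush_by_asid(),
	    vcpustate->nptgen != nptgen);
	if (flush != VMCB_TLB_FLUSH_NOTHING) {
		ctrl->asid = vcpustate->hma_asid.hsa_asid;
		svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
	}
	ctrl->tlb_ctrl = flush;
	vcpustate->nptgen = nptgen;

	/* ... VMRUN and #VMEXIT handling ... */

	/* Drop the table reference once the exit state has been recorded. */
	vmc_table_exit(vmc);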
diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h b/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h
index e3ac603e71..adf9bb8ddd 100644
--- a/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h
+++ b/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h
@@ -50,7 +50,7 @@ struct svm_vcpu {
uint64_t nextrip; /* next instruction to be executed by guest */
int lastcpu; /* host cpu that the vcpu last ran on */
uint32_t dirty; /* state cache bits that must be cleared */
- long eptgen; /* pmap->pm_eptgen when the vcpu last ran */
+ uint64_t nptgen; /* page table gen when the vcpu last ran */
hma_svm_asid_t hma_asid;
boolean_t loaded;
} __aligned(PAGE_SIZE);
@@ -61,7 +61,7 @@ struct svm_vcpu {
struct svm_softc {
uint8_t apic_page[VM_MAXCPU][PAGE_SIZE];
struct svm_vcpu vcpu[VM_MAXCPU];
- vm_offset_t nptp; /* nested page table */
+ uint64_t nptp; /* nested page table (host PA) */
uint8_t *iopm_bitmap; /* shared by all vcpus */
uint8_t *msr_bitmap; /* shared by all vcpus */
struct vm *vm;
diff --git a/usr/src/uts/i86pc/io/vmm/intel/ept.c b/usr/src/uts/i86pc/io/vmm/intel/ept.c
deleted file mode 100644
index 49b01ebd36..0000000000
--- a/usr/src/uts/i86pc/io/vmm/intel/ept.c
+++ /dev/null
@@ -1,170 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
- *
- * Copyright (c) 2011 NetApp, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-/*
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * Copyright 2015 Pluribus Networks Inc.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/kernel.h>
-#include <sys/types.h>
-#include <sys/systm.h>
-#include <sys/smp.h>
-#include <sys/sysctl.h>
-#include <sys/hma.h>
-
-#include <machine/specialreg.h>
-#include <machine/vmm.h>
-#include <sys/vmm_vm.h>
-
-#include "ept.h"
-
-#define EPT_SUPPORTS_EXEC_ONLY(cap) ((cap) & (1UL << 0))
-#define EPT_PWL4(cap) ((cap) & (1UL << 6))
-#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14))
-#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */
-#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */
-#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20))
-#define AD_BITS_SUPPORTED(cap) ((cap) & (1UL << 21))
-#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32))
-
-#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL
-#define INVVPID_ALL_TYPES_SUPPORTED(cap) \
- (((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK)
-
-#define INVEPT_ALL_TYPES_MASK 0x6000000UL
-#define INVEPT_ALL_TYPES_SUPPORTED(cap) \
- (((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK)
-
-#define EPT_PWLEVELS 4 /* page walk levels */
-#define EPT_ENABLE_AD_BITS (1 << 6)
-
-SYSCTL_DECL(_hw_vmm);
-SYSCTL_NODE(_hw_vmm, OID_AUTO, ept, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
- NULL);
-
-static int ept_enable_ad_bits;
-
-static int ept_pmap_flags;
-
-int
-ept_init(int ipinum)
-{
- int use_hw_ad_bits, use_superpages, use_exec_only;
- uint64_t cap;
-
- cap = rdmsr(MSR_VMX_EPT_VPID_CAP);
-
- /*
- * Verify that:
- * - page walk length is 4 steps
- * - extended page tables can be laid out in write-back memory
- * - invvpid instruction with all possible types is supported
- * - invept instruction with all possible types is supported
- */
- if (!EPT_PWL4(cap) ||
- !EPT_MEMORY_TYPE_WB(cap) ||
- !INVVPID_SUPPORTED(cap) ||
- !INVVPID_ALL_TYPES_SUPPORTED(cap) ||
- !INVEPT_SUPPORTED(cap) ||
- !INVEPT_ALL_TYPES_SUPPORTED(cap))
- return (EINVAL);
-
- ept_pmap_flags = ipinum & PMAP_NESTED_IPIMASK;
-
- use_superpages = 1;
- TUNABLE_INT_FETCH("hw.vmm.ept.use_superpages", &use_superpages);
- if (use_superpages && EPT_PDE_SUPERPAGE(cap))
- ept_pmap_flags |= PMAP_PDE_SUPERPAGE; /* 2MB superpage */
-
- use_hw_ad_bits = 1;
- TUNABLE_INT_FETCH("hw.vmm.ept.use_hw_ad_bits", &use_hw_ad_bits);
- if (use_hw_ad_bits && AD_BITS_SUPPORTED(cap))
- ept_enable_ad_bits = 1;
- else
- ept_pmap_flags |= PMAP_EMULATE_AD_BITS;
-
- use_exec_only = 1;
- TUNABLE_INT_FETCH("hw.vmm.ept.use_exec_only", &use_exec_only);
- if (use_exec_only && EPT_SUPPORTS_EXEC_ONLY(cap))
- ept_pmap_flags |= PMAP_SUPPORTS_EXEC_ONLY;
-
- return (0);
-}
-
-void
-ept_invalidate_mappings(ulong_t eptp)
-{
- hma_vmx_invept_allcpus((uintptr_t)eptp);
-}
-
-static int
-ept_pinit(pmap_t pmap)
-{
-
- return (pmap_pinit_type(pmap, PT_EPT, ept_pmap_flags));
-}
-
-struct vmspace *
-ept_vmspace_alloc(vm_offset_t min, vm_offset_t max)
-{
-
- return (vmspace_alloc(min, max, ept_pinit));
-}
-
-void
-ept_vmspace_free(struct vmspace *vmspace)
-{
-
- vmspace_free(vmspace);
-}
-
-uint64_t
-eptp(uint64_t pml4)
-{
- uint64_t eptp_val;
-
- eptp_val = pml4 | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK;
- if (ept_enable_ad_bits)
- eptp_val |= EPT_ENABLE_AD_BITS;
-
- return (eptp_val);
-}
diff --git a/usr/src/uts/i86pc/io/vmm/intel/ept.h b/usr/src/uts/i86pc/io/vmm/intel/ept.h
deleted file mode 100644
index e4a6d6c959..0000000000
--- a/usr/src/uts/i86pc/io/vmm/intel/ept.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
- *
- * Copyright (c) 2011 NetApp, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _EPT_H_
-#define _EPT_H_
-
-struct vmx;
-
-int ept_init(int ipinum);
-void ept_invalidate_mappings(ulong_t eptp);
-struct vmspace *ept_vmspace_alloc(vm_offset_t min, vm_offset_t max);
-void ept_vmspace_free(struct vmspace *vmspace);
-uint64_t eptp(uint64_t pml4);
-#endif
diff --git a/usr/src/uts/i86pc/io/vmm/intel/offsets.in b/usr/src/uts/i86pc/io/vmm/intel/offsets.in
index d456693573..f467e7b1ca 100644
--- a/usr/src/uts/i86pc/io/vmm/intel/offsets.in
+++ b/usr/src/uts/i86pc/io/vmm/intel/offsets.in
@@ -19,7 +19,6 @@
#include <sys/systm.h>
#include <sys/cpuvar.h>
-#include <machine/pmap.h>
#include <machine/vmm.h>
#include <sys/vmm_vm.h>
@@ -43,18 +42,6 @@ vmxctx
guest_r15 VMXCTX_GUEST_R15
guest_cr2 VMXCTX_GUEST_CR2
inst_fail_status VMXCTX_INST_FAIL_STATUS
- pmap VMXCTX_PMAP
-
-vmx
- eptgen VMX_EPTGEN
- eptp VMX_EPTP
-
-pmap
- pm_active PM_ACTIVE
- pm_eptgen PM_EPTGEN
-
-cpu
- cpu_id
\#define VM_SUCCESS 0
\#define VM_FAIL_INVALID 1
diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.c b/usr/src/uts/i86pc/io/vmm/intel/vmx.c
index c58ad471a1..533adcbbf2 100644
--- a/usr/src/uts/i86pc/io/vmm/intel/vmx.c
+++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.c
@@ -48,7 +48,6 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/smp.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
@@ -60,13 +59,13 @@ __FBSDID("$FreeBSD$");
#include <sys/smt.h>
#include <sys/hma.h>
#include <sys/trap.h>
+#include <sys/archsystm.h>
#include <machine/psl.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/reg.h>
#include <machine/segments.h>
-#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/vmparam.h>
#include <sys/vmm_vm.h>
@@ -83,7 +82,6 @@ __FBSDID("$FreeBSD$");
#include "vlapic.h"
#include "vlapic_priv.h"
-#include "ept.h"
#include "vmcs.h"
#include "vmx.h"
#include "vmx_msr.h"
@@ -145,6 +143,22 @@ __FBSDID("$FreeBSD$");
(VM_ENTRY_INTO_SMM | \
VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
+/*
+ * Cover the EPT capabilities used by bhyve at present:
+ * - 4-level page walks
+ * - write-back memory type
+ * - INVEPT operations (all types)
+ * - INVVPID operations (single-context only)
+ */
+#define EPT_CAPS_REQUIRED \
+ (IA32_VMX_EPT_VPID_PWL4 | \
+ IA32_VMX_EPT_VPID_TYPE_WB | \
+ IA32_VMX_EPT_VPID_INVEPT | \
+ IA32_VMX_EPT_VPID_INVEPT_SINGLE | \
+ IA32_VMX_EPT_VPID_INVEPT_ALL | \
+ IA32_VMX_EPT_VPID_INVVPID | \
+ IA32_VMX_EPT_VPID_INVVPID_SINGLE)
+
#define HANDLED 1
#define UNHANDLED 0
@@ -448,7 +462,7 @@ vmx_restore(void)
}
static int
-vmx_init(int ipinum)
+vmx_init(void)
{
int error;
uint64_t fixed0, fixed1;
@@ -587,11 +601,16 @@ vmx_init(int ipinum)
}
}
- /* Initialize EPT */
- error = ept_init(ipinum);
- if (error) {
- printf("vmx_init: ept initialization failed (%d)\n", error);
- return (error);
+ /*
+ * Check for necessary EPT capabilities
+ *
+ * TODO: Properly handle when IA32_VMX_EPT_VPID_HW_AD is missing and the
+ * hypervisor intends to utilize dirty page tracking.
+ */
+ uint64_t ept_caps = rdmsr(MSR_IA32_VMX_EPT_VPID_CAP);
+ if ((ept_caps & EPT_CAPS_REQUIRED) != EPT_CAPS_REQUIRED) {
+ cmn_err(CE_WARN, "!Inadequate EPT capabilities: %lx", ept_caps);
+ return (EINVAL);
}
#ifdef __FreeBSD__
@@ -665,7 +684,7 @@ vmx_trigger_hostintr(int vector)
}
static void *
-vmx_vminit(struct vm *vm, pmap_t pmap)
+vmx_vminit(struct vm *vm)
{
uint16_t vpid[VM_MAXCPU];
int i, error, datasel;
@@ -682,7 +701,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
}
vmx->vm = vm;
- vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
+ vmx->eptp = vmspace_table_root(vm_get_vmspace(vm));
/*
* Clean up EPTP-tagged guest physical and combined mappings
@@ -693,7 +712,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
*
* Combined mappings for this EP4TA are also invalidated for all VPIDs.
*/
- ept_invalidate_mappings(vmx->eptp);
+ hma_vmx_invept_allcpus((uintptr_t)vmx->eptp);
vmx_msr_bitmap_initialize(vmx);
@@ -805,8 +824,8 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
vmcs_write(VMCS_VPID, vpid[i]);
if (guest_l1d_flush && !guest_l1d_flush_sw) {
- vmcs_write(VMCS_ENTRY_MSR_LOAD, pmap_kextract(
- (vm_offset_t)&msr_load_list[0]));
+ vmcs_write(VMCS_ENTRY_MSR_LOAD,
+ vtophys(&msr_load_list[0]));
vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT,
nitems(msr_load_list));
vmcs_write(VMCS_EXIT_MSR_STORE, 0);
@@ -860,9 +879,6 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
vmx->state[i].nextrip = ~0;
vmx->state[i].lastcpu = NOCPU;
vmx->state[i].vpid = vpid[i];
-
-
- vmx->ctx[i].pmap = pmap;
}
return (vmx);
@@ -929,14 +945,16 @@ invvpid(uint64_t type, struct invvpid_desc desc)
* Invalidate guest mappings identified by its vpid from the TLB.
*/
static __inline void
-vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running)
+vmx_invvpid(struct vmx *vmx, int vcpu, int running)
{
struct vmxstate *vmxstate;
struct invvpid_desc invvpid_desc;
+ struct vmspace *vms;
vmxstate = &vmx->state[vcpu];
if (vmxstate->vpid == 0)
return;
+ vms = vm_get_vmspace(vmx->vm);
if (!running) {
/*
@@ -964,7 +982,7 @@ vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running)
* Note also that this will invalidate mappings tagged with 'vpid'
* for "all" EP4TAs.
*/
- if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
+ if (vmspace_table_gen(vms) == vmx->eptgen[curcpu]) {
invvpid_desc._res1 = 0;
invvpid_desc._res2 = 0;
invvpid_desc.vpid = vmxstate->vpid;
@@ -982,8 +1000,28 @@ vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running)
}
}
+static __inline void
+invept(uint64_t type, uint64_t eptp)
+{
+ int error;
+ struct invept_desc {
+ uint64_t eptp;
+ uint64_t _resv;
+ } desc = { eptp, 0 };
+
+ __asm __volatile("invept %[desc], %[type];"
+ VMX_SET_ERROR_CODE_ASM
+ : [error] "=r" (error)
+ : [desc] "m" (desc), [type] "r" (type)
+ : "memory");
+
+ if (error != 0) {
+ panic("invvpid error %d", error);
+ }
+}
+
static void
-vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
+vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu)
{
struct vmxstate *vmxstate;
@@ -1014,7 +1052,7 @@ vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
- vmx_invvpid(vmx, vcpu, pmap, 1);
+ vmx_invvpid(vmx, vcpu, 1);
}
/*
@@ -1582,7 +1620,7 @@ vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
const uint64_t diff = crval ^ old;
/* Flush the TLB if the paging or write-protect bits are changing */
if ((diff & CR0_PG) != 0 || (diff & CR0_WP) != 0) {
- vmx_invvpid(vmx, vcpu, vmx->ctx[vcpu].pmap, 1);
+ vmx_invvpid(vmx, vcpu, 1);
}
vmcs_write(VMCS_GUEST_CR0, crval);
@@ -2558,24 +2596,18 @@ vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
* clear NMI blocking.
*/
static __inline void
-vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
+vmx_exit_handle_possible_nmi(struct vm_exit *vmexit)
{
- uint32_t intr_info;
-
- KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled"));
+ ASSERT(!interrupts_enabled());
- if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION)
- return;
-
- intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
- KASSERT((intr_info & VMCS_INTR_VALID) != 0,
- ("VM exit interruption info invalid: %x", intr_info));
+ if (vmexit->u.vmx.exit_reason == EXIT_REASON_EXCEPTION) {
+ uint32_t intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
+ ASSERT(intr_info & VMCS_INTR_VALID);
- if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) {
- KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due "
- "to NMI has invalid vector: %x", intr_info));
- VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler");
- vmm_call_trap(T_NMIFLT);
+ if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) {
+ ASSERT3U(intr_info & 0xff, ==, IDT_NMI);
+ vmm_call_trap(T_NMIFLT);
+ }
}
}
@@ -2647,7 +2679,7 @@ vmx_dr_leave_guest(struct vmxctx *vmxctx)
}
static int
-vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap)
+vmx_run(void *arg, int vcpu, uint64_t rip)
{
int rc, handled, launched;
struct vmx *vmx;
@@ -2658,6 +2690,7 @@ vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap)
struct vlapic *vlapic;
uint32_t exit_reason;
bool tpr_shadow_active;
+ vm_client_t *vmc;
vmx = arg;
vm = vmx->vm;
@@ -2665,14 +2698,12 @@ vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap)
vmxctx = &vmx->ctx[vcpu];
vlapic = vm_lapic(vm, vcpu);
vmexit = vm_exitinfo(vm, vcpu);
+ vmc = vm_get_vmclient(vm, vcpu);
launched = 0;
tpr_shadow_active = vmx_cap_en(vmx, VMX_CAP_TPR_SHADOW) &&
!vmx_cap_en(vmx, VMX_CAP_APICV) &&
(vmx->cap[vcpu].proc_ctls & PROCBASED_USE_TPR_SHADOW) != 0;
- KASSERT(vmxctx->pmap == pmap,
- ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
-
vmx_msr_guest_enter(vmx, vcpu);
vmcs_load(vmcs_pa);
@@ -2691,9 +2722,10 @@ vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap)
vmcs_write(VMCS_HOST_CR3, rcr3());
vmcs_write(VMCS_GUEST_RIP, rip);
- vmx_set_pcpu_defaults(vmx, vcpu, pmap);
+ vmx_set_pcpu_defaults(vmx, vcpu);
do {
enum event_inject_state inject_state;
+ uint64_t eptgen;
KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch "
"%lx/%lx", __func__, vmcs_guest_rip(), rip));
@@ -2721,8 +2753,8 @@ vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap)
* because interrupts are disabled. The pending interrupt will
* be recognized as soon as the guest state is loaded.
*
- * The same reasoning applies to the IPI generated by
- * pmap_invalidate_ept().
+ * The same reasoning applies to the IPI generated by vmspace
+ * invalidation.
*/
disable_intr();
@@ -2804,10 +2836,28 @@ vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap)
vmx_tpr_shadow_enter(vlapic);
}
+ /*
+ * Indicate activation of vmspace (EPT) table just prior to VMX
+ * entry, checking for the necessity of an invept invalidation.
+ */
+ eptgen = vmc_table_enter(vmc);
+ if (vmx->eptgen[vcpu] != eptgen) {
+ /*
+ * VMspace generation does not match what was previously
+ * used for this CPU so all mappings associated with
+ * this EPTP must be invalidated.
+ */
+ invept(1, vmx->eptp);
+ vmx->eptgen[vcpu] = eptgen;
+ }
+
vmx_run_trace(vmx, vcpu);
vcpu_ustate_change(vm, vcpu, VU_RUN);
vmx_dr_enter_guest(vmxctx);
+
+ /* Perform VMX entry */
rc = vmx_enter_guest(vmxctx, vmx, launched);
+
vmx_dr_leave_guest(vmxctx);
vcpu_ustate_change(vm, vcpu, VU_EMU_KERN);
@@ -2823,16 +2873,18 @@ vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap)
vmexit->inst_length = vmexit_instruction_length();
vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
-
/* Update 'nextrip' */
vmx->state[vcpu].nextrip = rip;
if (rc == VMX_GUEST_VMEXIT) {
- vmx_exit_handle_nmi(vmx, vcpu, vmexit);
- enable_intr();
+ vmx_exit_handle_possible_nmi(vmexit);
+ }
+ enable_intr();
+ vmc_table_exit(vmc);
+
+ if (rc == VMX_GUEST_VMEXIT) {
handled = vmx_exit_process(vmx, vcpu, vmexit);
} else {
- enable_intr();
vmx_exit_inst_error(vmxctx, rc, vmexit);
}
DTRACE_PROBE3(vmm__vexit, int, vcpu, uint64_t, rip,
@@ -3077,7 +3129,7 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
* XXX the processor retains global mappings when %cr3
* is updated but vmx_invvpid() does not.
*/
- vmx_invvpid(vmx, vcpu, vmx->ctx[vcpu].pmap, running);
+ vmx_invvpid(vmx, vcpu, running);
break;
case VMCS_INVALID_ENCODING:
error = EINVAL;
@@ -3647,6 +3699,7 @@ struct vmm_ops vmm_ops_intel = {
.init = vmx_init,
.cleanup = vmx_cleanup,
.resume = vmx_restore,
+
.vminit = vmx_vminit,
.vmrun = vmx_run,
.vmcleanup = vmx_vmcleanup,
@@ -3656,8 +3709,6 @@ struct vmm_ops vmm_ops_intel = {
.vmsetdesc = vmx_setdesc,
.vmgetcap = vmx_getcap,
.vmsetcap = vmx_setcap,
- .vmspace_alloc = ept_vmspace_alloc,
- .vmspace_free = ept_vmspace_free,
.vlapic_init = vmx_vlapic_init,
.vlapic_cleanup = vmx_vlapic_cleanup,
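
On the VMX side the same handshake replaces both the ept_init()/eptp() helpers and the pmap-activation/INVEPT logic that previously lived in the vmx_enter_guest() assembly (removed in the vmx_support.s hunk further down). A simplified sketch of the per-entry check, using the names from the vmx.c hunks above:

	uint64_t eptgen;

	eptgen = vmc_table_enter(vmc);
	if (vmx->eptgen[vcpu] != eptgen) {
		/*
		 * The vmspace generation has changed since this EPTP was
		 * last used here, so issue a single-context INVEPT (type 1).
		 */
		invept(1, vmx->eptp);
		vmx->eptgen[vcpu] = eptgen;
	}

	rc = vmx_enter_guest(vmxctx, vmx, launched);

	/* ... record exit reason/qualification, re-enable interrupts ... */
	vmc_table_exit(vmc);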
diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.h b/usr/src/uts/i86pc/io/vmm/intel/vmx.h
index c0d1fdd7fb..8ca7d993f7 100644
--- a/usr/src/uts/i86pc/io/vmm/intel/vmx.h
+++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.h
@@ -39,7 +39,7 @@
* http://www.illumos.org/license/CDDL.
*
* Copyright 2018 Joyent, Inc.
- * Copyright 2020 Oxide Computer Company
+ * Copyright 2021 Oxide Computer Company
*/
#ifndef _VMX_H_
@@ -47,8 +47,6 @@
#include "vmcs.h"
-struct pmap;
-
struct vmxctx {
uint64_t guest_rdi; /* Guest state */
uint64_t guest_rsi;
@@ -82,12 +80,6 @@ struct vmxctx {
int host_tf;
int inst_fail_status;
-
- /*
- * The pmap needs to be deactivated in vmx_enter_guest()
- * so keep a copy of the 'pmap' in each vmxctx.
- */
- struct pmap *pmap;
};
struct vmxcap {
@@ -151,7 +143,7 @@ struct vmx {
uint64_t eptp;
enum vmx_caps vmx_caps;
struct vm *vm;
- long eptgen[MAXCPU]; /* cached pmap->pm_eptgen */
+ uint64_t eptgen[MAXCPU]; /* cached vmspace generation */
};
CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0);
diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s b/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s
index aba844e8c3..60f761d652 100644
--- a/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s
+++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s
@@ -151,35 +151,7 @@ ENTRY_NP(vmx_enter_guest)
movq %rdi, %r12 /* vmxctx */
movq %rsi, %r13 /* vmx */
movl %edx, %r14d /* launch state */
- movq VMXCTX_PMAP(%rdi), %rbx
- /* Activate guest pmap on this cpu. */
- leaq PM_ACTIVE(%rbx), %rdi
- movl %gs:CPU_ID, %esi
- call cpuset_atomic_add
- movq %r12, %rdi
-
- /*
- * If 'vmx->eptgen[curcpu]' is not identical to 'pmap->pm_eptgen'
- * then we must invalidate all mappings associated with this EPTP.
- */
- movq PM_EPTGEN(%rbx), %r10
- movl %gs:CPU_ID, %eax
- cmpq %r10, VMX_EPTGEN(%r13, %rax, 8)
- je guest_restore
-
- /* Refresh 'vmx->eptgen[curcpu]' */
- movq %r10, VMX_EPTGEN(%r13, %rax, 8)
-
- /* Setup the invept descriptor on the host stack */
- pushq $0x0
- pushq VMX_EPTP(%r13)
- movl $0x1, %eax /* Single context invalidate */
- invept (%rsp), %rax
- leaq 0x10(%rsp), %rsp
- jbe invept_error /* Check invept instruction error */
-
-guest_restore:
/* Write the current %rsp into the VMCS to be restored on vmexit */
movl $VMCS_HOST_RSP, %eax
vmwrite %rsp, %rax
@@ -217,9 +189,6 @@ do_launch:
vmwrite_error:
movl $VMX_VMWRITE_ERROR, %eax
jmp decode_inst_error
-invept_error:
- movl $VMX_INVEPT_ERROR, %eax
- jmp decode_inst_error
decode_inst_error:
movl $VM_FAIL_VALID, %r11d
jz inst_error
@@ -227,13 +196,6 @@ decode_inst_error:
inst_error:
movl %r11d, VMXCTX_INST_FAIL_STATUS(%rdi)
- movq VMXCTX_PMAP(%rdi), %rdi
- leaq PM_ACTIVE(%rdi), %rdi
- movl %gs:CPU_ID, %esi
- movq %rax, %r12
- call cpuset_atomic_del
- movq %r12, %rax
-
movq VMXSTK_RBX(%rsp), %rbx
movq VMXSTK_R12(%rsp), %r12
movq VMXSTK_R13(%rsp), %r13
@@ -256,12 +218,6 @@ ALTENTRY(vmx_exit_guest)
/* Save guest state that is not automatically saved in the vmcs. */
VMX_GUEST_SAVE
- /* Deactivate guest pmap on this cpu. */
- movq VMXCTX_PMAP(%rdi), %rdi
- leaq PM_ACTIVE(%rdi), %rdi
- movl %gs:CPU_ID, %esi
- call cpuset_atomic_del
-
/*
* This will return to the caller of 'vmx_enter_guest()' with a return
* value of VMX_GUEST_VMEXIT.
@@ -287,12 +243,6 @@ ALTENTRY(vmx_exit_guest_flush_rsb)
/* Save guest state that is not automatically saved in the vmcs. */
VMX_GUEST_SAVE
- /* Deactivate guest pmap on this cpu. */
- movq VMXCTX_PMAP(%rdi), %rdi
- leaq PM_ACTIVE(%rdi), %rdi
- movl %gs:CPU_ID, %esi
- call cpuset_atomic_del
-
VMX_GUEST_FLUSH_SCRATCH
/*
diff --git a/usr/src/uts/i86pc/io/vmm/intel/vtd.c b/usr/src/uts/i86pc/io/vmm/intel/vtd.c
index 8784c94b48..a3773b54f0 100644
--- a/usr/src/uts/i86pc/io/vmm/intel/vtd.c
+++ b/usr/src/uts/i86pc/io/vmm/intel/vtd.c
@@ -254,7 +254,7 @@ vtd_wbflush(struct vtdmap *vtdmap)
{
if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0)
- pmap_invalidate_cache();
+ invalidate_cache_all();
if (VTD_CAP_RWBF(vtdmap->cap)) {
vtdmap->gcr = VTD_GCR_WBF;
diff --git a/usr/src/uts/i86pc/io/vmm/io/iommu.c b/usr/src/uts/i86pc/io/vmm/io/iommu.c
index 3630c36680..8fec022977 100644
--- a/usr/src/uts/i86pc/io/vmm/io/iommu.c
+++ b/usr/src/uts/i86pc/io/vmm/io/iommu.c
@@ -48,7 +48,6 @@ __FBSDID("$FreeBSD$");
#include <sys/pci.h>
#include "vmm_util.h"
-#include "vmm_mem.h"
#include "iommu.h"
static int iommu_avail;
@@ -191,6 +190,12 @@ iommu_find_device(dev_info_t *dip, void *arg)
return (DDI_WALK_CONTINUE);
}
+
+static vm_paddr_t
+vmm_mem_maxaddr(void)
+{
+ return (ptoa(physmax + 1));
+}
#endif
static void
diff --git a/usr/src/uts/i86pc/io/vmm/io/ppt.c b/usr/src/uts/i86pc/io/vmm/io/ppt.c
index 8f3a276a93..96cc728a74 100644
--- a/usr/src/uts/i86pc/io/vmm/io/ppt.c
+++ b/usr/src/uts/i86pc/io/vmm/io/ppt.c
@@ -42,7 +42,6 @@ __FBSDID("$FreeBSD$");
#include <sys/module.h>
#include <sys/bus.h>
#include <sys/pciio.h>
-#include <sys/smp.h>
#include <sys/sysctl.h>
#include <dev/pci/pcivar.h>
diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.c b/usr/src/uts/i86pc/io/vmm/io/vlapic.c
index 8198ebfce6..06ee46c8e2 100644
--- a/usr/src/uts/i86pc/io/vmm/io/vlapic.c
+++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.c
@@ -52,13 +52,12 @@ __FBSDID("$FreeBSD$");
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>
-#include <sys/smp.h>
+#include <sys/cpuset.h>
#include <x86/specialreg.h>
#include <x86/apicreg.h>
#include <machine/clock.h>
-#include <machine/smp.h>
#include <machine/vmm.h>
@@ -1602,7 +1601,7 @@ vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys,
}
void
-vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum)
+vlapic_post_intr(struct vlapic *vlapic, int hostcpu)
{
/*
* Post an interrupt to the vcpu currently running on 'hostcpu'.
@@ -1616,7 +1615,7 @@ vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum)
if (vlapic->ops.post_intr)
(*vlapic->ops.post_intr)(vlapic, hostcpu);
else
- ipi_cpu(hostcpu, ipinum);
+ poke_cpu(hostcpu);
}
bool
diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.h b/usr/src/uts/i86pc/io/vmm/io/vlapic.h
index f490eff637..a46bae9d34 100644
--- a/usr/src/uts/i86pc/io/vmm/io/vlapic.h
+++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.h
@@ -72,9 +72,9 @@ vcpu_notify_t vlapic_set_intr_ready(struct vlapic *vlapic, int vector,
/*
* Post an interrupt to the vcpu running on 'hostcpu'. This will use a
* hardware assist if available (e.g. Posted Interrupt) or fall back to
- * sending an 'ipinum' to interrupt the 'hostcpu'.
+ * sending an IPI to interrupt the 'hostcpu'.
*/
-void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum);
+void vlapic_post_intr(struct vlapic *vlapic, int hostcpu);
void vlapic_fire_cmci(struct vlapic *vlapic);
int vlapic_trigger_lvt(struct vlapic *vlapic, int vector);
diff --git a/usr/src/uts/i86pc/io/vmm/seg_vmm.c b/usr/src/uts/i86pc/io/vmm/seg_vmm.c
index 23a8da3bc5..863b283418 100644
--- a/usr/src/uts/i86pc/io/vmm/seg_vmm.c
+++ b/usr/src/uts/i86pc/io/vmm/seg_vmm.c
@@ -46,8 +46,9 @@
typedef struct segvmm_data {
krwlock_t svmd_lock;
- vm_object_t svmd_obj;
- uintptr_t svmd_obj_off;
+ vm_object_t *svmd_vmo;
+ vm_client_t *svmd_vmc;
+ uintptr_t svmd_off;
uchar_t svmd_prot;
size_t svmd_softlockcnt;
} segvmm_data_t;
@@ -104,9 +105,41 @@ static struct seg_ops segvmm_ops = {
.inherit = seg_inherit_notsup
};
+/*
+ * Unload a region from the HAT for A/D tracking.
+ */
+static void
+segvmm_invalidate(void *arg, uintptr_t gpa, size_t sz)
+{
+ struct seg *seg = arg;
+ segvmm_data_t *svmd = seg->s_data;
+
+ /*
+ * Invalidations are only necessary (and configured) for vmspace
+ * mappings. Direct vm_object mappings are not involved.
+ */
+ ASSERT3P(svmd->svmd_vmo, ==, NULL);
+
+ /*
+ * The region being invalidated may overlap with all, some, or none of
+ * this segment. We are only concerned about that overlap.
+ */
+ const uintptr_t start = MAX(gpa, svmd->svmd_off);
+ const uintptr_t end = MIN(gpa + sz, svmd->svmd_off + seg->s_size);
+ if (start >= end) {
+ return;
+ }
+ ASSERT(start >= svmd->svmd_off && end <= svmd->svmd_off + seg->s_size);
+ ASSERT(start >= gpa && end <= gpa + sz);
+ const caddr_t unload_va = seg->s_base + (start - svmd->svmd_off);
+ const size_t unload_sz = (end - start);
+ ASSERT3U(unload_sz, <=, seg->s_size);
+
+ hat_unload(seg->s_as->a_hat, unload_va, unload_sz, HAT_UNLOAD);
+}
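
The clipping logic in segvmm_invalidate() above intersects the invalidated [gpa, gpa + sz) range with the portion of the vmspace backing this segment before unloading it from the HAT. A self-contained sketch of that interval arithmetic follows; struct range is invented purely for illustration.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct range {
	uintptr_t base;
	size_t size;
};

/*
 * Clip the invalidated range against the range backing the segment and
 * return the overlap as an offset/length pair relative to the segment.
 */
static bool
clip_to_segment(struct range inval, struct range segment, struct range *out)
{
	uintptr_t start = inval.base > segment.base ? inval.base : segment.base;
	uintptr_t inval_end = inval.base + inval.size;
	uintptr_t seg_end = segment.base + segment.size;
	uintptr_t end = inval_end < seg_end ? inval_end : seg_end;

	if (start >= end)
		return (false);		/* no overlap, nothing to unload */

	out->base = start - segment.base;	/* offset into the segment */
	out->size = end - start;
	return (true);
}
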
/*
- * Create a kernel/user-mapped segment. ->kaddr is the segkvmm mapping.
+ * Create a VMM-memory-backed segment.
*/
int
segvmm_create(struct seg **segpp, void *argsp)
@@ -115,17 +148,35 @@ segvmm_create(struct seg **segpp, void *argsp)
segvmm_crargs_t *cra = argsp;
segvmm_data_t *data;
+ VERIFY((cra->vmo == NULL && cra->vmc != NULL) ||
+ (cra->vmo != NULL && cra->vmc == NULL));
+ VERIFY(cra->prot & PROT_USER);
+ VERIFY0(cra->offset & PAGEOFFSET);
+
data = kmem_zalloc(sizeof (*data), KM_SLEEP);
rw_init(&data->svmd_lock, NULL, RW_DEFAULT, NULL);
- data->svmd_obj = cra->obj;
- data->svmd_obj_off = cra->offset;
- data->svmd_prot = cra->prot;
-
- /* Grab a hold on the VM object for the duration of this seg mapping */
- vm_object_reference(data->svmd_obj);
+ data->svmd_off = cra->offset;
+ data->svmd_prot = cra->prot & ~PROT_USER;
seg->s_ops = &segvmm_ops;
seg->s_data = data;
+
+ if (cra->vmo != NULL) {
+ data->svmd_vmo = cra->vmo;
+ /* Grab a hold on the VM object for the lifetime of segment */
+ vm_object_reference(data->svmd_vmo);
+ } else {
+ int err;
+
+ data->svmd_vmc = cra->vmc;
+ err = vmc_set_inval_cb(data->svmd_vmc, segvmm_invalidate, seg);
+ if (err != 0) {
+ seg->s_ops = NULL;
+ seg->s_data = NULL;
+ kmem_free(data, sizeof (*data));
+ return (err);
+ }
+ }
return (0);
}
@@ -139,15 +190,34 @@ segvmm_dup(struct seg *seg, struct seg *newseg)
newsvmd = kmem_zalloc(sizeof (segvmm_data_t), KM_SLEEP);
rw_init(&newsvmd->svmd_lock, NULL, RW_DEFAULT, NULL);
- newsvmd->svmd_obj = svmd->svmd_obj;
- newsvmd->svmd_obj_off = svmd->svmd_obj_off;
+ newsvmd->svmd_off = svmd->svmd_off;
newsvmd->svmd_prot = svmd->svmd_prot;
- /* Grab another hold for the duplicate segment */
- vm_object_reference(svmd->svmd_obj);
-
newseg->s_ops = seg->s_ops;
newseg->s_data = newsvmd;
+
+ if (svmd->svmd_vmo != NULL) {
+ /* Grab another hold for the duplicate segment */
+ vm_object_reference(svmd->svmd_vmo);
+ newsvmd->svmd_vmo = svmd->svmd_vmo;
+ } else {
+ int err;
+
+ newsvmd->svmd_vmc = vmc_clone(svmd->svmd_vmc);
+ /*
+ * The cloned client does not inherit the invalidation
+ * configuration, so attempt to set it here for the new segment.
+ */
+ err = vmc_set_inval_cb(newsvmd->svmd_vmc, segvmm_invalidate,
+ newseg);
+ if (err != 0) {
+ newseg->s_ops = NULL;
+ newseg->s_data = NULL;
+ kmem_free(newsvmd, sizeof (*newsvmd));
+ return (err);
+ }
+ }
+
return (0);
}
@@ -169,9 +239,6 @@ segvmm_unmap(struct seg *seg, caddr_t addr, size_t len)
/* Unconditionally unload the entire segment range. */
hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP);
- /* Release the VM object hold this segment possessed */
- vm_object_deallocate(svmd->svmd_obj);
-
seg_free(seg);
return (0);
}
@@ -179,35 +246,93 @@ segvmm_unmap(struct seg *seg, caddr_t addr, size_t len)
static void
segvmm_free(struct seg *seg)
{
- segvmm_data_t *data = seg->s_data;
+ segvmm_data_t *svmd = seg->s_data;
- ASSERT(data != NULL);
+ ASSERT(svmd != NULL);
- rw_destroy(&data->svmd_lock);
- VERIFY(data->svmd_softlockcnt == 0);
- kmem_free(data, sizeof (*data));
+ if (svmd->svmd_vmo != NULL) {
+ /* Release the VM object hold this segment possessed */
+ vm_object_release(svmd->svmd_vmo);
+ svmd->svmd_vmo = NULL;
+ } else {
+ vmc_destroy(svmd->svmd_vmc);
+ svmd->svmd_vmc = NULL;
+ }
+ rw_destroy(&svmd->svmd_lock);
+ VERIFY(svmd->svmd_softlockcnt == 0);
+ kmem_free(svmd, sizeof (*svmd));
seg->s_data = NULL;
}
static int
-segvmm_fault_in(struct hat *hat, struct seg *seg, uintptr_t va, size_t len)
+segvmm_fault_obj(struct hat *hat, struct seg *seg, uintptr_t va, size_t len)
{
segvmm_data_t *svmd = seg->s_data;
const uintptr_t end = va + len;
- const uintptr_t prot = svmd->svmd_prot;
+ const int prot = svmd->svmd_prot;
+ const int uprot = prot | PROT_USER;
+ vm_object_t *vmo = svmd->svmd_vmo;
+
+ ASSERT(vmo != NULL);
va &= PAGEMASK;
- uintptr_t off = va - (uintptr_t)seg->s_base;
+ uintptr_t off = va - (uintptr_t)seg->s_base + svmd->svmd_off;
do {
pfn_t pfn;
- pfn = vm_object_pfn(svmd->svmd_obj, off);
+ pfn = vm_object_pfn(vmo, off);
if (pfn == PFN_INVALID) {
- return (-1);
+ return (FC_NOMAP);
+ }
+
+ /* Ignore any large-page possibilities for now */
+ hat_devload(hat, (caddr_t)va, PAGESIZE, pfn, uprot, HAT_LOAD);
+ va += PAGESIZE;
+ off += PAGESIZE;
+ } while (va < end);
+
+ return (0);
+}
+
+static int
+segvmm_fault_space(struct hat *hat, struct seg *seg, uintptr_t va, size_t len)
+{
+ segvmm_data_t *svmd = seg->s_data;
+ const uintptr_t end = va + len;
+ const int prot = svmd->svmd_prot;
+ const int uprot = prot | PROT_USER;
+ vm_client_t *vmc = svmd->svmd_vmc;
+
+ ASSERT(vmc != NULL);
+
+ va &= PAGEMASK;
+ uintptr_t off = va - (uintptr_t)seg->s_base + svmd->svmd_off;
+
+ do {
+ vm_page_t *vmp;
+ pfn_t pfn;
+
+ vmp = vmc_hold(vmc, off, prot);
+ if (vmp == NULL) {
+ return (FC_NOMAP);
}
+ pfn = vmp_get_pfn(vmp);
+ ASSERT3U(pfn, !=, PFN_INVALID);
+
/* Ignore any large-page possibilities for now */
- hat_devload(hat, (caddr_t)va, PAGESIZE, pfn, prot, HAT_LOAD);
+ hat_devload(hat, (caddr_t)va, PAGESIZE, pfn, uprot, HAT_LOAD);
+
+ if (vmp_release(vmp)) {
+ /*
+ * Region was unmapped from vmspace while we were
+ * loading it into this AS. Communicate it as if it
+ * were a fault.
+ */
+ hat_unload(hat, (caddr_t)va, PAGESIZE, HAT_UNLOAD);
+ return (FC_NOMAP);
+ }
+
va += PAGESIZE;
off += PAGESIZE;
} while (va < end);
@@ -218,7 +343,7 @@ segvmm_fault_in(struct hat *hat, struct seg *seg, uintptr_t va, size_t len)
/* ARGSUSED */
static faultcode_t
segvmm_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
- enum fault_type type, enum seg_rw tw)
+ enum fault_type type, enum seg_rw rw)
{
segvmm_data_t *svmd = seg->s_data;
int err = 0;
@@ -244,7 +369,11 @@ segvmm_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
VERIFY(type == F_INVAL || type == F_SOFTLOCK);
rw_enter(&svmd->svmd_lock, RW_WRITER);
- err = segvmm_fault_in(hat, seg, (uintptr_t)addr, len);
+ if (svmd->svmd_vmo != NULL) {
+ err = segvmm_fault_obj(hat, seg, (uintptr_t)addr, len);
+ } else {
+ err = segvmm_fault_space(hat, seg, (uintptr_t)addr, len);
+ }
if (type == F_SOFTLOCK && err == 0) {
size_t nval = svmd->svmd_softlockcnt + btop(len);
@@ -426,8 +555,8 @@ segvmm_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
{
segvmm_data_t *svmd = seg->s_data;
- memidp->val[0] = (uintptr_t)svmd->svmd_obj;
- memidp->val[1] = (uintptr_t)(addr - seg->s_base) + svmd->svmd_obj_off;
+ memidp->val[0] = (uintptr_t)svmd->svmd_vmo;
+ memidp->val[1] = (uintptr_t)(addr - seg->s_base) + svmd->svmd_off;
return (0);
}
diff --git a/usr/src/uts/i86pc/io/vmm/sys/seg_vmm.h b/usr/src/uts/i86pc/io/vmm/sys/seg_vmm.h
index a4f72f816e..5ba0dad5c3 100644
--- a/usr/src/uts/i86pc/io/vmm/sys/seg_vmm.h
+++ b/usr/src/uts/i86pc/io/vmm/sys/seg_vmm.h
@@ -21,8 +21,9 @@
typedef struct segvmm_crargs {
uchar_t prot; /* protection */
- vm_object_t obj;
uintptr_t offset;
+ vm_object_t *vmo;
+ vm_client_t *vmc;
} segvmm_crargs_t;
int segvmm_create(struct seg **, void *);
diff --git a/usr/src/uts/i86pc/io/vmm/sys/vmm_gpt.h b/usr/src/uts/i86pc/io/vmm/sys/vmm_gpt.h
index 554f51bbb6..a425fb53ec 100644
--- a/usr/src/uts/i86pc/io/vmm/sys/vmm_gpt.h
+++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_gpt.h
@@ -19,20 +19,6 @@
#include <sys/types.h>
-typedef struct vmm_pt_ops vmm_pt_ops_t;
-struct vmm_pt_ops {
- void * (*vpo_init)(uint64_t *);
- void (*vpo_free)(void *);
- uint64_t (*vpo_wired_cnt)(void *);
- int (*vpo_is_wired)(void *, uint64_t, uint_t *);
- int (*vpo_map)(void *, uint64_t, pfn_t, uint_t, uint_t,
- uint8_t);
- uint64_t (*vpo_unmap)(void *, uint64_t, uint64_t);
-};
-
-extern struct vmm_pt_ops ept_ops;
-extern struct vmm_pt_ops rvi_ops;
-
/*
* Constants for the nodes in the GPT radix tree. Note
* that, in accordance with hardware page table descriptions,
@@ -64,6 +50,8 @@ enum vmm_gpt_node_level {
* vpeo_reset_accessed: Resets the accessed bit on the given PTE. If the
* second argument is `true`, the bit will be set, otherwise it will be
* cleared. Returns non-zero if the previous value of the bit was set.
+ * vpeo_get_pmtp: Generates a properly formatted PML4 (EPTP/nCR3), given the
+ * root PFN for the GPT.
*/
typedef struct vmm_pte_ops vmm_pte_ops_t;
struct vmm_pte_ops {
@@ -74,30 +62,29 @@ struct vmm_pte_ops {
uint_t (*vpeo_pte_prot)(uint64_t);
uint_t (*vpeo_reset_dirty)(uint64_t *, bool);
uint_t (*vpeo_reset_accessed)(uint64_t *, bool);
+ uint64_t (*vpeo_get_pmtp)(pfn_t);
};
+extern vmm_pte_ops_t ept_pte_ops;
+extern vmm_pte_ops_t rvi_pte_ops;
+
struct vmm_gpt;
typedef struct vmm_gpt vmm_gpt_t;
-vmm_gpt_t *ept_create(void);
-vmm_gpt_t *rvi_create(void);
-
vmm_gpt_t *vmm_gpt_alloc(vmm_pte_ops_t *);
void vmm_gpt_free(vmm_gpt_t *);
-void *vmm_gpt_root_kaddr(vmm_gpt_t *);
-pfn_t vmm_gpt_root_pfn(vmm_gpt_t *);
uint64_t *vmm_gpt_lookup(vmm_gpt_t *, uint64_t);
void vmm_gpt_walk(vmm_gpt_t *, uint64_t, uint64_t **, enum vmm_gpt_node_level);
-void vmm_gpt_populate_entry(vmm_gpt_t *, uint64_t);
void vmm_gpt_populate_region(vmm_gpt_t *, uint64_t, uint64_t);
+bool vmm_gpt_map_at(vmm_gpt_t *, uint64_t *, pfn_t, uint_t, uint8_t);
void vmm_gpt_vacate_region(vmm_gpt_t *, uint64_t, uint64_t);
bool vmm_gpt_map(vmm_gpt_t *, uint64_t, pfn_t, uint_t, uint8_t);
bool vmm_gpt_unmap(vmm_gpt_t *, uint64_t);
size_t vmm_gpt_unmap_region(vmm_gpt_t *, uint64_t, uint64_t);
+uint64_t vmm_gpt_get_pmtp(vmm_gpt_t *);
-bool vmm_gpt_is_mapped(vmm_gpt_t *, uint64_t, uint_t *);
-size_t vmm_gpt_mapped_count(vmm_gpt_t *);
+bool vmm_gpt_is_mapped(vmm_gpt_t *, uint64_t *, pfn_t *, uint_t *);
uint_t vmm_gpt_reset_accessed(vmm_gpt_t *, uint64_t *, bool);
uint_t vmm_gpt_reset_dirty(vmm_gpt_t *, uint64_t *, bool);
diff --git a/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h b/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h
index 3a50dafd6d..5f0ba4b875 100644
--- a/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h
+++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h
@@ -48,6 +48,7 @@
#include <sys/sdt.h>
#include <x86/segments.h>
+#include <sys/vmm.h>
SDT_PROVIDER_DECLARE(vmm);
@@ -61,16 +62,15 @@ struct vhpet;
struct vioapic;
struct vlapic;
struct vmspace;
+struct vm_client;
struct vm_object;
struct vm_guest_paging;
-struct pmap;
-typedef int (*vmm_init_func_t)(int ipinum);
+typedef int (*vmm_init_func_t)(void);
typedef int (*vmm_cleanup_func_t)(void);
typedef void (*vmm_resume_func_t)(void);
-typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap);
-typedef int (*vmi_run_func_t)(void *vmi, int vcpu, uint64_t rip,
- struct pmap *pmap);
+typedef void * (*vmi_init_func_t)(struct vm *vm);
+typedef int (*vmi_run_func_t)(void *vmi, int vcpu, uint64_t rip);
typedef void (*vmi_cleanup_func_t)(void *vmi);
typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num,
uint64_t *retval);
@@ -82,8 +82,6 @@ typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num,
const struct seg_desc *desc);
typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
-typedef struct vmspace *(*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max);
-typedef void (*vmi_vmspace_free)(struct vmspace *vmspace);
typedef struct vlapic *(*vmi_vlapic_init)(void *vmi, int vcpu);
typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic);
typedef void (*vmi_savectx)(void *vmi, int vcpu);
@@ -103,8 +101,6 @@ struct vmm_ops {
vmi_set_desc_t vmsetdesc;
vmi_get_cap_t vmgetcap;
vmi_set_cap_t vmsetcap;
- vmi_vmspace_alloc vmspace_alloc;
- vmi_vmspace_free vmspace_free;
vmi_vlapic_init vlapic_init;
vmi_vlapic_cleanup vlapic_cleanup;
@@ -148,9 +144,6 @@ int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
struct vm_object **objptr);
vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm);
-void *vm_gpa_hold(struct vm *, int vcpuid, vm_paddr_t gpa, size_t len,
- int prot, void **cookie);
-void vm_gpa_release(void *cookie);
bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa);
int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
@@ -261,6 +254,7 @@ void *vcpu_stats(struct vm *vm, int vcpu);
void vcpu_notify_event(struct vm *vm, int vcpuid);
void vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t);
struct vmspace *vm_get_vmspace(struct vm *vm);
+struct vm_client *vm_get_vmclient(struct vm *vm, int vcpuid);
struct vatpic *vm_atpic(struct vm *vm);
struct vatpit *vm_atpit(struct vm *vm);
struct vpmtmr *vm_pmtmr(struct vm *vm);
@@ -312,6 +306,7 @@ enum vm_reg_name vm_segment_name(int seg_encoding);
struct vm_copyinfo {
uint64_t gpa;
size_t len;
+ int prot;
void *hva;
void *cookie;
};
@@ -332,9 +327,9 @@ struct vm_copyinfo {
*/
int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
- int num_copyinfo, int *is_fault);
+ uint_t num_copyinfo, int *is_fault);
void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
- int num_copyinfo);
+ uint_t num_copyinfo);
void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
void *kaddr, size_t len);
void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
diff --git a/usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h b/usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h
index 76d5fec8b7..a01b909ff6 100644
--- a/usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h
+++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h
@@ -18,40 +18,64 @@
#ifndef _VMM_VM_H
#define _VMM_VM_H
-#include <sys/list.h>
#include <sys/types.h>
-#include <vm/hat_pte.h>
-#include <machine/pmap.h>
-/*
- * vm_map_wire and vm_map_unwire option flags
- */
-#define VM_MAP_WIRE_SYSTEM 0 /* wiring in a kernel map */
-#define VM_MAP_WIRE_USER 1 /* wiring in a user map */
-
-#define VM_MAP_WIRE_NOHOLES 0 /* region must not have holes */
-#define VM_MAP_WIRE_HOLESOK 2 /* region may have holes */
-
-#define VM_MAP_WIRE_WRITE 4 /* Validate writable. */
-
-/*
- * The following "find_space" options are supported by vm_map_find().
- *
- * For VMFS_ALIGNED_SPACE, the desired alignment is specified to
- * the macro argument as log base 2 of the desired alignment.
- */
-#define VMFS_NO_SPACE 0 /* don't find; use the given range */
-#define VMFS_ANY_SPACE 1 /* find range with any alignment */
-#define VMFS_OPTIMAL_SPACE 2 /* find range with optimal alignment */
-#define VMFS_SUPER_SPACE 3 /* find superpage-aligned range */
-#define VMFS_ALIGNED_SPACE(x) ((x) << 8) /* find range with fixed alignment */
+typedef struct vmspace vmspace_t;
+typedef struct vm_client vm_client_t;
+typedef struct vm_page vm_page_t;
+typedef struct vm_object vm_object_t;
+
+struct vmm_pte_ops;
+
+typedef void (*vmc_inval_cb_t)(void *, uintptr_t, size_t);
+
+/* vmspace_t operations */
+vmspace_t *vmspace_alloc(size_t, struct vmm_pte_ops *, bool);
+void vmspace_destroy(vmspace_t *);
+int vmspace_map(vmspace_t *, vm_object_t *, uintptr_t, uintptr_t, size_t,
+ uint8_t);
+int vmspace_unmap(vmspace_t *, uintptr_t, uintptr_t);
+int vmspace_populate(vmspace_t *, uintptr_t, uintptr_t);
+vm_client_t *vmspace_client_alloc(vmspace_t *);
+uint64_t vmspace_table_root(vmspace_t *);
+uint64_t vmspace_table_gen(vmspace_t *);
+uint64_t vmspace_resident_count(vmspace_t *);
+
+/* vm_client_t operations */
+vm_page_t *vmc_hold(vm_client_t *, uintptr_t, int);
+uint64_t vmc_table_enter(vm_client_t *);
+void vmc_table_exit(vm_client_t *);
+int vmc_fault(vm_client_t *, uintptr_t, int);
+vm_client_t *vmc_clone(vm_client_t *);
+int vmc_set_inval_cb(vm_client_t *, vmc_inval_cb_t, void *);
+void vmc_destroy(vm_client_t *);
+
+/* vm_object_t operations */
+vm_object_t *vm_object_mem_allocate(size_t, bool);
+vm_object_t *vmm_mmio_alloc(vmspace_t *, uintptr_t, size_t, uintptr_t);
+void vm_object_reference(vm_object_t *);
+void vm_object_release(vm_object_t *);
+pfn_t vm_object_pfn(vm_object_t *, uintptr_t);
+
+/* vm_page_t operations */
+const void *vmp_get_readable(const vm_page_t *);
+void *vmp_get_writable(const vm_page_t *);
+pfn_t vmp_get_pfn(const vm_page_t *);
+void vmp_chain(vm_page_t *, vm_page_t *);
+vm_page_t *vmp_next(const vm_page_t *);
+bool vmp_release(vm_page_t *);
+bool vmp_release_chain(vm_page_t *);
+
+/* seg_vmm mapping */
+struct vm;
+int vm_segmap_obj(struct vm *, int, off_t, off_t, struct as *, caddr_t *,
+ uint_t, uint_t, uint_t);
+int vm_segmap_space(struct vm *, off_t, struct as *, caddr_t *, off_t, uint_t,
+ uint_t, uint_t);
-/*
- * vm_fault option flags
- */
-#define VM_FAULT_NORMAL 0 /* Nothing special */
-#define VM_FAULT_WIRE 1 /* Wire the mapped page */
-#define VM_FAULT_DIRTY 2 /* Dirty the page; use w/PROT_COPY */
+/* Glue functions */
+vm_paddr_t vtophys(void *);
+void invalidate_cache_all(void);
/*
* The VM_MAXUSER_ADDRESS determines the upper size limit of a vmspace.
@@ -61,131 +85,4 @@
*/
#define VM_MAXUSER_ADDRESS 0x00003ffffffffffful
-/*
- * Type definitions used in the hypervisor.
- */
-typedef uchar_t vm_prot_t;
-
-/* New type declarations. */
-struct vm;
-struct vmspace;
-struct pmap;
-
-struct vm_object;
-typedef struct vm_object *vm_object_t;
-
-struct vmm_pt_ops;
-
-struct vm_page;
-typedef struct vm_page *vm_page_t;
-
-enum obj_type { OBJT_DEFAULT, OBJT_SWAP, OBJT_VNODE, OBJT_DEVICE, OBJT_PHYS,
- OBJT_DEAD, OBJT_SG, OBJT_MGTDEVICE };
-typedef uchar_t objtype_t;
-
-union vm_map_object;
-typedef union vm_map_object vm_map_object_t;
-
-struct vm_map_entry;
-typedef struct vm_map_entry *vm_map_entry_t;
-
-struct vm_map;
-typedef struct vm_map *vm_map_t;
-
-pmap_t vmspace_pmap(struct vmspace *);
-
-int vm_map_find(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t,
- vm_offset_t, int, vm_prot_t, vm_prot_t, int);
-int vm_map_remove(vm_map_t, vm_offset_t, vm_offset_t);
-int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags);
-
-long vmspace_resident_count(struct vmspace *vmspace);
-
-void pmap_invalidate_cache(void);
-void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num);
-int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype);
-long pmap_wired_count(pmap_t pmap);
-
-struct vm_map {
- struct vmspace *vmm_space;
-};
-
-struct pmap {
- void *pm_pml4;
- cpuset_t pm_active;
- long pm_eptgen;
-
- /* Implementation private */
- enum pmap_type pm_type;
- struct vmm_pt_ops *pm_ops;
- void *pm_impl;
-};
-
-struct vmspace {
- struct vm_map vm_map;
-
- /* Implementation private */
- kmutex_t vms_lock;
- boolean_t vms_map_changing;
- struct pmap vms_pmap;
- uintptr_t vms_size; /* fixed after creation */
-
- list_t vms_maplist;
-};
-
-typedef pfn_t (*vm_pager_fn_t)(vm_object_t, uintptr_t, pfn_t *, uint_t *);
-
-struct vm_object {
- uint_t vmo_refcnt; /* manipulated with atomic ops */
-
- /* This group of fields are fixed at creation time */
- objtype_t vmo_type;
- size_t vmo_size;
- vm_pager_fn_t vmo_pager;
- void *vmo_data;
-
- kmutex_t vmo_lock; /* protects fields below */
- vm_memattr_t vmo_attr;
-};
-
-struct vm_page {
- kmutex_t vmp_lock;
- pfn_t vmp_pfn;
- struct vm_object *vmp_obj_held;
-};
-
-/* illumos-specific functions for setup and operation */
-int vm_segmap_obj(vm_object_t, off_t, size_t, struct as *, caddr_t *, uint_t,
- uint_t, uint_t);
-int vm_segmap_space(struct vmspace *, off_t, struct as *, caddr_t *, off_t,
- uint_t, uint_t, uint_t);
-void *vmspace_find_kva(struct vmspace *, uintptr_t, size_t);
-
-typedef int (*pmap_pinit_t)(struct pmap *pmap);
-
-struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t, pmap_pinit_t);
-void vmspace_free(struct vmspace *);
-
-int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int);
-int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
- vm_prot_t prot, vm_page_t *ma, int max_count);
-
-struct vm_object *vm_object_allocate(objtype_t, vm_pindex_t, bool);
-void vm_object_deallocate(vm_object_t);
-void vm_object_reference(vm_object_t);
-int vm_object_set_memattr(vm_object_t, vm_memattr_t);
-pfn_t vm_object_pfn(vm_object_t, uintptr_t);
-
-#define VM_OBJECT_WLOCK(vmo) mutex_enter(&(vmo)->vmo_lock)
-#define VM_OBJECT_WUNLOCK(vmo) mutex_exit(&(vmo)->vmo_lock)
-
-#define PQ_ACTIVE 1
-
-void vm_page_unwire(vm_page_t, uint8_t);
-
-#define VM_PAGE_TO_PHYS(page) (mmu_ptob((uintptr_t)((page)->vmp_pfn)))
-
-vm_object_t vm_pager_allocate(objtype_t, void *, vm_ooffset_t, vm_prot_t,
- vm_ooffset_t, void *);
-
#endif /* _VMM_VM_H */
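
The hold/release protocol declared above is what the rest of the patch converts callers to (see vm_iommu_modify() and vm_copy_setup() in vmm.c below). Here is a short usage sketch, assuming a vm_client_t obtained elsewhere plus the usual illumos kernel headers for PROT_READ and PAGEMASK; it is illustrative only, not code from the change.

/* Illustrative only: translate a GPA to a host PFN via the client API above. */
static int
gpa_to_pfn(vm_client_t *vmc, uintptr_t gpa, pfn_t *pfnp)
{
	vm_page_t *vmp;

	/* Hold the page backing this (page-aligned) GPA for reading. */
	vmp = vmc_hold(vmc, gpa & PAGEMASK, PROT_READ);
	if (vmp == NULL)
		return (-1);

	*pfnp = vmp_get_pfn(vmp);

	/* Drop the hold; a 'true' return means the backing went away. */
	(void) vmp_release(vmp);
	return (0);
}
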
diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c
index f95e415e40..998e483ecf 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm.c
@@ -58,12 +58,10 @@ __FBSDID("$FreeBSD$");
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
-#include <sys/smp.h>
#include <sys/systm.h>
#include <sys/sunddi.h>
#include <machine/pcb.h>
-#include <machine/smp.h>
#include <machine/md_var.h>
#include <x86/psl.h>
#include <x86/apicreg.h>
@@ -74,11 +72,11 @@ __FBSDID("$FreeBSD$");
#include <machine/vmparam.h>
#include <sys/vmm_instruction_emul.h>
#include <sys/vmm_vm.h>
+#include <sys/vmm_gpt.h>
#include "vmm_ioport.h"
#include "vmm_ktr.h"
#include "vmm_host.h"
-#include "vmm_mem.h"
#include "vmm_util.h"
#include "vatpic.h"
#include "vatpit.h"
@@ -129,6 +127,7 @@ struct vcpu {
struct vm_exit exitinfo; /* (x) exit reason and collateral */
uint64_t nextrip; /* (x) next instruction to execute */
struct vie *vie_ctx; /* (x) instruction emulation context */
+ vm_client_t *vmclient; /* (a) VM-system client */
uint64_t tsc_offset; /* (x) offset from host TSC */
enum vcpu_ustate ustate; /* (i) microstate for the vcpu */
@@ -145,7 +144,7 @@ struct vcpu {
struct mem_seg {
size_t len;
bool sysmem;
- struct vm_object *object;
+ vm_object_t *object;
};
#define VM_MAX_MEMSEGS 4
@@ -219,8 +218,6 @@ static struct vmm_ops vmm_ops_null = {
.vmsetdesc = (vmi_set_desc_t)nullop_panic,
.vmgetcap = (vmi_get_cap_t)nullop_panic,
.vmsetcap = (vmi_set_cap_t)nullop_panic,
- .vmspace_alloc = (vmi_vmspace_alloc)nullop_panic,
- .vmspace_free = (vmi_vmspace_free)nullop_panic,
.vlapic_init = (vmi_vlapic_init)nullop_panic,
.vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic,
.vmsavectx = (vmi_savectx)nullop_panic,
@@ -228,17 +225,15 @@ static struct vmm_ops vmm_ops_null = {
};
static struct vmm_ops *ops = &vmm_ops_null;
+static vmm_pte_ops_t *pte_ops = NULL;
-#define VMM_INIT(num) ((*ops->init)(num))
+#define VMM_INIT() ((*ops->init)())
#define VMM_CLEANUP() ((*ops->cleanup)())
#define VMM_RESUME() ((*ops->resume)())
-#define VMINIT(vm, pmap) ((*ops->vminit)(vm, pmap))
-#define VMRUN(vmi, vcpu, rip, pmap) \
- ((*ops->vmrun)(vmi, vcpu, rip, pmap))
+#define VMINIT(vm) ((*ops->vminit)(vm))
+#define VMRUN(vmi, vcpu, rip) ((*ops->vmrun)(vmi, vcpu, rip))
#define VMCLEANUP(vmi) ((*ops->vmcleanup)(vmi))
-#define VMSPACE_ALLOC(min, max) ((*ops->vmspace_alloc)(min, max))
-#define VMSPACE_FREE(vmspace) ((*ops->vmspace_free)(vmspace))
#define VMGETREG(vmi, vcpu, num, rv) ((*ops->vmgetreg)(vmi, vcpu, num, rv))
#define VMSETREG(vmi, vcpu, num, val) ((*ops->vmsetreg)(vmi, vcpu, num, val))
@@ -265,9 +260,6 @@ SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
*/
static int halt_detection_enabled = 1;
-/* IPI vector used for vcpu notifications */
-static int vmm_ipinum;
-
/* Trap into hypervisor on all guest exceptions and reflect them back */
static int trace_guest_exceptions;
@@ -319,6 +311,8 @@ vcpu_cleanup(struct vm *vm, int i, bool destroy)
fpu_save_area_free(vcpu->guestfpu);
vie_free(vcpu->vie_ctx);
vcpu->vie_ctx = NULL;
+ vmc_destroy(vcpu->vmclient);
+ vcpu->vmclient = NULL;
}
}
@@ -397,25 +391,19 @@ vm_vie_ctx(struct vm *vm, int cpuid)
static int
vmm_init(void)
{
- int error;
-
vmm_host_state_init();
- /* We use cpu_poke() for IPIs */
- vmm_ipinum = 0;
-
- error = vmm_mem_init();
- if (error)
- return (error);
-
- if (vmm_is_intel())
+ if (vmm_is_intel()) {
ops = &vmm_ops_intel;
- else if (vmm_is_svm())
+ pte_ops = &ept_pte_ops;
+ } else if (vmm_is_svm()) {
ops = &vmm_ops_amd;
- else
+ pte_ops = &rvi_pte_ops;
+ } else {
return (ENXIO);
+ }
- return (VMM_INIT(vmm_ipinum));
+ return (VMM_INIT());
}
int
@@ -453,7 +441,7 @@ vm_init(struct vm *vm, bool create)
{
int i;
- vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
+ vm->cookie = VMINIT(vm);
vm->iommu = NULL;
vm->vioapic = vioapic_init(vm);
vm->vhpet = vhpet_init(vm);
@@ -492,6 +480,12 @@ vm_init(struct vm *vm, bool create)
uint_t cores_per_package = 1;
uint_t threads_per_core = 1;
+/*
+ * Debugging tunable to enable dirty-page-tracking.
+ * (Remains off by default for now)
+ */
+bool gpt_track_dirty = false;
+
int
vm_create(const char *name, uint64_t flags, struct vm **retvm)
{
@@ -508,14 +502,18 @@ vm_create(const char *name, uint64_t flags, struct vm **retvm)
/* Name validation has already occurred */
VERIFY3U(strnlen(name, VM_MAX_NAMELEN), <, VM_MAX_NAMELEN);
- vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS);
+ vmspace = vmspace_alloc(VM_MAXUSER_ADDRESS, pte_ops, gpt_track_dirty);
if (vmspace == NULL)
return (ENOMEM);
vm = malloc(sizeof (struct vm), M_VM, M_WAITOK | M_ZERO);
strcpy(vm->name, name);
+
vm->vmspace = vmspace;
vm->mem_transient = (flags & VCF_RESERVOIR_MEM) == 0;
+ for (uint_t i = 0; i < VM_MAXCPU; i++) {
+ vm->vcpu[i].vmclient = vmspace_client_alloc(vmspace);
+ }
vm->sockets = 1;
vm->cores = cores_per_package; /* XXX backwards compatibility */
@@ -621,7 +619,7 @@ vm_cleanup(struct vm *vm, bool destroy)
for (i = 0; i < VM_MAX_MEMSEGS; i++)
vm_free_memseg(vm, i);
- VMSPACE_FREE(vm->vmspace);
+ vmspace_destroy(vm->vmspace);
vm->vmspace = NULL;
}
}
@@ -681,7 +679,7 @@ vm_name(struct vm *vm)
int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
- vm_object_t obj;
+ vm_object_t *obj;
if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
return (ENOMEM);
@@ -692,7 +690,7 @@ vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{
- return (vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len));
+ return (vmspace_unmap(vm->vmspace, gpa, gpa + len));
}
/*
@@ -730,7 +728,7 @@ int
vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
{
struct mem_seg *seg;
- vm_object_t obj;
+ vm_object_t *obj;
if (ident < 0 || ident >= VM_MAX_MEMSEGS)
return (EINVAL);
@@ -746,8 +744,7 @@ vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
return (EINVAL);
}
- obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT,
- vm->mem_transient);
+ obj = vm_object_mem_allocate(len, vm->mem_transient);
if (obj == NULL)
return (ENOMEM);
@@ -759,7 +756,7 @@ vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
int
vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
- vm_object_t *objptr)
+ vm_object_t **objptr)
{
struct mem_seg *seg;
@@ -786,7 +783,7 @@ vm_free_memseg(struct vm *vm, int ident)
seg = &vm->mem_segs[ident];
if (seg->object != NULL) {
- vm_object_deallocate(seg->object);
+ vm_object_release(seg->object);
bzero(seg, sizeof (struct mem_seg));
}
}
@@ -832,18 +829,16 @@ vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
if (map == NULL)
return (ENOSPC);
- error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
- len, 0, VMFS_NO_SPACE, prot, prot, 0);
+ error = vmspace_map(vm->vmspace, seg->object, first, gpa, len, prot);
if (error != 0)
return (EFAULT);
vm_object_reference(seg->object);
if ((flags & VM_MEMMAP_F_WIRED) != 0) {
- error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
- VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
+ error = vmspace_populate(vm->vmspace, gpa, gpa + len);
if (error != 0) {
- vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
+ vmspace_unmap(vm->vmspace, gpa, gpa + len);
return (EFAULT);
}
}
@@ -917,9 +912,9 @@ vm_free_memmap(struct vm *vm, int ident)
mm = &vm->mem_maps[ident];
if (mm->len) {
- error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
+ error = vmspace_unmap(vm->vmspace, mm->gpa,
mm->gpa + mm->len);
- KASSERT(error == 0, ("%s: vm_map_remove error %d",
+ KASSERT(error == 0, ("%s: vmspace_unmap error %d",
__func__, error));
bzero(mm, sizeof (struct mem_map));
}
@@ -961,12 +956,14 @@ vm_iommu_modify(struct vm *vm, bool map)
struct mem_map *mm;
#ifdef __FreeBSD__
void *vp, *cookie, *host_domain;
-#else
- void *vp, *cookie, *host_domain __unused;
#endif
+ vm_client_t *vmc;
sz = PAGE_SIZE;
+#ifdef __FreeBSD__
host_domain = iommu_host_domain();
+#endif
+ vmc = vmspace_client_alloc(vm->vmspace);
for (i = 0; i < VM_MAX_MEMMAPS; i++) {
mm = &vm->mem_maps[i];
@@ -991,14 +988,13 @@ vm_iommu_modify(struct vm *vm, bool map)
gpa = mm->gpa;
while (gpa < mm->gpa + mm->len) {
- vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, PROT_WRITE,
- &cookie);
- KASSERT(vp != NULL, ("vm(%s) could not map gpa %lx",
- vm_name(vm), gpa));
+ vm_page_t *vmp;
- vm_gpa_release(cookie);
+ vmp = vmc_hold(vmc, gpa, PROT_WRITE);
+ ASSERT(vmp != NULL);
+ hpa = ((uintptr_t)vmp_get_pfn(vmp) << PAGESHIFT);
+ vmp_release(vmp);
- hpa = DMAP_TO_PHYS((uintptr_t)vp);
if (map) {
iommu_create_mapping(vm->iommu, gpa, hpa, sz);
#ifdef __FreeBSD__
@@ -1014,6 +1010,7 @@ vm_iommu_modify(struct vm *vm, bool map)
gpa += PAGE_SIZE;
}
}
+ vmc_destroy(vmc);
/*
* Invalidate the cached translations associated with the domain
@@ -1029,9 +1026,6 @@ vm_iommu_modify(struct vm *vm, bool map)
#endif
}
-#define vm_iommu_unmap(vm) vm_iommu_modify((vm), false)
-#define vm_iommu_map(vm) vm_iommu_modify((vm), true)
-
int
vm_unassign_pptdev(struct vm *vm, int pptfd)
{
@@ -1042,7 +1036,7 @@ vm_unassign_pptdev(struct vm *vm, int pptfd)
return (error);
if (ppt_assigned_devices(vm) == 0)
- vm_iommu_unmap(vm);
+ vm_iommu_modify(vm, false);
return (0);
}
@@ -1061,71 +1055,13 @@ vm_assign_pptdev(struct vm *vm, int pptfd)
vm->iommu = iommu_create_domain(maxaddr);
if (vm->iommu == NULL)
return (ENXIO);
- vm_iommu_map(vm);
+ vm_iommu_modify(vm, true);
}
error = ppt_assign_device(vm, pptfd);
return (error);
}
-void *
-vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot,
- void **cookie)
-{
- int i, count, pageoff;
- struct mem_map *mm;
- vm_page_t m;
-#ifdef INVARIANTS
- /*
- * All vcpus are frozen by ioctls that modify the memory map
- * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is
- * guaranteed if at least one vcpu is in the VCPU_FROZEN state.
- */
- int state;
- KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d",
- __func__, vcpuid));
- for (i = 0; i < vm->maxcpus; i++) {
- if (vcpuid != -1 && vcpuid != i)
- continue;
- state = vcpu_get_state(vm, i, NULL);
- KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
- __func__, state));
- }
-#endif
- pageoff = gpa & PAGE_MASK;
- if (len > PAGE_SIZE - pageoff)
- panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
-
- count = 0;
- for (i = 0; i < VM_MAX_MEMMAPS; i++) {
- mm = &vm->mem_maps[i];
- if (mm->len == 0) {
- continue;
- }
- if (gpa >= mm->gpa && gpa < mm->gpa + mm->len) {
- count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
- trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
- break;
- }
- }
-
- if (count == 1) {
- *cookie = m;
- return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
- } else {
- *cookie = NULL;
- return (NULL);
- }
-}
-
-void
-vm_gpa_release(void *cookie)
-{
- vm_page_t m = cookie;
-
- vm_page_unwire(m, PQ_ACTIVE);
-}
-
int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{
@@ -1478,13 +1414,10 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
static int
vm_handle_paging(struct vm *vm, int vcpuid)
{
+ struct vcpu *vcpu = &vm->vcpu[vcpuid];
+ vm_client_t *vmc = vcpu->vmclient;
+ struct vm_exit *vme = &vcpu->exitinfo;
int rv, ftype;
- struct vm_map *map;
- struct vcpu *vcpu;
- struct vm_exit *vme;
-
- vcpu = &vm->vcpu[vcpuid];
- vme = &vcpu->exitinfo;
KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
__func__, vme->inst_length));
@@ -1494,26 +1427,13 @@ vm_handle_paging(struct vm *vm, int vcpuid)
ftype == PROT_WRITE || ftype == PROT_EXEC,
("vm_handle_paging: invalid fault_type %d", ftype));
- if (ftype == PROT_READ || ftype == PROT_WRITE) {
- rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
- vme->u.paging.gpa, ftype);
- if (rv == 0) {
- VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %lx",
- ftype == PROT_READ ? "accessed" : "dirty",
- vme->u.paging.gpa);
- goto done;
- }
- }
-
- map = &vm->vmspace->vm_map;
- rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
+ rv = vmc_fault(vmc, vme->u.paging.gpa, ftype);
VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %lx, "
"ftype = %d", rv, vme->u.paging.gpa, ftype);
if (rv != 0)
return (EFAULT);
-done:
return (0);
}
@@ -2221,7 +2141,6 @@ vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
struct vcpu *vcpu;
struct vm_exit *vme;
bool intr_disabled;
- pmap_t pmap;
vm_thread_ctx_t vtc;
int affinity_type = CPU_CURRENT;
@@ -2230,7 +2149,6 @@ vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
if (!CPU_ISSET(vcpuid, &vm->active_cpus))
return (EINVAL);
- pmap = vmspace_pmap(vm->vmspace);
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
@@ -2266,9 +2184,6 @@ restart:
affinity_type = CPU_CURRENT;
critical_enter();
- KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
- ("vm_run: absurd pm_active"));
-
/* Force a trip through update_sregs to reload %fs/%gs and friends */
PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb);
@@ -2279,7 +2194,7 @@ restart:
vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
- error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap);
+ error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip);
vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
/*
@@ -3355,10 +3270,9 @@ vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype)
KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
if (hostcpu != curcpu) {
if (ntype == VCPU_NOTIFY_APIC) {
- vlapic_post_intr(vcpu->vlapic, hostcpu,
- vmm_ipinum);
+ vlapic_post_intr(vcpu->vlapic, hostcpu);
} else {
- ipi_cpu(hostcpu, vmm_ipinum);
+ poke_cpu(hostcpu);
}
} else {
/*
@@ -3427,6 +3341,12 @@ vm_get_vmspace(struct vm *vm)
return (vm->vmspace);
}
+struct vm_client *
+vm_get_vmclient(struct vm *vm, int vcpuid)
+{
+ return (vm->vcpu[vcpuid].vmclient);
+}
+
int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
@@ -3481,13 +3401,12 @@ vm_segment_name(int seg)
void
vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
- int num_copyinfo)
+ uint_t num_copyinfo)
{
- int idx;
-
- for (idx = 0; idx < num_copyinfo; idx++) {
- if (copyinfo[idx].cookie != NULL)
- vm_gpa_release(copyinfo[idx].cookie);
+ for (uint_t idx = 0; idx < num_copyinfo; idx++) {
+ if (copyinfo[idx].cookie != NULL) {
+ vmp_release((vm_page_t *)copyinfo[idx].cookie);
+ }
}
bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo));
}
@@ -3495,24 +3414,26 @@ vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
int
vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
- int num_copyinfo, int *fault)
+ uint_t num_copyinfo, int *fault)
{
- int error, idx, nused;
+ uint_t idx, nused;
size_t n, off, remaining;
- void *hva, *cookie;
- uint64_t gpa;
+ vm_client_t *vmc = vm_get_vmclient(vm, vcpuid);
bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo);
nused = 0;
remaining = len;
while (remaining > 0) {
+ uint64_t gpa;
+ int error;
+
KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
if (error || *fault)
return (error);
- off = gpa & PAGE_MASK;
- n = min(remaining, PAGE_SIZE - off);
+ off = gpa & PAGEOFFSET;
+ n = min(remaining, PAGESIZE - off);
copyinfo[nused].gpa = gpa;
copyinfo[nused].len = n;
remaining -= n;
@@ -3521,12 +3442,21 @@ vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
}
for (idx = 0; idx < nused; idx++) {
- hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa,
- copyinfo[idx].len, prot, &cookie);
- if (hva == NULL)
+ vm_page_t *vmp;
+ caddr_t hva;
+
+ vmp = vmc_hold(vmc, copyinfo[idx].gpa & PAGEMASK, prot);
+ if (vmp == NULL) {
break;
- copyinfo[idx].hva = hva;
- copyinfo[idx].cookie = cookie;
+ }
+ if ((prot & PROT_WRITE) != 0) {
+ hva = (caddr_t)vmp_get_writable(vmp);
+ } else {
+ hva = (caddr_t)vmp_get_readable(vmp);
+ }
+ copyinfo[idx].hva = hva + (copyinfo[idx].gpa & PAGEOFFSET);
+ copyinfo[idx].cookie = vmp;
+ copyinfo[idx].prot = prot;
}
if (idx != nused) {
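
The chunking in vm_copy_setup() above bounds each copyinfo entry to a single guest page: the offset within the page is gpa & PAGEOFFSET and the chunk length is min(remaining, PAGESIZE - off). A runnable illustration of just that arithmetic follows; the GLA-to-GPA translation is omitted and PAGESIZE is assumed to be 4 KiB.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define	PAGESIZE	4096UL
#define	PAGEOFFSET	(PAGESIZE - 1)

int
main(void)
{
	uint64_t gpa = 0x1ffe0;		/* example guest-physical address */
	size_t remaining = 0x50;	/* example length; crosses a page */

	while (remaining > 0) {
		size_t off = gpa & PAGEOFFSET;
		size_t n = remaining < PAGESIZE - off ?
		    remaining : PAGESIZE - off;

		printf("chunk: gpa 0x%llx len 0x%zx\n",
		    (unsigned long long)gpa, n);
		remaining -= n;
		gpa += n;
	}
	return (0);
}
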
@@ -3548,6 +3478,8 @@ vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
dst = kaddr;
idx = 0;
while (len > 0) {
+ ASSERT(copyinfo[idx].prot & PROT_READ);
+
bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
len -= copyinfo[idx].len;
dst += copyinfo[idx].len;
@@ -3565,6 +3497,8 @@ vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
src = kaddr;
idx = 0;
while (len > 0) {
+ ASSERT(copyinfo[idx].prot & PROT_WRITE);
+
bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
len -= copyinfo[idx].len;
src += copyinfo[idx].len;
@@ -3577,30 +3511,17 @@ vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
 * these are global stats, only return the values for vCPU 0
*/
VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
-VMM_STAT_DECLARE(VMM_MEM_WIRED);
static void
vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{
-
if (vcpu == 0) {
vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
PAGE_SIZE * vmspace_resident_count(vm->vmspace));
}
}
-static void
-vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
-{
-
- if (vcpu == 0) {
- vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
- PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
- }
-}
-
VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
-VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
int
vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port,
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_gpt.c b/usr/src/uts/i86pc/io/vmm/vmm_gpt.c
index 9f6cc44aac..146ad958a8 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_gpt.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_gpt.c
@@ -92,7 +92,8 @@ struct vmm_gpt_node {
vmm_gpt_node_t *vgn_children;
vmm_gpt_node_t *vgn_siblings;
uint64_t *vgn_entries;
- uint64_t _vgn_pad[2];
+ uint64_t vgn_gpa;
+ uint64_t _vgn_pad;
};
/*
@@ -107,7 +108,6 @@ struct vmm_gpt_node {
struct vmm_gpt {
vmm_gpt_node_t *vgpt_root;
vmm_pte_ops_t *vgpt_pte_ops;
- uint64_t vgpt_mapped_page_count;
};
/*
@@ -153,24 +153,6 @@ vmm_gpt_alloc(vmm_pte_ops_t *pte_ops)
}
/*
- * Retrieves the host kernel address of the GPT root.
- */
-void *
-vmm_gpt_root_kaddr(vmm_gpt_t *gpt)
-{
- return (gpt->vgpt_root->vgn_entries);
-}
-
-/*
- * Retrieves the host PFN of the GPT root.
- */
-uint64_t
-vmm_gpt_root_pfn(vmm_gpt_t *gpt)
-{
- return (gpt->vgpt_root->vgn_host_pfn);
-}
-
-/*
* Frees the given node, first nulling out all of its links to other nodes in
 * the tree, adjusting its parent's reference count, and unlinking itself from
 * its parent's page table.
@@ -310,11 +292,18 @@ vmm_gpt_add_child(vmm_gpt_t *gpt, vmm_gpt_node_t *parent, vmm_gpt_node_t *child,
ASSERT(gpt->vgpt_pte_ops != NULL);
ASSERT(parent != NULL);
ASSERT(child != NULL);
+ ASSERT3U(parent->vgn_level, <, LEVEL1);
+ const uint64_t gpa_mask[3] = {
+ [LEVEL4] = 0xffffff8000000000ul, /* entries cover 512G */
+ [LEVEL3] = 0xffffffffc0000000ul, /* entries cover 1G */
+ [LEVEL2] = 0xffffffffffe00000ul, /* entries cover 2M */
+ };
const int index = vmm_gpt_node_index(gpa, parent->vgn_level);
child->vgn_index = index;
child->vgn_level = parent->vgn_level + 1;
child->vgn_parent = parent;
+ child->vgn_gpa = gpa & gpa_mask[parent->vgn_level];
parent_entries = parent->vgn_entries;
entry = gpt->vgpt_pte_ops->vpeo_map_table(child->vgn_host_pfn);
parent_entries[index] = entry;
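
The gpa_mask table added above simply truncates a GPA to the region covered by one entry of the parent table. The small standalone program below (an illustration, not part of the change) reproduces those constants from the level geometry: 9 index bits per level on top of 4 KiB pages.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* Levels counted from the leaf: 1 => 2M, 2 => 1G, 3 => 512G. */
	for (int level = 1; level <= 3; level++) {
		uint64_t coverage = 1ULL << (12 + 9 * level);
		uint64_t mask = ~(coverage - 1);

		printf("level %d: covers %llu MiB, mask 0x%016llx\n",
		    level, (unsigned long long)(coverage >> 20),
		    (unsigned long long)mask);
	}
	return (0);
}
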
@@ -338,12 +327,14 @@ vmm_gpt_add_child(vmm_gpt_t *gpt, vmm_gpt_node_t *parent, vmm_gpt_node_t *child,
* that this does not actually map the entry, but simply ensures that the
* entries exist.
*/
-void
+static void
vmm_gpt_populate_entry(vmm_gpt_t *gpt, uint64_t gpa)
{
vmm_gpt_node_t *node, *child;
ASSERT(gpt != NULL);
+ ASSERT0(gpa & PAGEOFFSET);
+
node = gpt->vgpt_root;
for (uint_t i = 0; i < LEVEL1; i++) {
ASSERT(node != NULL);
@@ -364,41 +355,53 @@ vmm_gpt_populate_entry(vmm_gpt_t *gpt, uint64_t gpa)
void
vmm_gpt_populate_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
{
+ ASSERT0(start & PAGEOFFSET);
+ ASSERT0(end & PAGEOFFSET);
+
for (uint64_t page = start; page < end; page += PAGESIZE) {
vmm_gpt_populate_entry(gpt, page);
}
}
/*
- * Inserts an entry for a given GPA into the table. The caller must
- * ensure that the entry is not currently mapped, though note that this
- * can race with another thread inserting the same page into the tree.
- * If we lose the race, we ensure that the page we thought we were
- * inserting is the page that was inserted.
+ * Format a PTE and install it in the provided PTE-pointer.
*/
bool
-vmm_gpt_map(vmm_gpt_t *gpt, uint64_t gpa, pfn_t pfn, uint_t prot, uint8_t attr)
+vmm_gpt_map_at(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t pfn, uint_t prot,
+ uint8_t attr)
{
- uint64_t *entries[MAX_GPT_LEVEL], entry, old_entry;
-
- ASSERT(gpt != NULL);
- vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
- ASSERT(entries[LEVEL1] != NULL);
+ uint64_t entry, old_entry;
entry = gpt->vgpt_pte_ops->vpeo_map_page(pfn, prot, attr);
- old_entry = atomic_cas_64(entries[LEVEL1], 0, entry);
+ old_entry = atomic_cas_64(ptep, 0, entry);
if (old_entry != 0) {
- ASSERT3U(gpt->vgpt_pte_ops->vpeo_pte_pfn(entry),
- ==,
+ ASSERT3U(gpt->vgpt_pte_ops->vpeo_pte_pfn(entry), ==,
gpt->vgpt_pte_ops->vpeo_pte_pfn(old_entry));
return (false);
}
- gpt->vgpt_mapped_page_count++;
return (true);
}
/*
+ * Inserts an entry for a given GPA into the table. The caller must
+ * ensure that a conflicting PFN is not mapped at the requested location.
+ * Racing operations to map the same PFN at one location are acceptable and
+ * properly handled.
+ */
+bool
+vmm_gpt_map(vmm_gpt_t *gpt, uint64_t gpa, pfn_t pfn, uint_t prot, uint8_t attr)
+{
+ uint64_t *entries[MAX_GPT_LEVEL];
+
+ ASSERT(gpt != NULL);
+ vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
+ ASSERT(entries[LEVEL1] != NULL);
+
+ return (vmm_gpt_map_at(gpt, entries[LEVEL1], pfn, prot, attr));
+}
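
vmm_gpt_map_at() and vmm_gpt_map() above rely on a single atomic compare-and-swap against an empty slot, so a thread that loses the race to an identical entry can simply accept the existing mapping. A minimal sketch of that shape using C11 atomics in place of the kernel's atomic_cas_64(); the PTE encoding here is a placeholder, not the real EPT/RVI format.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/*
 * Install 'new_pte' only if the slot is currently empty.  A false return
 * means another thread populated the slot first; the code above additionally
 * asserts that the previously installed PFN matches the requested one.
 */
static bool
install_pte(_Atomic uint64_t *ptep, uint64_t new_pte)
{
	uint64_t expected = 0;

	return (atomic_compare_exchange_strong(ptep, &expected, new_pte));
}
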
+
+/*
* Removes a child node from its parent's list of children, and then frees
* the now-orphaned child.
*/
@@ -421,9 +424,8 @@ vmm_gpt_node_remove_child(vmm_gpt_node_t *parent, vmm_gpt_node_t *child)
}
/*
- * Cleans up unused inner nodes in the GPT. Asserts that the
- * leaf corresponding to the entry does not map any additional
- * pages.
+ * Cleans up unused inner nodes in the GPT. Asserts that the leaf corresponding
+ * to the entry does not map any additional pages.
*/
static void
vmm_gpt_vacate_entry(vmm_gpt_t *gpt, uint64_t gpa)
@@ -450,27 +452,28 @@ vmm_gpt_vacate_entry(vmm_gpt_t *gpt, uint64_t gpa)
}
/*
- * Cleans up the unused inner nodes in the GPT for a region of guest
- * physical address space bounded by [start..end). The region must
- * map no pages.
+ * Cleans up the unused inner nodes in the GPT for a region of guest physical
+ * address space bounded by [start, end). The region must map no pages.
*/
void
vmm_gpt_vacate_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
{
+ ASSERT0(start & PAGEOFFSET);
+ ASSERT0(end & PAGEOFFSET);
+
for (uint64_t page = start; page < end; page += PAGESIZE) {
vmm_gpt_vacate_entry(gpt, page);
}
}
/*
- * Remove a mapping from the table. Returns false if the page was not
- * mapped, otherwise returns true.
+ * Remove a mapping from the table. Returns false if the page was not mapped,
+ * otherwise returns true.
*/
bool
vmm_gpt_unmap(vmm_gpt_t *gpt, uint64_t gpa)
{
uint64_t *entries[MAX_GPT_LEVEL], entry;
- bool was_mapped;
ASSERT(gpt != NULL);
vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
@@ -479,28 +482,27 @@ vmm_gpt_unmap(vmm_gpt_t *gpt, uint64_t gpa)
entry = *entries[LEVEL1];
*entries[LEVEL1] = 0;
- was_mapped = gpt->vgpt_pte_ops->vpeo_pte_is_present(entry);
- if (was_mapped)
- gpt->vgpt_mapped_page_count--;
-
- return (was_mapped);
+ return (gpt->vgpt_pte_ops->vpeo_pte_is_present(entry));
}
/*
- * Un-maps the region of guest physical address space bounded by
- * [start..end). Returns the number of pages that are unmapped.
+ * Un-maps the region of guest physical address space bounded by [start..end).
+ * Returns the number of pages that are unmapped.
*/
size_t
vmm_gpt_unmap_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
{
- size_t n = 0;
+ ASSERT0(start & PAGEOFFSET);
+ ASSERT0(end & PAGEOFFSET);
+ size_t num_unmapped = 0;
for (uint64_t page = start; page < end; page += PAGESIZE) {
- if (vmm_gpt_unmap(gpt, page) != 0)
- n++;
+ if (vmm_gpt_unmap(gpt, page) != 0) {
+ num_unmapped++;
+ }
}
- return (n);
+ return (num_unmapped);
}
/*
@@ -509,31 +511,23 @@ vmm_gpt_unmap_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
* bits of the entry. Otherwise, it will be ignored.
*/
bool
-vmm_gpt_is_mapped(vmm_gpt_t *gpt, uint64_t gpa, uint_t *protp)
+vmm_gpt_is_mapped(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t *pfnp, uint_t *protp)
{
- uint64_t *entries[MAX_GPT_LEVEL], entry;
+ uint64_t entry;
- vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
- if (entries[LEVEL1] == NULL)
+ if (ptep == NULL) {
return (false);
- entry = *entries[LEVEL1];
- if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry))
+ }
+ entry = *ptep;
+ if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) {
return (false);
+ }
+ *pfnp = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry);
*protp = gpt->vgpt_pte_ops->vpeo_pte_prot(entry);
-
return (true);
}
/*
- * Returns the number of pages that are mapped in by this GPT.
- */
-size_t
-vmm_gpt_mapped_count(vmm_gpt_t *gpt)
-{
- return (gpt->vgpt_mapped_page_count);
-}
-
-/*
 * Resets the accessed bit on the page table entry pointed to by `entry`.
* If `on` is true, the bit will be set, otherwise it will be cleared.
* The old value of the bit is returned.
@@ -556,3 +550,12 @@ vmm_gpt_reset_dirty(vmm_gpt_t *gpt, uint64_t *entry, bool on)
ASSERT(entry != NULL);
return (gpt->vgpt_pte_ops->vpeo_reset_dirty(entry, on));
}
+
+/*
+ * Get a properly formatted PML4 (EPTP/nCR3) for the GPT.
+ */
+uint64_t
+vmm_gpt_get_pmtp(vmm_gpt_t *gpt)
+{
+ return (gpt->vgpt_pte_ops->vpeo_get_pmtp(gpt->vgpt_root->vgn_host_pfn));
+}
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c b/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c
index 1dc2616599..d2a790ec03 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c
@@ -373,6 +373,27 @@ static const struct vie_op one_byte_opcodes[256] = {
#define GB (1024 * 1024 * 1024)
+
+/*
+ * Paging defines, previously pulled in from machine/pmap.h
+ */
+#define PG_V (1 << 0) /* Present */
+#define PG_RW (1 << 1) /* Read/Write */
+#define PG_U (1 << 2) /* User/Supervisor */
+#define PG_A (1 << 5) /* Accessed */
+#define PG_M (1 << 6) /* Dirty */
+#define PG_PS (1 << 7) /* Largepage */
+
+/*
+ * Paging exception defines, previously pulled in from machine/pmap.h
+ */
+#define PGEX_P (1 << 0) /* Non-present/Protection */
+#define PGEX_W (1 << 1) /* Read/Write */
+#define PGEX_U (1 << 2) /* User/Supervisor */
+#define PGEX_RSV (1 << 3) /* (Non-)Reserved */
+#define PGEX_I (1 << 4) /* Instruction */
+
+
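
The page-table-walk rework later in this file (_vm_gla2gpa()) keeps the classic long-mode index arithmetic: the shift for a level is PAGE_SHIFT + 9 * nlevels and each index is the next 9 bits of the guest-linear address. A runnable illustration with an arbitrary example address, included only to make the arithmetic concrete:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t gla = 0x00007f1234567000ULL;	/* arbitrary example GLA */

	for (int nlevels = 3; nlevels >= 0; nlevels--) {
		int ptpshift = 12 + nlevels * 9;	/* PAGE_SHIFT == 12 */
		int ptpindex = (int)((gla >> ptpshift) & 0x1FF);

		printf("level %d: shift %d, index %d\n",
		    nlevels, ptpshift, ptpindex);
	}
	return (0);
}
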
static enum vm_reg_name gpr_map[16] = {
VM_REG_GUEST_RAX,
VM_REG_GUEST_RCX,
@@ -2875,43 +2896,48 @@ pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
}
static void
-ptp_release(void **cookie)
+ptp_release(vm_page_t **vmp)
{
- if (*cookie != NULL) {
- vm_gpa_release(*cookie);
- *cookie = NULL;
+ if (*vmp != NULL) {
+ vmp_release(*vmp);
+ *vmp = NULL;
}
}
static void *
-ptp_hold(struct vm *vm, int vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
+ptp_hold(struct vm *vm, int vcpu, uintptr_t gpa, size_t len, vm_page_t **vmp)
{
- void *ptr;
+ vm_client_t *vmc = vm_get_vmclient(vm, vcpu);
+ const uintptr_t hold_gpa = gpa & PAGEMASK;
+
+ /* Hold must not cross a page boundary */
+ VERIFY3U(gpa + len, <=, hold_gpa + PAGESIZE);
- ptp_release(cookie);
- ptr = vm_gpa_hold(vm, vcpu, ptpphys, len, PROT_READ | PROT_WRITE,
- cookie);
+ if (*vmp != NULL) {
+ vmp_release(*vmp);
+ }
+
+ *vmp = vmc_hold(vmc, hold_gpa, PROT_READ | PROT_WRITE);
+ if (*vmp == NULL) {
+ return (NULL);
+ }
- return (ptr);
+ return ((caddr_t)vmp_get_writable(*vmp) + (gpa - hold_gpa));
}
static int
_vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only)
{
- int nlevels, pfcode, retval, usermode, writable;
+ int nlevels, pfcode;
int ptpshift = 0, ptpindex = 0;
uint64_t ptpphys;
uint64_t *ptpbase = NULL, pte = 0, pgsize = 0;
- uint32_t *ptpbase32, pte32;
- void *cookie;
+ vm_page_t *cookie = NULL;
+ const bool usermode = paging->cpl == 3;
+ const bool writable = (prot & PROT_WRITE) != 0;
*guest_fault = 0;
-
- usermode = (paging->cpl == 3 ? 1 : 0);
- writable = prot & PROT_WRITE;
- cookie = NULL;
- retval = 0;
restart:
ptpphys = paging->cr3; /* root of the page tables */
ptp_release(&cookie);
@@ -2923,15 +2949,18 @@ restart:
*/
if (!check_only)
vm_inject_gp(vm, vcpuid);
- goto fault;
+ *guest_fault = 1;
+ return (0);
}
if (paging->paging_mode == PAGING_MODE_FLAT) {
*gpa = gla;
- goto done;
+ return (0);
}
if (paging->paging_mode == PAGING_MODE_32) {
+ uint32_t *ptpbase32, pte32;
+
nlevels = 2;
while (--nlevels >= 0) {
/* Zero out the lower 12 bits. */
@@ -2940,8 +2969,9 @@ restart:
ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE,
&cookie);
- if (ptpbase32 == NULL)
- goto error;
+ if (ptpbase32 == NULL) {
+ return (EFAULT);
+ }
ptpshift = PAGE_SHIFT + nlevels * 10;
ptpindex = (gla >> ptpshift) & 0x3FF;
@@ -2957,7 +2987,10 @@ restart:
0, pte32);
vm_inject_pf(vm, vcpuid, pfcode, gla);
}
- goto fault;
+
+ ptp_release(&cookie);
+ *guest_fault = 1;
+ return (0);
}
/*
@@ -2992,7 +3025,8 @@ restart:
/* Zero out the lower 'ptpshift' bits */
pte32 >>= ptpshift; pte32 <<= ptpshift;
*gpa = pte32 | (gla & (pgsize - 1));
- goto done;
+ ptp_release(&cookie);
+ return (0);
}
if (paging->paging_mode == PAGING_MODE_PAE) {
@@ -3001,8 +3035,9 @@ restart:
ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof (*ptpbase) * 4,
&cookie);
- if (ptpbase == NULL)
- goto error;
+ if (ptpbase == NULL) {
+ return (EFAULT);
+ }
ptpindex = (gla >> 30) & 0x3;
@@ -3013,21 +3048,27 @@ restart:
pfcode = pf_error_code(usermode, prot, 0, pte);
vm_inject_pf(vm, vcpuid, pfcode, gla);
}
- goto fault;
+
+ ptp_release(&cookie);
+ *guest_fault = 1;
+ return (0);
}
ptpphys = pte;
nlevels = 2;
- } else
+ } else {
nlevels = 4;
+ }
+
while (--nlevels >= 0) {
/* Zero out the lower 12 bits and the upper 12 bits */
- ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
+ ptpphys &= 0x000ffffffffff000UL;
ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie);
- if (ptpbase == NULL)
- goto error;
+ if (ptpbase == NULL) {
+ return (EFAULT);
+ }
ptpshift = PAGE_SHIFT + nlevels * 9;
ptpindex = (gla >> ptpshift) & 0x1FF;
@@ -3042,7 +3083,10 @@ restart:
pfcode = pf_error_code(usermode, prot, 0, pte);
vm_inject_pf(vm, vcpuid, pfcode, gla);
}
- goto fault;
+
+ ptp_release(&cookie);
+ *guest_fault = 1;
+ return (0);
}
/* Set the accessed bit in the page table entry */
@@ -3060,7 +3104,10 @@ restart:
1, pte);
vm_inject_pf(vm, vcpuid, pfcode, gla);
}
- goto fault;
+
+ ptp_release(&cookie);
+ *guest_fault = 1;
+ return (0);
}
break;
}
@@ -3073,21 +3120,12 @@ restart:
if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
goto restart;
}
+ ptp_release(&cookie);
/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
*gpa = pte | (gla & (pgsize - 1));
-done:
- ptp_release(&cookie);
- KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d",
- __func__, retval));
- return (retval);
-error:
- retval = EFAULT;
- goto done;
-fault:
- *guest_fault = 1;
- goto done;
+ return (0);
}
int
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_lapic.c b/usr/src/uts/i86pc/io/vmm/vmm_lapic.c
index a5118c15af..e95f444051 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_lapic.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_lapic.c
@@ -46,7 +46,7 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/smp.h>
+#include <sys/cpuset.h>
#include <x86/specialreg.h>
#include <x86/apicreg.h>
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_mem.c b/usr/src/uts/i86pc/io/vmm/vmm_mem.c
deleted file mode 100644
index 4ffe5bf509..0000000000
--- a/usr/src/uts/i86pc/io/vmm/vmm_mem.c
+++ /dev/null
@@ -1,113 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
- *
- * Copyright (c) 2011 NetApp, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/malloc.h>
-#include <sys/mman.h>
-#include <sys/sglist.h>
-#include <sys/lock.h>
-#include <sys/rwlock.h>
-
-#include <machine/md_var.h>
-#include <machine/vm.h>
-#include <sys/vmm_vm.h>
-
-#include "vmm_mem.h"
-
-int
-vmm_mem_init(void)
-{
-
- return (0);
-}
-
-vm_object_t
-vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len,
- vm_paddr_t hpa)
-{
- int error;
- vm_object_t obj;
- struct sglist *sg;
-
- sg = sglist_alloc(1, M_WAITOK);
- error = sglist_append_phys(sg, hpa, len);
- KASSERT(error == 0, ("error %d appending physaddr to sglist", error));
-
- const int prot = PROT_READ | PROT_WRITE;
- obj = vm_pager_allocate(OBJT_SG, sg, len, prot, 0, NULL);
- if (obj != NULL) {
- /*
- * VT-x ignores the MTRR settings when figuring out the
- * memory type for translations obtained through EPT.
- *
- * Therefore we explicitly force the pages provided by
- * this object to be mapped as uncacheable.
- */
- VM_OBJECT_WLOCK(obj);
- error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE);
- VM_OBJECT_WUNLOCK(obj);
- if (error != 0) {
- panic("vmm_mmio_alloc: vm_object_set_memattr error %d",
- error);
- }
- error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0,
- VMFS_NO_SPACE, prot, prot, 0);
- if (error != 0) {
- vm_object_deallocate(obj);
- obj = NULL;
- }
- }
-
- /*
- * Drop the reference on the sglist.
- *
- * If the scatter/gather object was successfully allocated then it
- * has incremented the reference count on the sglist. Dropping the
- * initial reference count ensures that the sglist will be freed
- * when the object is deallocated.
- *
- * If the object could not be allocated then we end up freeing the
- * sglist.
- */
- sglist_free(sg);
-
- return (obj);
-}
-
-vm_paddr_t
-vmm_mem_maxaddr(void)
-{
-
- return (ptoa(Maxmem));
-}
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_mem.h b/usr/src/uts/i86pc/io/vmm/vmm_mem.h
deleted file mode 100644
index b27501eef2..0000000000
--- a/usr/src/uts/i86pc/io/vmm/vmm_mem.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
- *
- * Copyright (c) 2011 NetApp, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-/*
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * Copyright 2013 Pluribus Networks Inc.
- */
-
-#ifndef _VMM_MEM_H_
-#define _VMM_MEM_H_
-
-struct vmspace;
-struct vm_object;
-
-int vmm_mem_init(void);
-struct vm_object *vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len,
- vm_paddr_t hpa);
-vm_paddr_t vmm_mem_maxaddr(void);
-
-#endif
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
index 92d1494e04..823097b285 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
@@ -100,6 +100,7 @@ struct vmm_hold {
struct vmm_lease {
list_node_t vml_node;
struct vm *vml_vm;
+ vm_client_t *vml_vmclient;
boolean_t vml_expired;
boolean_t vml_break_deferred;
boolean_t (*vml_expire_func)(void *);
@@ -444,7 +445,6 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
lock_type = LOCK_WRITE_HOLD;
break;
- case VM_GET_GPA_PMAP:
case VM_GET_MEMSEG:
case VM_MMAP_GETNEXT:
case VM_LAPIC_IRQ:
@@ -465,6 +465,7 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
lock_type = LOCK_READ_HOLD;
break;
+ case VM_GET_GPA_PMAP:
case VM_IOAPIC_PINCOUNT:
case VM_SUSPEND:
default:
@@ -1127,18 +1128,11 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
break;
}
case VM_GET_GPA_PMAP: {
- struct vm_gpa_pte gpapte;
-
- if (ddi_copyin(datap, &gpapte, sizeof (gpapte), md)) {
- error = EFAULT;
- break;
- }
-#ifdef __FreeBSD__
- /* XXXJOY: add function? */
- pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vmm_vm)),
- gpapte.gpa, gpapte.pte, &gpapte.ptenum);
-#endif
- error = 0;
+ /*
+ * Until there is a necessity to leak EPT/RVI PTE values to
+ * userspace, this will remain unimplemented
+ */
+ error = EINVAL;
break;
}
case VM_GET_HPET_CAPABILITIES: {
@@ -1690,6 +1684,7 @@ vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
lease->vml_hold = hold;
/* cache the VM pointer for one less pointer chase */
lease->vml_vm = sc->vmm_vm;
+ lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm));
mutex_enter(&sc->vmm_lease_lock);
while (sc->vmm_lease_blocker != 0) {
@@ -1709,6 +1704,7 @@ vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
list_remove(&sc->vmm_lease_list, lease);
vmm_read_unlock(sc);
+ vmc_destroy(lease->vml_vmclient);
kmem_free(lease, sizeof (*lease));
}
@@ -1841,9 +1837,30 @@ vmm_drv_lease_expired(vmm_lease_t *lease)
void *
vmm_drv_gpa2kva(vmm_lease_t *lease, uintptr_t gpa, size_t sz)
{
+ vm_page_t *vmp;
+ void *res = NULL;
+
ASSERT(lease != NULL);
+ ASSERT3U(sz, ==, PAGESIZE);
+ ASSERT0(gpa & PAGEOFFSET);
+
+ vmp = vmc_hold(lease->vml_vmclient, gpa, PROT_READ | PROT_WRITE);
+ /*
+ * Break the rules for now and just extract the pointer. This is
+ * nominally safe, since holding a driver lease on the VM read-locks it.
+ *
+ * A pointer which would otherwise be at risk of being a use-after-free
+ * vector is made safe since actions such as vmspace_unmap() require
+	 * acquisition of the VM write-lock (causing all driver leases to be
+	 * broken), allowing the consumers to cease their access prior to
+ * modification of the vmspace.
+ */
+ if (vmp != NULL) {
+ res = vmp_get_writable(vmp);
+ vmp_release(vmp);
+ }
- return (vmspace_find_kva(vm_get_vmspace(lease->vml_vm), gpa, sz));
+ return (res);
}
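
A rough usage sketch of the translation above (the consumer function and its buffer handling are hypothetical; only vmm_drv_gpa2kva, the lease argument, and the page-alignment requirement come from this change). A vmm_drv consumer holding a signed lease translates a page-aligned GPA and may use the returned KVA only while the lease remains unbroken:

	/*
	 * Hypothetical vmm_drv consumer: translate a page-aligned GPA under a
	 * signed lease and copy the page out.  The returned KVA is valid only
	 * while the lease remains unbroken (see the comment above).
	 */
	static int
	consumer_copy_guest_page(vmm_lease_t *lease, uintptr_t gpa, void *buf)
	{
		void *kva;

		ASSERT0(gpa & PAGEOFFSET);

		kva = vmm_drv_gpa2kva(lease, gpa, PAGESIZE);
		if (kva == NULL)
			return (EFAULT);

		bcopy(kva, buf, PAGESIZE);
		return (0);
	}
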
int
@@ -2191,6 +2208,14 @@ vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
minor_t minor;
vmm_softc_t *sc;
+ /*
+ * Forbid running bhyve in a 32-bit process until it has been tested and
+ * verified to be safe.
+ */
+ if (curproc->p_model != DATAMODEL_LP64) {
+ return (EFBIG);
+ }
+
minor = getminor(*devp);
if (minor == VMM_CTL_MINOR) {
/*
@@ -2330,6 +2355,14 @@ vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
vmm_softc_t *sc;
minor_t minor;
+ /*
+ * Forbid running bhyve in a 32-bit process until it has been tested and
+ * verified to be safe.
+ */
+ if (curproc->p_model != DATAMODEL_LP64) {
+ return (EFBIG);
+ }
+
/* The structs in bhyve ioctls assume a 64-bit datamodel */
if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
return (ENOTSUP);
@@ -2356,10 +2389,7 @@ vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
{
vmm_softc_t *sc;
const minor_t minor = getminor(dev);
- struct vm *vm;
int err;
- vm_object_t vmo = NULL;
- struct vmspace *vms;
if (minor == VMM_CTL_MINOR) {
return (ENODEV);
@@ -2380,31 +2410,23 @@ vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
/* Grab read lock on the VM to prevent any changes to the memory map */
vmm_read_lock(sc);
- vm = sc->vmm_vm;
- vms = vm_get_vmspace(vm);
if (off >= VM_DEVMEM_START) {
int segid;
- off_t map_off = 0;
+ off_t segoff;
/* Mapping a devmem "device" */
- if (!vmmdev_devmem_segid(sc, off, len, &segid, &map_off)) {
+ if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) {
err = ENODEV;
- goto out;
- }
- err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
- if (err != 0) {
- goto out;
+ } else {
+ err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as,
+ addrp, prot, maxprot, flags);
}
- err = vm_segmap_obj(vmo, map_off, len, as, addrp, prot, maxprot,
- flags);
} else {
/* Mapping a part of the guest physical space */
- err = vm_segmap_space(vms, off, as, addrp, len, prot, maxprot,
- flags);
+ err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot,
+ maxprot, flags);
}
-
-out:
vmm_read_unlock(sc);
return (err);
}
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c
index 3d357f37d2..fde4a030ce 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c
@@ -21,17 +21,12 @@
#include <sys/kmem.h>
#include <sys/machsystm.h>
#include <sys/mman.h>
+#include <sys/x86_archext.h>
+#include <vm/hat_pte.h>
#include <sys/vmm_gpt.h>
#include <sys/vmm_vm.h>
-
-typedef struct ept_map ept_map_t;
-struct ept_map {
- vmm_gpt_t *em_gpt;
- kmutex_t em_lock;
-};
-
#define EPT_R (1 << 0)
#define EPT_W (1 << 1)
#define EPT_X (1 << 2)
@@ -42,6 +37,9 @@ struct ept_map {
#define EPT_PA_MASK (0x000ffffffffff000ull)
+#define EPT_MAX_LEVELS 4
+CTASSERT(EPT_MAX_LEVELS <= MAX_GPT_LEVEL);
+
CTASSERT(EPT_R == PROT_READ);
CTASSERT(EPT_W == PROT_WRITE);
CTASSERT(EPT_X == PROT_EXEC);
@@ -121,7 +119,15 @@ ept_reset_accessed(uint64_t *entry, bool on)
on ? EPT_ACCESSED : 0));
}
-static vmm_pte_ops_t ept_pte_ops = {
+static uint64_t
+ept_get_pmtp(pfn_t root_pfn)
+{
+ /* TODO: enable AD tracking when required */
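+	/*
+	 * Assumed EPTP layout (per the Intel SDM): bits 2:0 hold the
+	 * paging-structure memory type, bits 5:3 the page-walk length
+	 * minus one, and bits 51:12 the PFN of the root table; bit 6
+	 * (accessed/dirty tracking) is left clear for now.
+	 */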
+ return ((root_pfn << PAGESHIFT |
+ (EPT_MAX_LEVELS - 1) << 3 | MTRR_TYPE_WB));
+}
+
+vmm_pte_ops_t ept_pte_ops = {
.vpeo_map_table = ept_map_table,
.vpeo_map_page = ept_map_page,
.vpeo_pte_pfn = ept_pte_pfn,
@@ -129,100 +135,5 @@ static vmm_pte_ops_t ept_pte_ops = {
.vpeo_pte_prot = ept_pte_prot,
.vpeo_reset_dirty = ept_reset_dirty,
.vpeo_reset_accessed = ept_reset_accessed,
-};
-
-vmm_gpt_t *
-ept_create(void)
-{
- return (vmm_gpt_alloc(&ept_pte_ops));
-}
-
-static void *
-ept_ops_create(uintptr_t *root_kaddr)
-{
- ept_map_t *map;
-
- map = kmem_zalloc(sizeof (*map), KM_SLEEP);
- mutex_init(&map->em_lock, NULL, MUTEX_DEFAULT, NULL);
- map->em_gpt = ept_create();
- *root_kaddr = (uintptr_t)vmm_gpt_root_kaddr(map->em_gpt);
-
- return (map);
-}
-
-static void
-ept_ops_destroy(void *arg)
-{
- ept_map_t *map = arg;
-
- if (map != NULL) {
- vmm_gpt_free(map->em_gpt);
- mutex_destroy(&map->em_lock);
- kmem_free(map, sizeof (*map));
- }
-}
-
-static uint64_t
-ept_ops_wired_count(void *arg)
-{
- ept_map_t *map = arg;
- uint64_t res;
-
- mutex_enter(&map->em_lock);
- res = vmm_gpt_mapped_count(map->em_gpt);
- mutex_exit(&map->em_lock);
-
- return (res);
-}
-
-static int
-ept_ops_is_wired(void *arg, uint64_t gpa, uint_t *protp)
-{
- ept_map_t *map = arg;
- bool mapped;
-
- mutex_enter(&map->em_lock);
- mapped = vmm_gpt_is_mapped(map->em_gpt, gpa, protp);
- mutex_exit(&map->em_lock);
-
- return (mapped ? 0 : -1);
-}
-
-static int
-ept_ops_map(void *arg, uint64_t gpa, pfn_t pfn, uint_t _lvl, uint_t prot,
- uint8_t attr)
-{
- ept_map_t *map = arg;
-
- ASSERT((prot & EPT_RWX) != 0 && (prot & ~EPT_RWX) == 0);
-
- mutex_enter(&map->em_lock);
- vmm_gpt_populate_entry(map->em_gpt, gpa);
- (void) vmm_gpt_map(map->em_gpt, gpa, pfn, prot, attr);
- mutex_exit(&map->em_lock);
-
- return (0);
-}
-
-static uint64_t
-ept_ops_unmap(void *arg, uint64_t start, uint64_t end)
-{
- ept_map_t *map = arg;
- size_t unmapped = 0;
-
- mutex_enter(&map->em_lock);
- unmapped = vmm_gpt_unmap_region(map->em_gpt, start, end);
- vmm_gpt_vacate_region(map->em_gpt, start, end);
- mutex_exit(&map->em_lock);
-
- return ((uint64_t)unmapped);
-}
-
-struct vmm_pt_ops ept_ops = {
- .vpo_init = ept_ops_create,
- .vpo_free = ept_ops_destroy,
- .vpo_wired_cnt = ept_ops_wired_count,
- .vpo_is_wired = ept_ops_is_wired,
- .vpo_map = ept_ops_map,
- .vpo_unmap = ept_ops_unmap,
+ .vpeo_get_pmtp = ept_get_pmtp,
};
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c
index afd686f197..f78db731d6 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c
@@ -60,7 +60,6 @@
#include <machine/cpufunc.h>
#include <machine/fpu.h>
#include <machine/md_var.h>
-#include <machine/pmap.h>
#include <machine/specialreg.h>
#include <machine/vmm.h>
#include <machine/vmparam.h>
@@ -95,7 +94,7 @@ uint8_t const bin2bcd_data[] = {
};
void
-pmap_invalidate_cache(void)
+invalidate_cache_all(void)
{
cpuset_t cpuset;
@@ -108,7 +107,7 @@ pmap_invalidate_cache(void)
}
vm_paddr_t
-pmap_kextract(vm_offset_t va)
+vtophys(void *va)
{
pfn_t pfn;
@@ -411,18 +410,6 @@ vmm_glue_callout_localize(struct callout *c)
mutex_exit(&cpu_lock);
}
-void
-ipi_cpu(int cpu, uint_t ipi)
-{
- /*
- * This was previously implemented as an invocation of asynchronous
- * no-op crosscalls to interrupt the target CPU. Since even nowait
- * crosscalls can block in certain circumstances, a direct poke_cpu()
- * is safer when called from delicate contexts.
- */
- poke_cpu(cpu);
-}
-
uint_t cpu_high; /* Highest arg to CPUID */
uint_t cpu_exthigh; /* Highest arg to extended CPUID */
uint_t cpu_id; /* Stepping ID */
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c
index c66a4e7962..8b45782d25 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c
@@ -28,12 +28,6 @@
#include <sys/vmm_gpt.h>
#include <sys/vmm_vm.h>
-typedef struct rvi_map rvi_map_t;
-struct rvi_map {
- vmm_gpt_t *rm_gpt;
- kmutex_t rm_lock;
-};
-
static inline uint64_t
rvi_prot(uint_t prot)
{
@@ -145,7 +139,13 @@ rvi_reset_accessed(uint64_t *entry, bool on)
return (rvi_reset_bits(entry, (PT_MOD | PT_REF), on ? PT_REF : 0));
}
-static vmm_pte_ops_t rvi_pte_ops = {
+static uint64_t
+rvi_get_pmtp(pfn_t root_pfn)
+{
+ return (root_pfn << PAGESHIFT);
+}
+
+vmm_pte_ops_t rvi_pte_ops = {
.vpeo_map_table = rvi_map_table,
.vpeo_map_page = rvi_map_page,
.vpeo_pte_pfn = rvi_pte_pfn,
@@ -153,101 +153,5 @@ static vmm_pte_ops_t rvi_pte_ops = {
.vpeo_pte_prot = rvi_pte_prot,
.vpeo_reset_dirty = rvi_reset_dirty,
.vpeo_reset_accessed = rvi_reset_accessed,
-};
-
-vmm_gpt_t *
-rvi_create(void)
-{
- return (vmm_gpt_alloc(&rvi_pte_ops));
-}
-
-static void *
-rvi_ops_create(uintptr_t *root_kaddr)
-{
- rvi_map_t *map;
-
- map = kmem_zalloc(sizeof (*map), KM_SLEEP);
- mutex_init(&map->rm_lock, NULL, MUTEX_DEFAULT, NULL);
- map->rm_gpt = rvi_create();
- *root_kaddr = (uintptr_t)vmm_gpt_root_kaddr(map->rm_gpt);
-
- return (map);
-}
-
-static void
-rvi_ops_destroy(void *arg)
-{
- rvi_map_t *map = arg;
-
- if (map != NULL) {
- vmm_gpt_free(map->rm_gpt);
- mutex_destroy(&map->rm_lock);
- kmem_free(map, sizeof (*map));
- }
-}
-
-static uint64_t
-rvi_ops_wired_count(void *arg)
-{
- rvi_map_t *map = arg;
- uint64_t res;
-
- mutex_enter(&map->rm_lock);
- res = vmm_gpt_mapped_count(map->rm_gpt);
- mutex_exit(&map->rm_lock);
-
- return (res);
-}
-
-static int
-rvi_ops_is_wired(void *arg, uint64_t gpa, uint_t *protp)
-{
- rvi_map_t *map = arg;
- bool mapped;
-
- mutex_enter(&map->rm_lock);
- mapped = vmm_gpt_is_mapped(map->rm_gpt, gpa, protp);
- mutex_exit(&map->rm_lock);
-
- return (mapped ? 0 : -1);
-}
-
-static int
-rvi_ops_map(void *arg, uint64_t gpa, pfn_t pfn, uint_t _lvl, uint_t prot,
- uint8_t attr)
-{
- rvi_map_t *map = arg;
-
- ASSERT((prot & PROT_READ) != 0);
- ASSERT3U((prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)), ==, 0);
-
- mutex_enter(&map->rm_lock);
- vmm_gpt_populate_entry(map->rm_gpt, gpa);
- (void) vmm_gpt_map(map->rm_gpt, gpa, pfn, prot, attr);
- mutex_exit(&map->rm_lock);
-
- return (0);
-}
-
-static uint64_t
-rvi_ops_unmap(void *arg, uint64_t start, uint64_t end)
-{
- rvi_map_t *map = arg;
- size_t unmapped = 0;
-
- mutex_enter(&map->rm_lock);
- unmapped = vmm_gpt_unmap_region(map->rm_gpt, start, end);
- vmm_gpt_vacate_region(map->rm_gpt, start, end);
- mutex_exit(&map->rm_lock);
-
- return ((uint64_t)unmapped);
-}
-
-struct vmm_pt_ops rvi_ops = {
- .vpo_init = rvi_ops_create,
- .vpo_free = rvi_ops_destroy,
- .vpo_wired_cnt = rvi_ops_wired_count,
- .vpo_is_wired = rvi_ops_is_wired,
- .vpo_map = rvi_ops_map,
- .vpo_unmap = rvi_ops_unmap,
+ .vpeo_get_pmtp = rvi_get_pmtp,
};
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c
deleted file mode 100644
index bd1f1890d4..0000000000
--- a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c
+++ /dev/null
@@ -1,932 +0,0 @@
-/*
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- */
-/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
-
-/*
- * Copyright 2019 Joyent, Inc.
- * Copyright 2021 Oxide Computer Company
- * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
- */
-
-#include <sys/param.h>
-#include <sys/kmem.h>
-#include <sys/thread.h>
-#include <sys/list.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/ddi.h>
-#include <sys/sysmacros.h>
-#include <sys/machsystm.h>
-#include <sys/vmsystm.h>
-#include <sys/malloc.h>
-#include <sys/x86_archext.h>
-#include <vm/as.h>
-#include <vm/hat_i86.h>
-#include <vm/seg_vn.h>
-#include <vm/seg_kmem.h>
-
-#include <machine/vm.h>
-#include <sys/vmm_gpt.h>
-#include <sys/vmm_vm.h>
-#include <sys/seg_vmm.h>
-#include <sys/vmm_reservoir.h>
-
-#define PMAP_TO_VMMAP(pm) ((vm_map_t) \
- ((caddr_t)(pm) - offsetof(struct vmspace, vms_pmap)))
-#define VMMAP_TO_VMSPACE(vmmap) ((struct vmspace *) \
- ((caddr_t)(vmmap) - offsetof(struct vmspace, vm_map)))
-
-
-struct vmspace_mapping {
- list_node_t vmsm_node;
- vm_object_t vmsm_object;
- uintptr_t vmsm_addr;
- size_t vmsm_len;
- off_t vmsm_offset;
- uint_t vmsm_prot;
-};
-typedef struct vmspace_mapping vmspace_mapping_t;
-
-#define VMSM_OFFSET(vmsm, addr) ( \
- (vmsm)->vmsm_offset + \
- ((addr) - (uintptr_t)(vmsm)->vmsm_addr))
-
-
-/* Private glue interfaces */
-static void pmap_free(pmap_t);
-static vmspace_mapping_t *vm_mapping_find(struct vmspace *, uintptr_t, size_t,
- boolean_t);
-static void vm_mapping_remove(struct vmspace *, vmspace_mapping_t *);
-
-struct vmspace *
-vmspace_alloc(vm_offset_t start, vm_offset_t end, pmap_pinit_t pinit)
-{
- struct vmspace *vms;
- const uintptr_t size = end + 1;
-
- /*
- * This whole mess is built on the assumption that a 64-bit address
- * space is available to work with for the various pagetable tricks.
- */
- VERIFY(ttoproc(curthread)->p_model == DATAMODEL_LP64);
- VERIFY(start == 0 && size > 0 && (size & PAGEOFFSET) == 0 &&
- size <= (uintptr_t)USERLIMIT);
-
- vms = kmem_zalloc(sizeof (*vms), KM_SLEEP);
- vms->vms_size = size;
- list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t),
- offsetof(vmspace_mapping_t, vmsm_node));
-
- if (pinit(&vms->vms_pmap) == 0) {
- kmem_free(vms, sizeof (*vms));
- return (NULL);
- }
-
- return (vms);
-}
-
-void
-vmspace_free(struct vmspace *vms)
-{
- VERIFY(list_is_empty(&vms->vms_maplist));
-
- pmap_free(&vms->vms_pmap);
- kmem_free(vms, sizeof (*vms));
-}
-
-pmap_t
-vmspace_pmap(struct vmspace *vms)
-{
- return (&vms->vms_pmap);
-}
-
-long
-vmspace_resident_count(struct vmspace *vms)
-{
- /* XXXJOY: finish */
- return (0);
-}
-
-void *
-vmspace_find_kva(struct vmspace *vms, uintptr_t addr, size_t size)
-{
- vmspace_mapping_t *vmsm;
- void *result = NULL;
-
- /*
- * Since vmspace_find_kva is provided so that vmm_drv consumers can do
- * GPA2KVA translations, it is expected to be called when there is a
- * read lock preventing vmspace alterations. As such, it can do the
- * lockless vm_mapping_find() lookup.
- */
- vmsm = vm_mapping_find(vms, addr, size, B_TRUE);
- if (vmsm != NULL) {
- struct vm_object *vmo = vmsm->vmsm_object;
-
- switch (vmo->vmo_type) {
- case OBJT_DEFAULT:
- result = vmmr_region_mem_at(
- (vmmr_region_t *)vmo->vmo_data,
- VMSM_OFFSET(vmsm, addr) & PAGEMASK);
- break;
- default:
- break;
- }
- }
-
- return (result);
-}
-
-static int
-vmspace_pmap_iswired(struct vmspace *vms, uintptr_t addr, uint_t *prot)
-{
- pmap_t pmap = &vms->vms_pmap;
- int rv;
-
- ASSERT(MUTEX_HELD(&vms->vms_lock));
-
- rv = pmap->pm_ops->vpo_is_wired(pmap->pm_impl, addr, prot);
- return (rv);
-}
-
-static void
-pmap_free(pmap_t pmap)
-{
- void *pmi = pmap->pm_impl;
- struct vmm_pt_ops *ops = pmap->pm_ops;
-
- pmap->pm_pml4 = NULL;
- pmap->pm_impl = NULL;
- pmap->pm_ops = NULL;
-
- ops->vpo_free(pmi);
-}
-
-int
-pmap_pinit_type(pmap_t pmap, enum pmap_type type, int flags)
-{
- /* For use in vmm only */
- pmap->pm_type = type;
- switch (type) {
- case PT_EPT: {
- struct vmm_pt_ops *ops = &ept_ops;
- void *pml4, *pmi;
-
- pmi = ops->vpo_init((uintptr_t *)&pml4);
-
- pmap->pm_ops = ops;
- pmap->pm_impl = pmi;
- pmap->pm_pml4 = pml4;
- return (1);
- }
- case PT_RVI: {
- struct vmm_pt_ops *ops = &rvi_ops;
- void *pml4, *pmi;
-
- pmi = ops->vpo_init((uintptr_t *)&pml4);
-
- pmap->pm_ops = ops;
- pmap->pm_impl = pmi;
- pmap->pm_pml4 = pml4;
- return (1);
- }
- default:
- panic("unsupported pmap type: %x", type);
- break;
- }
-
- return (1);
-}
-
-long
-pmap_wired_count(pmap_t pmap)
-{
- long val;
-
- val = pmap->pm_ops->vpo_wired_cnt(pmap->pm_impl);
- VERIFY3S(val, >=, 0);
-
- return (val);
-}
-
-int
-pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
-{
- /* Allow the fallback to vm_fault to handle this */
- return (-1);
-}
-
-
-
-struct sglist_ent {
- vm_paddr_t sge_pa;
- size_t sge_len;
-};
-struct sglist {
- kmutex_t sg_lock;
- uint_t sg_refcnt;
- uint_t sg_len;
- uint_t sg_next;
- struct sglist_ent sg_entries[];
-};
-
-#define SG_SIZE(cnt) (sizeof (struct sglist) + \
- (sizeof (struct sglist_ent) * (cnt)))
-
-struct sglist *
-sglist_alloc(int nseg, int flags)
-{
- const size_t sz = SG_SIZE(nseg);
- const int flag = (flags & M_WAITOK) ? KM_SLEEP : KM_NOSLEEP;
- struct sglist *sg;
-
- ASSERT(nseg > 0);
-
- sg = kmem_zalloc(sz, flag);
- if (sg != NULL) {
- sg->sg_len = nseg;
- sg->sg_refcnt = 1;
- }
- return (sg);
-}
-
-void
-sglist_free(struct sglist *sg)
-{
- size_t sz;
-
- mutex_enter(&sg->sg_lock);
- if (sg->sg_refcnt > 1) {
- sg->sg_refcnt--;
- mutex_exit(&sg->sg_lock);
- return;
- }
-
- VERIFY(sg->sg_refcnt == 1);
- sg->sg_refcnt = 0;
- sz = SG_SIZE(sg->sg_len);
- mutex_exit(&sg->sg_lock);
- kmem_free(sg, sz);
-}
-
-int
-sglist_append_phys(struct sglist *sg, vm_paddr_t pa, size_t len)
-{
- uint_t idx;
- struct sglist_ent *ent;
-
- /* Restrict to page-aligned entries */
- if ((pa & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0 || len == 0) {
- return (EINVAL);
- }
-
- mutex_enter(&sg->sg_lock);
- idx = sg->sg_next;
- if (idx >= sg->sg_len) {
- mutex_exit(&sg->sg_lock);
- return (ENOSPC);
- }
-
- ent = &sg->sg_entries[idx];
- ASSERT(ent->sge_pa == 0 && ent->sge_len == 0);
- ent->sge_pa = pa;
- ent->sge_len = len;
- sg->sg_next++;
-
- mutex_exit(&sg->sg_lock);
- return (0);
-}
-
-
-static pfn_t
-vm_object_pager_none(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl)
-{
- panic("bad vm_object pager");
- return (PFN_INVALID);
-}
-
-static pfn_t
-vm_object_pager_reservoir(vm_object_t vmo, uintptr_t off, pfn_t *lpfn,
- uint_t *lvl)
-{
- vmmr_region_t *region;
- pfn_t pfn;
-
- ASSERT(vmo->vmo_type == OBJT_DEFAULT);
-
- region = vmo->vmo_data;
- pfn = vmmr_region_pfn_at(region, off & PAGEMASK);
-
- /* TODO: handle large pages */
- if (lpfn != NULL) {
- *lpfn = pfn;
- }
- if (lvl != NULL) {
- *lvl = 0;
- }
- return (pfn);
-}
-
-static pfn_t
-vm_object_pager_sg(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl)
-{
- const uintptr_t aoff = ALIGN2PAGE(off);
- uint_t level = 0;
- uintptr_t pos = 0;
- struct sglist *sg;
- struct sglist_ent *ent;
- pfn_t pfn = PFN_INVALID;
-
- ASSERT(vmo->vmo_type == OBJT_SG);
- ASSERT(off < vmo->vmo_size);
-
- sg = vmo->vmo_data;
- if (sg == NULL) {
- return (PFN_INVALID);
- }
-
- ent = &sg->sg_entries[0];
- for (uint_t i = 0; i < sg->sg_next; i++, ent++) {
- if (aoff >= pos && aoff < (pos + ent->sge_len)) {
- /* XXXJOY: Punt on large pages for now */
- level = 0;
- pfn = mmu_btop(ent->sge_pa + (aoff - pos));
- break;
- }
- pos += ent->sge_len;
- }
-
- if (lpfn != 0) {
- *lpfn = pfn;
- }
- if (lvl != 0) {
- *lvl = level;
- }
- return (pfn);
-}
-
-vm_object_t
-vm_object_allocate(objtype_t type, vm_pindex_t psize, bool transient)
-{
- vm_object_t vmo;
- const size_t size = ptob((size_t)psize);
-
- vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);
- mutex_init(&vmo->vmo_lock, NULL, MUTEX_DEFAULT, NULL);
-
- /* For now, these are to stay fixed after allocation */
- vmo->vmo_type = type;
- vmo->vmo_size = size;
- vmo->vmo_attr = VM_MEMATTR_DEFAULT;
-
- switch (type) {
- case OBJT_DEFAULT: {
-
- /* TODO: opt-in to larger pages? */
- int err;
- vmmr_region_t *region = NULL;
-
- err = vmmr_alloc(size, transient, &region);
- if (err != 0) {
- mutex_destroy(&vmo->vmo_lock);
- kmem_free(vmo, sizeof (*vmo));
- return (NULL);
- }
- vmo->vmo_data = region;
- vmo->vmo_pager = vm_object_pager_reservoir;
- }
- break;
- case OBJT_SG:
- vmo->vmo_data = NULL;
- vmo->vmo_pager = vm_object_pager_sg;
- break;
- default:
- panic("Unsupported vm_object type");
- break;
- }
-
- vmo->vmo_refcnt = 1;
- return (vmo);
-}
-
-vm_object_t
-vm_pager_allocate(objtype_t type, void *handle, vm_ooffset_t size,
- vm_prot_t prot, vm_ooffset_t off, void *cred)
-{
- struct vm_object *vmo;
- struct sglist *sg = (struct sglist *)handle;
-
- /* XXXJOY: be very restrictive for now */
- VERIFY(type == OBJT_SG);
- VERIFY(off == 0);
-
- vmo = vm_object_allocate(type, size, false);
- vmo->vmo_data = sg;
-
- mutex_enter(&sg->sg_lock);
- VERIFY(sg->sg_refcnt++ >= 1);
- mutex_exit(&sg->sg_lock);
-
- return (vmo);
-}
-
-void
-vm_object_deallocate(vm_object_t vmo)
-{
- ASSERT(vmo != NULL);
-
- uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt);
- /* underflow would be a deadly serious mistake */
- VERIFY3U(ref, !=, UINT_MAX);
- if (ref != 0) {
- return;
- }
-
- switch (vmo->vmo_type) {
- case OBJT_DEFAULT:
- vmmr_free((vmmr_region_t *)vmo->vmo_data);
- break;
- case OBJT_SG:
- sglist_free((struct sglist *)vmo->vmo_data);
- break;
- default:
- panic("Unsupported vm_object type");
- break;
- }
-
- vmo->vmo_pager = vm_object_pager_none;
- vmo->vmo_data = NULL;
- vmo->vmo_size = 0;
- mutex_destroy(&vmo->vmo_lock);
- kmem_free(vmo, sizeof (*vmo));
-}
-
-CTASSERT(VM_MEMATTR_UNCACHEABLE == MTRR_TYPE_UC);
-CTASSERT(VM_MEMATTR_WRITE_BACK == MTRR_TYPE_WB);
-int
-vm_object_set_memattr(vm_object_t vmo, vm_memattr_t attr)
-{
- ASSERT(MUTEX_HELD(&vmo->vmo_lock));
-
- switch (attr) {
- case VM_MEMATTR_UNCACHEABLE:
- case VM_MEMATTR_WRITE_BACK:
- vmo->vmo_attr = attr;
- return (0);
- default:
- break;
- }
- return (EINVAL);
-}
-
-void
-vm_object_reference(vm_object_t vmo)
-{
- ASSERT(vmo != NULL);
-
- uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt);
- /* overflow would be a deadly serious mistake */
- VERIFY3U(ref, !=, 0);
-}
-
-pfn_t
-vm_object_pfn(vm_object_t vmo, uintptr_t off)
-{
- /* This is expected to be used only on reservoir-backed memory */
- if (vmo->vmo_type != OBJT_DEFAULT) {
- return (PFN_INVALID);
- }
-
- return (vmo->vmo_pager(vmo, off, NULL, NULL));
-}
-
-static vmspace_mapping_t *
-vm_mapping_find(struct vmspace *vms, uintptr_t addr, size_t size,
- boolean_t no_lock)
-{
- vmspace_mapping_t *vmsm;
- list_t *ml = &vms->vms_maplist;
- const uintptr_t range_end = addr + size;
-
- ASSERT(addr <= range_end);
-
- if (no_lock) {
- /*
- * This check should be superflous with the protections
- * promised by the bhyve logic which calls into the VM shim.
- * All the same, it is cheap to be paranoid.
- */
- VERIFY(!vms->vms_map_changing);
- } else {
- VERIFY(MUTEX_HELD(&vms->vms_lock));
- }
-
- if (addr >= vms->vms_size) {
- return (NULL);
- }
- for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
- const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len;
-
- if (addr >= vmsm->vmsm_addr && addr < seg_end) {
- if (range_end <= seg_end) {
- return (vmsm);
- } else {
- return (NULL);
- }
- }
- }
- return (NULL);
-}
-
-static boolean_t
-vm_mapping_gap(struct vmspace *vms, uintptr_t addr, size_t size)
-{
- vmspace_mapping_t *vmsm;
- list_t *ml = &vms->vms_maplist;
- const uintptr_t range_end = addr + size - 1;
-
- ASSERT(MUTEX_HELD(&vms->vms_lock));
- ASSERT(size > 0);
-
- for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
- const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len - 1;
-
- /*
- * The two ranges do not overlap if the start of either of
- * them is after the end of the other.
- */
- if (vmsm->vmsm_addr > range_end || addr > seg_end)
- continue;
- return (B_FALSE);
- }
- return (B_TRUE);
-}
-
-static void
-vm_mapping_remove(struct vmspace *vms, vmspace_mapping_t *vmsm)
-{
- list_t *ml = &vms->vms_maplist;
-
- ASSERT(MUTEX_HELD(&vms->vms_lock));
- ASSERT(vms->vms_map_changing);
-
- list_remove(ml, vmsm);
- vm_object_deallocate(vmsm->vmsm_object);
- kmem_free(vmsm, sizeof (*vmsm));
-}
-
-int
-vm_fault(vm_map_t map, vm_offset_t off, vm_prot_t type, int flag)
-{
- struct vmspace *vms = VMMAP_TO_VMSPACE(map);
- pmap_t pmap = &vms->vms_pmap;
- void *pmi = pmap->pm_impl;
- const uintptr_t addr = off;
- vmspace_mapping_t *vmsm;
- struct vm_object *vmo;
- uint_t prot, map_lvl;
- pfn_t pfn;
- uintptr_t map_addr;
-
- mutex_enter(&vms->vms_lock);
- if (vmspace_pmap_iswired(vms, addr, &prot) == 0) {
- int err = 0;
-
- /*
- * It is possible that multiple vCPUs will race to fault-in a
- * given address. In such cases, the race loser(s) will
- * encounter the already-mapped page, needing to do nothing
- * more than consider it a success.
- *
- * If the fault exceeds protection, it is an obvious error.
- */
- if ((prot & type) != type) {
- err = FC_PROT;
- }
-
- mutex_exit(&vms->vms_lock);
- return (err);
- }
-
- /* Try to wire up the address */
- if ((vmsm = vm_mapping_find(vms, addr, 0, B_FALSE)) == NULL) {
- mutex_exit(&vms->vms_lock);
- return (FC_NOMAP);
- }
- vmo = vmsm->vmsm_object;
- prot = vmsm->vmsm_prot;
-
- /* XXXJOY: punt on large pages for now */
- pfn = vmo->vmo_pager(vmo, VMSM_OFFSET(vmsm, addr), NULL, NULL);
- map_lvl = 0;
- map_addr = P2ALIGN((uintptr_t)addr, LEVEL_SIZE(map_lvl));
- VERIFY(pfn != PFN_INVALID);
-
- /*
- * If pmap failure is to be handled, the previously acquired page locks
- * would need to be released.
- */
- VERIFY0(pmap->pm_ops->vpo_map(pmi, map_addr, pfn, map_lvl, prot,
- vmo->vmo_attr));
- pmap->pm_eptgen++;
-
- mutex_exit(&vms->vms_lock);
- return (0);
-}
-
-int
-vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
- vm_prot_t prot, vm_page_t *ma, int max_count)
-{
- struct vmspace *vms = VMMAP_TO_VMSPACE(map);
- const uintptr_t vaddr = addr;
- vmspace_mapping_t *vmsm;
- struct vm_object *vmo;
- vm_page_t vmp;
-
- ASSERT0(addr & PAGEOFFSET);
- ASSERT(len == PAGESIZE);
- ASSERT(max_count == 1);
-
- /*
- * Unlike practically all of the other logic that queries or
- * manipulates vmspace objects, vm_fault_quick_hold_pages() does so
- * without holding vms_lock. This is safe because bhyve ensures that
- * changes to the vmspace map occur only when all other threads have
- * been excluded from running.
- *
- * Since this task can count on vms_maplist remaining static and does
- * not need to modify the pmap (like vm_fault might), it can proceed
- * without the lock. The vm_object has independent refcount and lock
- * protection, while the vmo_pager methods do not rely on vms_lock for
- * safety.
- *
- * Performing this work without locks is critical in cases where
- * multiple vCPUs require simultaneous instruction emulation, such as
- * for frequent guest APIC accesses on a host that lacks hardware
- * acceleration for that behavior.
- */
- if ((vmsm = vm_mapping_find(vms, vaddr, PAGESIZE, B_TRUE)) == NULL ||
- (prot & ~vmsm->vmsm_prot) != 0) {
- return (-1);
- }
-
- vmp = kmem_zalloc(sizeof (struct vm_page), KM_SLEEP);
-
- vmo = vmsm->vmsm_object;
- vm_object_reference(vmo);
- vmp->vmp_obj_held = vmo;
- vmp->vmp_pfn = vmo->vmo_pager(vmo, VMSM_OFFSET(vmsm, vaddr), NULL,
- NULL);
-
- *ma = vmp;
- return (1);
-}
-
-/*
- * Find a suitable location for a mapping (and install it).
- */
-int
-vm_map_find(vm_map_t map, vm_object_t vmo, vm_ooffset_t off, vm_offset_t *addr,
- vm_size_t len, vm_offset_t max_addr, int find_flags, vm_prot_t prot,
- vm_prot_t prot_max, int cow)
-{
- struct vmspace *vms = VMMAP_TO_VMSPACE(map);
- const size_t size = (size_t)len;
- const uintptr_t uoff = (uintptr_t)off;
- uintptr_t base = *addr;
- vmspace_mapping_t *vmsm;
- int res = 0;
-
- /* For use in vmm only */
- VERIFY(find_flags == VMFS_NO_SPACE); /* essentially MAP_FIXED */
- VERIFY(max_addr == 0);
-
- if (size == 0 || off < 0 ||
- uoff >= (uoff + size) || vmo->vmo_size < (uoff + size)) {
- return (EINVAL);
- }
-
- if (*addr >= vms->vms_size) {
- return (ENOMEM);
- }
-
- vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP);
-
- mutex_enter(&vms->vms_lock);
- vms->vms_map_changing = B_TRUE;
- if (!vm_mapping_gap(vms, base, size)) {
- res = ENOMEM;
- goto out;
- }
-
- if (res == 0) {
- vmsm->vmsm_object = vmo;
- vmsm->vmsm_addr = base;
- vmsm->vmsm_len = len;
- vmsm->vmsm_offset = (off_t)uoff;
- vmsm->vmsm_prot = prot;
- list_insert_tail(&vms->vms_maplist, vmsm);
-
- /* Communicate out the chosen address. */
- *addr = (vm_offset_t)base;
- }
-out:
- vms->vms_map_changing = B_FALSE;
- mutex_exit(&vms->vms_lock);
- if (res != 0) {
- kmem_free(vmsm, sizeof (*vmsm));
- }
- return (res);
-}
-
-int
-vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
-{
- struct vmspace *vms = VMMAP_TO_VMSPACE(map);
- pmap_t pmap = &vms->vms_pmap;
- void *pmi = pmap->pm_impl;
- const uintptr_t addr = start;
- const size_t size = (size_t)(end - start);
- vmspace_mapping_t *vmsm;
-
- ASSERT(start < end);
-
- mutex_enter(&vms->vms_lock);
- vms->vms_map_changing = B_TRUE;
- /* expect to match existing mapping exactly */
- if ((vmsm = vm_mapping_find(vms, addr, size, B_FALSE)) == NULL ||
- vmsm->vmsm_addr != addr || vmsm->vmsm_len != size) {
- vms->vms_map_changing = B_FALSE;
- mutex_exit(&vms->vms_lock);
- return (ENOENT);
- }
-
- (void) pmap->pm_ops->vpo_unmap(pmi, addr, end);
- pmap->pm_eptgen++;
-
- vm_mapping_remove(vms, vmsm);
- vms->vms_map_changing = B_FALSE;
- mutex_exit(&vms->vms_lock);
- return (0);
-}
-
-int
-vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
-{
- struct vmspace *vms = VMMAP_TO_VMSPACE(map);
- pmap_t pmap = &vms->vms_pmap;
- void *pmi = pmap->pm_impl;
- const uintptr_t addr = start;
- const size_t size = end - start;
- vmspace_mapping_t *vmsm;
- struct vm_object *vmo;
- uint_t prot;
-
- mutex_enter(&vms->vms_lock);
-
- /* For the time being, only exact-match mappings are expected */
- if ((vmsm = vm_mapping_find(vms, addr, size, B_FALSE)) == NULL) {
- mutex_exit(&vms->vms_lock);
- return (FC_NOMAP);
- }
- vmo = vmsm->vmsm_object;
- prot = vmsm->vmsm_prot;
-
- for (uintptr_t pos = addr; pos < end; ) {
- pfn_t pfn;
- uintptr_t pg_size, map_addr;
- uint_t map_lvl = 0;
-
- /* XXXJOY: punt on large pages for now */
- pfn = vmo->vmo_pager(vmo, VMSM_OFFSET(vmsm, pos), NULL, NULL);
- pg_size = LEVEL_SIZE(map_lvl);
- map_addr = P2ALIGN(pos, pg_size);
- VERIFY(pfn != PFN_INVALID);
-
- VERIFY0(pmap->pm_ops->vpo_map(pmi, map_addr, pfn, map_lvl,
- prot, vmo->vmo_attr));
- vms->vms_pmap.pm_eptgen++;
-
- pos += pg_size;
- }
-
- mutex_exit(&vms->vms_lock);
-
- return (0);
-}
-
-/* Provided custom for bhyve 'devmem' segment mapping */
-int
-vm_segmap_obj(vm_object_t vmo, off_t map_off, size_t size, struct as *as,
- caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags)
-{
- int err;
-
- VERIFY(map_off >= 0);
- VERIFY(size <= vmo->vmo_size);
- VERIFY((size + map_off) <= vmo->vmo_size);
-
- if (vmo->vmo_type != OBJT_DEFAULT) {
- /* Only support default objects for now */
- return (ENOTSUP);
- }
-
- as_rangelock(as);
-
- err = choose_addr(as, addrp, size, 0, ADDR_VACALIGN, flags);
- if (err == 0) {
- segvmm_crargs_t svma;
-
- svma.obj = vmo;
- svma.offset = map_off;
- svma.prot = prot;
-
- err = as_map(as, *addrp, size, segvmm_create, &svma);
- }
-
- as_rangeunlock(as);
- return (err);
-}
-
-int
-vm_segmap_space(struct vmspace *vms, off_t off, struct as *as, caddr_t *addrp,
- off_t len, uint_t prot, uint_t maxprot, uint_t flags)
-{
- const uintptr_t addr = (uintptr_t)off;
- const size_t size = (uintptr_t)len;
- vmspace_mapping_t *vmsm;
- vm_object_t vmo;
- int err;
-
- if (off < 0 || len <= 0 ||
- (addr & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) {
- return (EINVAL);
- }
-
- mutex_enter(&vms->vms_lock);
- if ((vmsm = vm_mapping_find(vms, addr, size, B_FALSE)) == NULL) {
- mutex_exit(&vms->vms_lock);
- return (ENXIO);
- }
- if ((prot & ~(vmsm->vmsm_prot | PROT_USER)) != 0) {
- mutex_exit(&vms->vms_lock);
- return (EACCES);
- }
- vmo = vmsm->vmsm_object;
- if (vmo->vmo_type != OBJT_DEFAULT) {
- /* Only support default objects for now */
- mutex_exit(&vms->vms_lock);
- return (ENOTSUP);
- }
-
- as_rangelock(as);
-
- err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags);
- if (err == 0) {
- segvmm_crargs_t svma;
- const uintptr_t addroff = addr - vmsm->vmsm_addr;
- const uintptr_t mapoff = addroff + vmsm->vmsm_offset;
-
- VERIFY(addroff < vmsm->vmsm_len);
- VERIFY((vmsm->vmsm_len - addroff) >= size);
- VERIFY(mapoff < vmo->vmo_size);
- VERIFY((mapoff + size) <= vmo->vmo_size);
-
- svma.obj = vmo;
- svma.offset = mapoff;
- svma.prot = prot;
-
- err = as_map(as, *addrp, len, segvmm_create, &svma);
- }
-
- as_rangeunlock(as);
- mutex_exit(&vms->vms_lock);
- return (err);
-}
-
-void
-vm_page_unwire(vm_page_t vmp, uint8_t nqueue __unused)
-{
- ASSERT(!MUTEX_HELD(&vmp->vmp_lock));
- mutex_enter(&vmp->vmp_lock);
-
- VERIFY(vmp->vmp_pfn != PFN_INVALID);
-
- vm_object_deallocate(vmp->vmp_obj_held);
- vmp->vmp_obj_held = NULL;
- vmp->vmp_pfn = PFN_INVALID;
-
- mutex_exit(&vmp->vmp_lock);
-
- mutex_destroy(&vmp->vmp_lock);
- kmem_free(vmp, sizeof (*vmp));
-}
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_vm.c b/usr/src/uts/i86pc/io/vmm/vmm_vm.c
new file mode 100644
index 0000000000..debeec605a
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm_vm.c
@@ -0,0 +1,1430 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ * Copyright 2021 Oxide Computer Company
+ * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
+ */
+
+#include <sys/param.h>
+#include <sys/kmem.h>
+#include <sys/thread.h>
+#include <sys/list.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/ddi.h>
+#include <sys/sysmacros.h>
+#include <sys/machsystm.h>
+#include <sys/vmsystm.h>
+#include <sys/malloc.h>
+#include <sys/x86_archext.h>
+#include <vm/as.h>
+#include <vm/hat_i86.h>
+#include <vm/seg_vn.h>
+#include <vm/seg_kmem.h>
+
+#include <sys/vmm_vm.h>
+#include <sys/seg_vmm.h>
+#include <sys/vmm_kernel.h>
+#include <sys/vmm_reservoir.h>
+#include <sys/vmm_gpt.h>
+
+
+/*
+ * VMM Virtual Memory
+ *
+ * History
+ *
+ * When bhyve was ported to illumos, one significant hole was handling guest
+ * memory and memory accesses. In the original Pluribus port, bhyve itself
+ * manually handled the EPT structures for guest memory. The updated sources
+ * (from FreeBSD 11) took a different approach, using the native FreeBSD VM
+ * system for memory allocations and management of the EPT structures. Keeping
+ * source differences to a minimum was a priority, so illumos-bhyve implemented
+ * a makeshift "VM shim" which exposed the bare minimum of those interfaces to
+ * boot and run guests.
+ *
+ * While the VM shim was successful in getting illumos-bhyve to a functional
+ * state on Intel (and later AMD) gear, the FreeBSD-specific nature of the
+ * compatibility interfaces made it awkward to use. As source differences with
+ * the upstream kernel code became less of a concern, and upcoming features
+ * (such as live migration) would demand more of those VM interfaces, it became
+ * clear that an overhaul was prudent.
+ *
+ * Design
+ *
+ * The new VM system for bhyve retains a number of the same concepts as what it
+ * replaces:
+ *
+ * - `vmspace_t` is the top-level entity for a guest memory space
+ * - `vm_object_t` represents a memory object which can be mapped into a vmspace
+ * - `vm_page_t` represents a page hold within a given vmspace, providing access
+ * to the underlying memory page
+ *
+ * Unlike the old code, where most of the involved structures were exposed via
+ * public definitions, this replacement VM interface keeps all involved
+ * structures opaque to consumers. Furthermore, there is a clear delineation
+ * between infrequent administrative operations (such as mapping/unmapping
+ * regions) and common data-path operations (attempting a page hold at a given
+ * guest-physical address). Those administrative operations are performed
+ * directly against the vmspace, whereas the data-path operations are performed
+ * through a `vm_client_t` handle. That VM client abstraction is meant to
+ * reduce contention and overhead for frequent access operations and provide
+ * debugging insight into how different subcomponents are accessing the vmspace.
+ * A VM client is allocated for each vCPU, each viona ring (via the vmm_drv
+ * interface) and each VMM userspace segment mapping.
+ *
+ * Exclusion
+ *
+ * Making changes to the vmspace (such as mapping or unmapping regions) requires
+ * other accessors be excluded while the change is underway to prevent them from
+ * observing invalid intermediate states. A simple approach could use a mutex
+ * or rwlock to achieve this, but that risks contention when the rate of access
+ * to the vmspace is high.
+ *
+ * Since vmspace changes (map/unmap) are rare, we can instead do the exclusion
+ * on a per-vm_client_t basis. While this raises the cost for vmspace changes,
+ * it means that the much more common page accesses through the vm_client can
+ * normally proceed unimpeded and independently.
+ *
+ * When a change to the vmspace is required, the caller will put the vmspace in
+ * a 'hold' state, iterating over all associated vm_client instances, waiting
+ * for them to complete any in-flight lookup (indicated by VCS_ACTIVE) before
+ * setting VCS_HOLD in their state flag fields. With VCS_HOLD set, any call on
+ * the vm_client which would access the vmspace state (vmc_hold or vmc_fault)
+ * will block until the hold condition is cleared. Once the hold is asserted
+ * for all clients, the vmspace change can proceed with confidence. Upon
+ * completion of that operation, VCS_HOLD is cleared from the clients, and they
+ * are released to resume vmspace accesses.
+ *
+ * vCPU Consumers
+ *
+ * Access to the vmspace by vCPUs running in guest context differs from
+ * emulation-related vm_client activity: in guest context, a vCPU relies
+ * solely on the contents of the page tables. Furthermore, the existing
+ * VCS_HOLD mechanism used to exclude
+ * client access is not feasible when entering guest context, since interrupts
+ * are disabled, making it impossible to block entry. This is not a concern as
+ * long as vmspace modifications never place the page tables in invalid states
+ * (either intermediate, or final). The vm_client hold mechanism does provide
+ * the means to IPI vCPU consumers which will trigger a notification once they
+ * report their exit from guest context. This can be used to ensure that page
+ * table modifications are made visible to those vCPUs within a certain
+ * time frame.
+ */
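
A minimal sketch of the data-path usage described above (the wrapper function is hypothetical; the client and page calls are the ones introduced by this change): allocate a vm_client against the vmspace, hold the page backing a GPA, access it through the hold, then release it.

	/*
	 * Hypothetical consumer of the vm_client data path: hold the page
	 * backing `gpa`, write one byte into it, and drop the hold.
	 */
	static int
	example_poke_guest_byte(vmspace_t *vms, uintptr_t gpa, uint8_t val)
	{
		vm_client_t *vmc = vmspace_client_alloc(vms);
		vm_page_t *vmp;
		int err = 0;

		vmp = vmc_hold(vmc, gpa & PAGEMASK, PROT_READ | PROT_WRITE);
		if (vmp == NULL) {
			err = EFAULT;
		} else {
			uint8_t *page = vmp_get_writable(vmp);

			page[gpa & PAGEOFFSET] = val;
			vmp_release(vmp);
		}
		vmc_destroy(vmc);
		return (err);
	}
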
+
+typedef struct vmspace_mapping {
+ list_node_t vmsm_node;
+ vm_object_t *vmsm_object; /* object backing this mapping */
+ uintptr_t vmsm_addr; /* start addr in vmspace for mapping */
+ size_t vmsm_len; /* length (in bytes) of mapping */
+ off_t vmsm_offset; /* byte offset into object */
+ uint_t vmsm_prot;
+} vmspace_mapping_t;
+
+#define VMSM_OFFSET(vmsm, addr) ( \
+ (vmsm)->vmsm_offset + \
+ ((addr) - (uintptr_t)(vmsm)->vmsm_addr))
+
+typedef enum vm_client_state {
+ VCS_IDLE = 0,
+ /* currently accessing vmspace for client operation (hold or fault) */
+ VCS_ACTIVE = (1 << 0),
+ /* client hold requested/asserted */
+ VCS_HOLD = (1 << 1),
+ /* vCPU is accessing page tables in guest context */
+ VCS_ON_CPU = (1 << 2),
+ /* client has been orphaned (no more access to vmspace) */
+ VCS_ORPHANED = (1 << 3),
+ /* client undergoing destroy operation */
+ VCS_DESTROY = (1 << 4),
+} vm_client_state_t;
+
+struct vmspace {
+ kmutex_t vms_lock;
+ kcondvar_t vms_cv;
+ bool vms_held;
+ uintptr_t vms_size; /* immutable after creation */
+
+ /* (nested) page table state */
+ vmm_gpt_t *vms_gpt;
+ uint64_t vms_pt_gen;
+ uint64_t vms_pages_mapped;
+ bool vms_track_dirty;
+
+ list_t vms_maplist;
+ list_t vms_clients;
+};
+
+struct vm_client {
+ vmspace_t *vmc_space;
+ list_node_t vmc_node;
+
+ kmutex_t vmc_lock;
+ kcondvar_t vmc_cv;
+ vm_client_state_t vmc_state;
+ int vmc_cpu_active;
+ uint64_t vmc_cpu_gen;
+ bool vmc_track_dirty;
+ vmc_inval_cb_t vmc_inval_func;
+ void *vmc_inval_data;
+
+ list_t vmc_held_pages;
+};
+
+typedef enum vm_object_type {
+ VMOT_NONE,
+ VMOT_MEM,
+ VMOT_MMIO,
+} vm_object_type_t;
+
+struct vm_object {
+ uint_t vmo_refcnt; /* manipulated with atomic ops */
+
+ /* Fields below are fixed at creation time */
+ vm_object_type_t vmo_type;
+ size_t vmo_size;
+ void *vmo_data;
+ uint8_t vmo_attr;
+};
+
+struct vm_page {
+ vm_client_t *vmp_client;
+ list_node_t vmp_node;
+ vm_page_t *vmp_chain;
+ uintptr_t vmp_gpa;
+ pfn_t vmp_pfn;
+ uint64_t *vmp_ptep;
+ vm_object_t *vmp_obj_ref;
+ int vmp_prot;
+};
+
+#define VMC_IS_ACTIVE(vmc) (((vmc)->vmc_state & VCS_ACTIVE) != 0)
+
+static vmspace_mapping_t *vm_mapping_find(vmspace_t *, uintptr_t, size_t);
+static void vmc_space_hold(vm_client_t *);
+static void vmc_space_release(vm_client_t *, bool);
+static void vmc_space_invalidate(vm_client_t *, uintptr_t, size_t, uint64_t);
+static void vmc_space_unmap(vm_client_t *, uintptr_t, size_t, vm_object_t *);
+static vm_client_t *vmc_space_orphan(vm_client_t *, vmspace_t *);
+
+
+/*
+ * Create a new vmspace with a maximum address of `end`.
+ */
+vmspace_t *
+vmspace_alloc(size_t end, vmm_pte_ops_t *pte_ops, bool track_dirty)
+{
+ vmspace_t *vms;
+ const uintptr_t size = end + 1;
+
+ /*
+ * This whole mess is built on the assumption that a 64-bit address
+ * space is available to work with for the various pagetable tricks.
+ */
+ VERIFY(size > 0 && (size & PAGEOFFSET) == 0 &&
+ size <= (uintptr_t)USERLIMIT);
+
+ vms = kmem_zalloc(sizeof (*vms), KM_SLEEP);
+ vms->vms_size = size;
+ list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t),
+ offsetof(vmspace_mapping_t, vmsm_node));
+ list_create(&vms->vms_clients, sizeof (vm_client_t),
+ offsetof(vm_client_t, vmc_node));
+
+ vms->vms_gpt = vmm_gpt_alloc(pte_ops);
+ vms->vms_pt_gen = 1;
+ vms->vms_track_dirty = track_dirty;
+
+ return (vms);
+}
+
+/*
+ * Destroy a vmspace. All regions in the space must be unmapped. Any remaining
+ * clients will be orphaned.
+ */
+void
+vmspace_destroy(vmspace_t *vms)
+{
+ mutex_enter(&vms->vms_lock);
+ VERIFY(list_is_empty(&vms->vms_maplist));
+
+ if (!list_is_empty(&vms->vms_clients)) {
+ vm_client_t *vmc = list_head(&vms->vms_clients);
+ while (vmc != NULL) {
+ vmc = vmc_space_orphan(vmc, vms);
+ }
+ /*
+ * Wait for any clients which were in the process of destroying
+ * themselves to disappear.
+ */
+ while (!list_is_empty(&vms->vms_clients)) {
+ cv_wait(&vms->vms_cv, &vms->vms_lock);
+ }
+ }
+ VERIFY(list_is_empty(&vms->vms_clients));
+
+ vmm_gpt_free(vms->vms_gpt);
+ mutex_exit(&vms->vms_lock);
+
+ mutex_destroy(&vms->vms_lock);
+ cv_destroy(&vms->vms_cv);
+ list_destroy(&vms->vms_maplist);
+ list_destroy(&vms->vms_clients);
+
+ kmem_free(vms, sizeof (*vms));
+}
+
+/*
+ * Retrieve the count of resident (mapped into the page tables) pages.
+ */
+uint64_t
+vmspace_resident_count(vmspace_t *vms)
+{
+ return (vms->vms_pages_mapped);
+}
+
+static pfn_t
+vm_object_pager_reservoir(vm_object_t *vmo, uintptr_t off)
+{
+ vmmr_region_t *region;
+ pfn_t pfn;
+
+ ASSERT3U(vmo->vmo_type, ==, VMOT_MEM);
+
+ region = vmo->vmo_data;
+ pfn = vmmr_region_pfn_at(region, off);
+
+ return (pfn);
+}
+
+static pfn_t
+vm_object_pager_mmio(vm_object_t *vmo, uintptr_t off)
+{
+ pfn_t pfn;
+
+ ASSERT3U(vmo->vmo_type, ==, VMOT_MMIO);
+ ASSERT3P(vmo->vmo_data, !=, NULL);
+ ASSERT3U(off, <, vmo->vmo_size);
+
+ pfn = ((uintptr_t)vmo->vmo_data + off) >> PAGESHIFT;
+
+ return (pfn);
+}
+
+/*
+ * Allocate a VM object backed by VMM reservoir memory.
+ */
+vm_object_t *
+vm_object_mem_allocate(size_t size, bool transient)
+{
+ int err;
+ vmmr_region_t *region = NULL;
+ vm_object_t *vmo;
+
+ ASSERT3U(size, !=, 0);
+ ASSERT3U(size & PAGEOFFSET, ==, 0);
+
+ err = vmmr_alloc(size, transient, &region);
+ if (err != 0) {
+ return (NULL);
+ }
+
+ vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);
+
+ /* For now, these are to stay fixed after allocation */
+ vmo->vmo_type = VMOT_MEM;
+ vmo->vmo_size = size;
+ vmo->vmo_attr = MTRR_TYPE_WB;
+ vmo->vmo_data = region;
+ vmo->vmo_refcnt = 1;
+
+ return (vmo);
+}
+
+static vm_object_t *
+vm_object_mmio_allocate(size_t size, uintptr_t hpa)
+{
+ vm_object_t *vmo;
+
+ ASSERT3U(size, !=, 0);
+ ASSERT3U(size & PAGEOFFSET, ==, 0);
+ ASSERT3U(hpa & PAGEOFFSET, ==, 0);
+
+ vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);
+
+ /* For now, these are to stay fixed after allocation */
+ vmo->vmo_type = VMOT_MMIO;
+ vmo->vmo_size = size;
+ vmo->vmo_attr = MTRR_TYPE_UC;
+ vmo->vmo_data = (void *)hpa;
+ vmo->vmo_refcnt = 1;
+
+ return (vmo);
+}
+
+/*
+ * Allocate a VM object backed by an existing range of physical memory.
+ */
+vm_object_t *
+vmm_mmio_alloc(vmspace_t *vmspace, uintptr_t gpa, size_t len, uintptr_t hpa)
+{
+ int error;
+ vm_object_t *obj;
+
+ obj = vm_object_mmio_allocate(len, hpa);
+ if (obj != NULL) {
+ error = vmspace_map(vmspace, obj, 0, gpa, len,
+ PROT_READ | PROT_WRITE);
+ if (error != 0) {
+ vm_object_release(obj);
+ obj = NULL;
+ }
+ }
+
+ return (obj);
+}
+
+/*
+ * Release a vm_object reference
+ */
+void
+vm_object_release(vm_object_t *vmo)
+{
+ ASSERT(vmo != NULL);
+
+ uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt);
+ /* underflow would be a deadly serious mistake */
+ VERIFY3U(ref, !=, UINT_MAX);
+ if (ref != 0) {
+ return;
+ }
+
+ switch (vmo->vmo_type) {
+ case VMOT_MEM:
+ vmmr_free((vmmr_region_t *)vmo->vmo_data);
+ break;
+ case VMOT_MMIO:
+ break;
+ default:
+ panic("unexpected object type %u", vmo->vmo_type);
+ break;
+ }
+
+ vmo->vmo_data = NULL;
+ vmo->vmo_size = 0;
+ kmem_free(vmo, sizeof (*vmo));
+}
+
+/*
+ * Increase refcount for vm_object reference
+ */
+void
+vm_object_reference(vm_object_t *vmo)
+{
+ ASSERT(vmo != NULL);
+
+ uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt);
+ /* overflow would be a deadly serious mistake */
+ VERIFY3U(ref, !=, 0);
+}
+
+/*
+ * Get the host-physical PFN for a given offset into a vm_object.
+ *
+ * The provided `off` must be within the allocated size of the vm_object.
+ */
+pfn_t
+vm_object_pfn(vm_object_t *vmo, uintptr_t off)
+{
+ const uintptr_t aligned_off = off & PAGEMASK;
+
+ switch (vmo->vmo_type) {
+ case VMOT_MEM:
+ return (vm_object_pager_reservoir(vmo, aligned_off));
+ case VMOT_MMIO:
+ return (vm_object_pager_mmio(vmo, aligned_off));
+ case VMOT_NONE:
+ break;
+ }
+ panic("unexpected object type %u", vmo->vmo_type);
+}
+
+static vmspace_mapping_t *
+vm_mapping_find(vmspace_t *vms, uintptr_t addr, size_t size)
+{
+ vmspace_mapping_t *vmsm;
+ list_t *ml = &vms->vms_maplist;
+ const uintptr_t range_end = addr + size;
+
+ ASSERT3U(addr, <=, range_end);
+
+ if (addr >= vms->vms_size) {
+ return (NULL);
+ }
+ for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
+ const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len;
+
+ if (addr >= vmsm->vmsm_addr && addr < seg_end) {
+ if (range_end <= seg_end) {
+ return (vmsm);
+ } else {
+ return (NULL);
+ }
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * Check to see if any mappings reside within the [addr, addr + size) span in
+ * the vmspace, returning true if that span is indeed empty.
+ */
+static bool
+vm_mapping_gap(vmspace_t *vms, uintptr_t addr, size_t size)
+{
+ vmspace_mapping_t *vmsm;
+ list_t *ml = &vms->vms_maplist;
+ const uintptr_t range_end = addr + size - 1;
+
+ ASSERT(MUTEX_HELD(&vms->vms_lock));
+ ASSERT(size > 0);
+
+ for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
+ const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len - 1;
+
+ /*
+ * The two ranges do not overlap if the start of either of
+ * them is after the end of the other.
+ */
+ if (vmsm->vmsm_addr > range_end || addr > seg_end)
+ continue;
+ return (false);
+ }
+ return (true);
+}
+
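+/*
+ * Remove a mapping from the vmspace, releasing its reference on the underlying
+ * vm_object. Expects the vmspace to be both locked and held.
+ */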
+static void
+vm_mapping_remove(vmspace_t *vms, vmspace_mapping_t *vmsm)
+{
+ list_t *ml = &vms->vms_maplist;
+
+ ASSERT(MUTEX_HELD(&vms->vms_lock));
+ ASSERT(vms->vms_held);
+
+ list_remove(ml, vmsm);
+ vm_object_release(vmsm->vmsm_object);
+ kmem_free(vmsm, sizeof (*vmsm));
+}
+
+/*
+ * Enter a hold state on the vmspace. This ensures that all VM clients
+ * associated with the vmspace are excluded from establishing new page holds,
+ * or any other actions which would require accessing vmspace state subject to
+ * potential change.
+ *
+ * Returns with vmspace_t`vms_lock held.
+ */
+static void
+vmspace_hold_enter(vmspace_t *vms)
+{
+ mutex_enter(&vms->vms_lock);
+ VERIFY(!vms->vms_held);
+
+ vm_client_t *vmc = list_head(&vms->vms_clients);
+ for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
+ vmc_space_hold(vmc);
+ }
+ vms->vms_held = true;
+}
+
+/*
+ * Exit a hold state on the vmspace. This allows all VM clients associated
+ * with the vmspace to establish new page holds again, and to partake in other
+ * actions which require accessing changed vmspace state. If `kick_on_cpu` is
+ * true, then any CPUs actively using the page tables will be IPIed, and the
+ * call will block until they have acknowledged being ready to use the latest
+ * state of the tables.
+ *
+ * Requires vmspace_t`vms_lock be held, which is released as part of the call.
+ */
+static void
+vmspace_hold_exit(vmspace_t *vms, bool kick_on_cpu)
+{
+ ASSERT(MUTEX_HELD(&vms->vms_lock));
+ VERIFY(vms->vms_held);
+
+ vm_client_t *vmc = list_head(&vms->vms_clients);
+ for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
+ vmc_space_release(vmc, kick_on_cpu);
+ }
+ vms->vms_held = false;
+ mutex_exit(&vms->vms_lock);
+}
+
+/*
+ * Attempt to map a vm_object span into the vmspace.
+ *
+ * Requirements:
+ * - `obj_off`, `addr`, and `len` must be page-aligned
+ * - `obj_off` cannot be greater than the allocated size of the object
+ * - [`obj_off`, `obj_off` + `len`) span cannot extend beyond the allocated
+ * size of the object
+ * - [`addr`, `addr` + `len`) span cannot reside beyond the maximum address
+ * of the vmspace
+ */
+int
+vmspace_map(vmspace_t *vms, vm_object_t *vmo, uintptr_t obj_off, uintptr_t addr,
+ size_t len, uint8_t prot)
+{
+ vmspace_mapping_t *vmsm;
+ int res = 0;
+
+ if (len == 0 || (addr + len) < addr ||
+ obj_off >= (obj_off + len) || vmo->vmo_size < (obj_off + len)) {
+ return (EINVAL);
+ }
+ if ((addr + len) >= vms->vms_size) {
+ return (ENOMEM);
+ }
+
+ vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP);
+
+ vmspace_hold_enter(vms);
+ if (!vm_mapping_gap(vms, addr, len)) {
+ kmem_free(vmsm, sizeof (*vmsm));
+ res = ENOMEM;
+ } else {
+ vmsm->vmsm_object = vmo;
+ vmsm->vmsm_addr = addr;
+ vmsm->vmsm_len = len;
+ vmsm->vmsm_offset = (off_t)obj_off;
+ vmsm->vmsm_prot = prot;
+ list_insert_tail(&vms->vms_maplist, vmsm);
+
+ /*
+ * Make sure the GPT has tables ready for leaf entries across
+ * the entire new mapping.
+ */
+ vmm_gpt_populate_region(vms->vms_gpt, addr, addr + len);
+ }
+ vmspace_hold_exit(vms, false);
+ return (res);
+}
+
+/*
+ * Unmap a region of the vmspace.
+ *
+ * Presently the [start, end) span must equal a region previously mapped by a
+ * call to vmspace_map().
+ */
+int
+vmspace_unmap(vmspace_t *vms, uintptr_t start, uintptr_t end)
+{
+ const size_t size = (size_t)(end - start);
+ vmspace_mapping_t *vmsm;
+ vm_client_t *vmc;
+ uint64_t gen = 0;
+
+ ASSERT(start < end);
+
+ vmspace_hold_enter(vms);
+ /* expect to match existing mapping exactly */
+ if ((vmsm = vm_mapping_find(vms, start, size)) == NULL ||
+ vmsm->vmsm_addr != start || vmsm->vmsm_len != size) {
+ vmspace_hold_exit(vms, false);
+ return (ENOENT);
+ }
+
+ /* Prepare clients (and their held pages) for the unmap. */
+ for (vmc = list_head(&vms->vms_clients); vmc != NULL;
+ vmc = list_next(&vms->vms_clients, vmc)) {
+ vmc_space_unmap(vmc, start, size, vmsm->vmsm_object);
+ }
+
+ /* Clear all PTEs for region */
+ if (vmm_gpt_unmap_region(vms->vms_gpt, start, end) != 0) {
+ vms->vms_pt_gen++;
+ gen = vms->vms_pt_gen;
+ }
+ /* ... and the intermediate (directory) PTEs as well */
+ vmm_gpt_vacate_region(vms->vms_gpt, start, end);
+
+ /*
+ * If pages were actually unmapped from the GPT, provide clients with
+ * an invalidation notice.
+ */
+ if (gen != 0) {
+ for (vmc = list_head(&vms->vms_clients); vmc != NULL;
+ vmc = list_next(&vms->vms_clients, vmc)) {
+ vmc_space_invalidate(vmc, start, size, vms->vms_pt_gen);
+ }
+ }
+
+ vm_mapping_remove(vms, vmsm);
+ vmspace_hold_exit(vms, true);
+ return (0);
+}
+
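+/*
+ * Look up the GPT leaf entry for `gpa`, establishing the mapping on demand if
+ * one is not already present, and check it against `req_prot`. On success, the
+ * backing PFN and a pointer to the leaf PTE are optionally emitted via `pfnp`
+ * and `ptepp`.
+ */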
+static int
+vmspace_lookup_map(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp,
+ uint64_t **ptepp)
+{
+ vmm_gpt_t *gpt = vms->vms_gpt;
+ uint64_t *entries[MAX_GPT_LEVEL], *leaf;
+ pfn_t pfn = PFN_INVALID;
+ uint_t prot;
+
+ ASSERT0(gpa & PAGEOFFSET);
+ ASSERT((req_prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) != PROT_NONE);
+
+ vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
+ leaf = entries[LEVEL1];
+ if (leaf == NULL) {
+ /*
+ * Since we populated the intermediate tables for any regions
+ * mapped in the GPT, an empty leaf entry indicates there is no
+ * mapping, populated or not, at this GPA.
+ */
+ return (FC_NOMAP);
+ }
+
+ if (vmm_gpt_is_mapped(gpt, leaf, &pfn, &prot)) {
+ if ((req_prot & prot) != req_prot) {
+ return (FC_PROT);
+ }
+ } else {
+ vmspace_mapping_t *vmsm;
+ vm_object_t *vmo;
+
+ /*
+ * Because of the prior leaf check, we should be confident that
+ * _some_ mapping covers this GPA
+ */
+ vmsm = vm_mapping_find(vms, gpa, PAGESIZE);
+ VERIFY(vmsm != NULL);
+
+ if ((req_prot & vmsm->vmsm_prot) != req_prot) {
+ return (FC_PROT);
+ }
+ vmo = vmsm->vmsm_object;
+ pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
+ VERIFY(pfn != PFN_INVALID);
+
+ if (vmm_gpt_map_at(gpt, leaf, pfn, vmsm->vmsm_prot,
+ vmo->vmo_attr)) {
+ atomic_inc_64(&vms->vms_pages_mapped);
+ }
+ }
+
+ ASSERT(pfn != PFN_INVALID && leaf != NULL);
+ if (pfnp != NULL) {
+ *pfnp = pfn;
+ }
+ if (ptepp != NULL) {
+ *ptepp = leaf;
+ }
+ return (0);
+}
+
+/*
+ * Populate (make resident in the page tables) a region of the vmspace.
+ *
+ * Presently the [start, end) span must equal a region previously mapped by a
+ * call to vmspace_map().
+ */
+int
+vmspace_populate(vmspace_t *vms, uintptr_t start, uintptr_t end)
+{
+ const size_t size = end - start;
+ vmspace_mapping_t *vmsm;
+
+ mutex_enter(&vms->vms_lock);
+
+ /* For the time being, only exact-match mappings are expected */
+ if ((vmsm = vm_mapping_find(vms, start, size)) == NULL) {
+ mutex_exit(&vms->vms_lock);
+ return (FC_NOMAP);
+ }
+
+ vm_object_t *vmo = vmsm->vmsm_object;
+ const int prot = vmsm->vmsm_prot;
+ const uint8_t attr = vmo->vmo_attr;
+ size_t populated = 0;
+ for (uintptr_t gpa = start & PAGEMASK; gpa < end; gpa += PAGESIZE) {
+ const pfn_t pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
+ VERIFY(pfn != PFN_INVALID);
+
+ if (vmm_gpt_map(vms->vms_gpt, gpa, pfn, prot, attr)) {
+ populated++;
+ }
+ }
+ atomic_add_64(&vms->vms_pages_mapped, populated);
+
+ mutex_exit(&vms->vms_lock);
+ return (0);
+}
+
+/*
+ * Allocate a client from a given vmspace.
+ */
+vm_client_t *
+vmspace_client_alloc(vmspace_t *vms)
+{
+ vm_client_t *vmc;
+
+ vmc = kmem_zalloc(sizeof (vm_client_t), KM_SLEEP);
+ vmc->vmc_space = vms;
+ mutex_init(&vmc->vmc_lock, NULL, MUTEX_DRIVER, NULL);
+ cv_init(&vmc->vmc_cv, NULL, CV_DRIVER, NULL);
+ vmc->vmc_state = VCS_IDLE;
+ vmc->vmc_cpu_active = -1;
+ list_create(&vmc->vmc_held_pages, sizeof (vm_page_t),
+ offsetof(vm_page_t, vmp_node));
+ vmc->vmc_track_dirty = vms->vms_track_dirty;
+
+ mutex_enter(&vms->vms_lock);
+ list_insert_tail(&vms->vms_clients, vmc);
+ mutex_exit(&vms->vms_lock);
+
+ return (vmc);
+}
+
+/*
+ * Get the nested page table root pointer (EPTP/NCR3) value.
+ */
+uint64_t
+vmspace_table_root(vmspace_t *vms)
+{
+ return (vmm_gpt_get_pmtp(vms->vms_gpt));
+}
+
+/*
+ * Get the current generation number of the nested page table.
+ */
+uint64_t
+vmspace_table_gen(vmspace_t *vms)
+{
+ return (vms->vms_pt_gen);
+}
+
+/*
+ * Mark a vm_client as active. This will block if/while the client is held by
+ * the vmspace. On success, it returns with vm_client_t`vmc_lock held. It will
+ * fail if the vm_client has been orphaned.
+ */
+static int
+vmc_activate(vm_client_t *vmc)
+{
+ mutex_enter(&vmc->vmc_lock);
+ VERIFY0(vmc->vmc_state & VCS_ACTIVE);
+ if ((vmc->vmc_state & VCS_ORPHANED) != 0) {
+		mutex_exit(&vmc->vmc_lock);
+		return (ENXIO);
+ }
+ while ((vmc->vmc_state & VCS_HOLD) != 0) {
+ cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
+ }
+ vmc->vmc_state |= VCS_ACTIVE;
+ return (0);
+}
+
+/*
+ * Mark a vm_client as no longer active. It must be called with
+ * vm_client_t`vmc_lock already held, and will return with it released.
+ */
+static void
+vmc_deactivate(vm_client_t *vmc)
+{
+ ASSERT(MUTEX_HELD(&vmc->vmc_lock));
+ VERIFY(vmc->vmc_state & VCS_ACTIVE);
+
+ vmc->vmc_state ^= VCS_ACTIVE;
+ if ((vmc->vmc_state & VCS_HOLD) != 0) {
+ cv_broadcast(&vmc->vmc_cv);
+ }
+ mutex_exit(&vmc->vmc_lock);
+}
+
+/*
+ * Indicate that a CPU will be utilizing the nested page tables through this VM
+ * client. Interrupts (and/or the GIF) are expected to be disabled when calling
+ * this function. Returns the generation number of the nested page table (to be
+ * used for TLB invalidations).
+ */
+uint64_t
+vmc_table_enter(vm_client_t *vmc)
+{
+ vmspace_t *vms = vmc->vmc_space;
+ uint64_t gen;
+
+ ASSERT0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
+ ASSERT3S(vmc->vmc_cpu_active, ==, -1);
+
+ /*
+ * Since the NPT activation occurs with interrupts disabled, this must
+ * be done without taking vmc_lock like normal.
+ */
+ gen = vms->vms_pt_gen;
+ vmc->vmc_cpu_active = CPU->cpu_id;
+ vmc->vmc_cpu_gen = gen;
+ atomic_or_uint(&vmc->vmc_state, VCS_ON_CPU);
+
+ return (gen);
+}
+
+/*
+ * Indicate that this VM client is no longer (directly) using the underlying
+ * page tables. Interrupts (and/or the GIF) must be enabled prior to calling
+ * this function.
+ */
+void
+vmc_table_exit(vm_client_t *vmc)
+{
+ mutex_enter(&vmc->vmc_lock);
+
+ ASSERT(vmc->vmc_state & VCS_ON_CPU);
+ vmc->vmc_state ^= VCS_ON_CPU;
+ vmc->vmc_cpu_active = -1;
+ if ((vmc->vmc_state & VCS_HOLD) != 0) {
+ cv_broadcast(&vmc->vmc_cv);
+ }
+
+ mutex_exit(&vmc->vmc_lock);
+}
+
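+/*
+ * Hold a client out of the active state so that changes to the vmspace can be
+ * made safely. Paired with vmc_space_release().
+ */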
+static void
+vmc_space_hold(vm_client_t *vmc)
+{
+ mutex_enter(&vmc->vmc_lock);
+ VERIFY0(vmc->vmc_state & VCS_HOLD);
+
+ /*
+ * Because vmc_table_enter() alters vmc_state from a context where
+ * interrupts are disabled, it cannot pay heed to vmc_lock, so setting
+ * VCS_HOLD must be done atomically here.
+ */
+ atomic_or_uint(&vmc->vmc_state, VCS_HOLD);
+
+ /* Wait for client to go inactive */
+ while ((vmc->vmc_state & VCS_ACTIVE) != 0) {
+ cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
+ }
+ mutex_exit(&vmc->vmc_lock);
+}
+
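+/*
+ * Release a client from the hold established by vmc_space_hold(). If
+ * `kick_on_cpu` is set, any vCPU still on-CPU with the page tables is poked
+ * and waited upon before VCS_HOLD is cleared.
+ */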
+static void
+vmc_space_release(vm_client_t *vmc, bool kick_on_cpu)
+{
+ mutex_enter(&vmc->vmc_lock);
+ VERIFY(vmc->vmc_state & VCS_HOLD);
+
+ if (kick_on_cpu && (vmc->vmc_state & VCS_ON_CPU) != 0) {
+ poke_cpu(vmc->vmc_cpu_active);
+
+ while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
+ cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
+ }
+ }
+
+ /*
+ * Because vmc_table_enter() alters vmc_state from a context where
+ * interrupts are disabled, it cannot pay heed to vmc_lock, so clearing
+ * VCS_HOLD must be done atomically here.
+ */
+ atomic_and_uint(&vmc->vmc_state, ~VCS_HOLD);
+ mutex_exit(&vmc->vmc_lock);
+}
+
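+/*
+ * Notify a held client that the [addr, addr + size) span has been invalidated
+ * at page-table generation `gen`, kicking any vCPU still on-CPU with an older
+ * generation and running the client's invalidation callback, if one is set.
+ */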
+static void
+vmc_space_invalidate(vm_client_t *vmc, uintptr_t addr, size_t size,
+ uint64_t gen)
+{
+ mutex_enter(&vmc->vmc_lock);
+ VERIFY(vmc->vmc_state & VCS_HOLD);
+ if ((vmc->vmc_state & VCS_ON_CPU) != 0) {
+ /*
+ * Wait for clients using an old generation of the page tables
+ * to exit guest context, where they subsequently flush the TLB
+ * for the new generation.
+ */
+ if (vmc->vmc_cpu_gen < gen) {
+ poke_cpu(vmc->vmc_cpu_active);
+
+ while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
+ cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
+ }
+ }
+ }
+ if (vmc->vmc_inval_func != NULL) {
+ vmc_inval_cb_t func = vmc->vmc_inval_func;
+ void *data = vmc->vmc_inval_data;
+
+ /*
+ * Perform the actual invalidation call outside vmc_lock to
+ * avoid lock ordering issues in the consumer. Since the client
+ * is under VCS_HOLD, this is safe.
+ */
+ mutex_exit(&vmc->vmc_lock);
+ func(data, addr, size);
+ mutex_enter(&vmc->vmc_lock);
+ }
+ mutex_exit(&vmc->vmc_lock);
+}
+
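+/*
+ * Notify a held client that the [addr, addr + size) span, backed by `vmo`, is
+ * being unmapped, so that pages held against it can take direct references on
+ * the backing object.
+ */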
+static void
+vmc_space_unmap(vm_client_t *vmc, uintptr_t addr, size_t size,
+ vm_object_t *vmo)
+{
+ mutex_enter(&vmc->vmc_lock);
+ VERIFY(vmc->vmc_state & VCS_HOLD);
+
+ /*
+ * With the current vCPU exclusion invariants in place, we do not expect
+ * a vCPU to be in guest context during an unmap.
+ */
+ VERIFY0(vmc->vmc_state & VCS_ON_CPU);
+
+ /*
+ * Any holds against the unmapped region need to establish their own
+ * reference to the underlying object to avoid a potential
+ * use-after-free.
+ */
+ for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
+ vmp != NULL;
+	    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
+ if (vmp->vmp_gpa < addr ||
+ vmp->vmp_gpa >= (addr + size)) {
+ /* Hold outside region in question */
+ continue;
+ }
+ if (vmp->vmp_obj_ref == NULL) {
+ vm_object_reference(vmo);
+ vmp->vmp_obj_ref = vmo;
+ /* For an unmapped region, PTE is now meaningless */
+ vmp->vmp_ptep = NULL;
+ } else {
+ /*
+ * Object could have gone through cycle of
+ * unmap-map-unmap before the hold was released.
+ */
+ VERIFY3P(vmp->vmp_ptep, ==, NULL);
+ }
+ }
+ mutex_exit(&vmc->vmc_lock);
+}
+
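+/*
+ * Orphan a client from a vmspace which is being torn down, returning the next
+ * client in the vmspace list for the caller's iteration.
+ */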
+static vm_client_t *
+vmc_space_orphan(vm_client_t *vmc, vmspace_t *vms)
+{
+ vm_client_t *next;
+
+ ASSERT(MUTEX_HELD(&vms->vms_lock));
+
+ mutex_enter(&vmc->vmc_lock);
+ VERIFY3P(vmc->vmc_space, ==, vms);
+ VERIFY0(vmc->vmc_state & VCS_ORPHANED);
+ if (vmc->vmc_state & VCS_DESTROY) {
+ /*
+ * This vm_client is currently undergoing destruction, so it
+ * does not need to be orphaned. Let it proceed with its own
+ * clean-up task.
+ */
+ next = list_next(&vms->vms_clients, vmc);
+ } else {
+ /*
+ * Clients are only orphaned when the containing vmspace is
+ * being torn down. All mappings from the vmspace should
+ * already be gone, meaning any remaining held pages should have
+ * direct references to the object.
+ */
+ for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
+ vmp != NULL;
+ vmp = list_next(&vmc->vmc_held_pages, vmp)) {
+ ASSERT3P(vmp->vmp_ptep, ==, NULL);
+ ASSERT3P(vmp->vmp_obj_ref, !=, NULL);
+ }
+
+ /*
+ * After this point, the client will be orphaned, unable to
+ * establish new page holds (or access any vmspace-related
+ * resources) and is in charge of cleaning up after itself.
+ */
+ vmc->vmc_state |= VCS_ORPHANED;
+ next = list_next(&vms->vms_clients, vmc);
+ list_remove(&vms->vms_clients, vmc);
+ vmc->vmc_space = NULL;
+ }
+ mutex_exit(&vmc->vmc_lock);
+ return (next);
+}
+
+/*
+ * Attempt to hold a page at `gpa` inside the referenced vmspace.
+ */
+vm_page_t *
+vmc_hold(vm_client_t *vmc, uintptr_t gpa, int prot)
+{
+ vmspace_t *vms = vmc->vmc_space;
+ vm_page_t *vmp;
+ pfn_t pfn = PFN_INVALID;
+ uint64_t *ptep = NULL;
+
+ ASSERT0(gpa & PAGEOFFSET);
+ ASSERT((prot & (PROT_READ | PROT_WRITE)) != PROT_NONE);
+
+ vmp = kmem_alloc(sizeof (*vmp), KM_SLEEP);
+ if (vmc_activate(vmc) != 0) {
+ kmem_free(vmp, sizeof (*vmp));
+ return (NULL);
+ }
+
+ if (vmspace_lookup_map(vms, gpa, prot, &pfn, &ptep) != 0) {
+ vmc_deactivate(vmc);
+ kmem_free(vmp, sizeof (*vmp));
+ return (NULL);
+ }
+ ASSERT(pfn != PFN_INVALID && ptep != NULL);
+
+ vmp->vmp_client = vmc;
+ vmp->vmp_chain = NULL;
+ vmp->vmp_gpa = gpa;
+ vmp->vmp_pfn = pfn;
+ vmp->vmp_ptep = ptep;
+ vmp->vmp_obj_ref = NULL;
+ vmp->vmp_prot = prot;
+ list_insert_tail(&vmc->vmc_held_pages, vmp);
+ vmc_deactivate(vmc);
+
+ return (vmp);
+}
+
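+/*
+ * Fault in the page at `gpa`, establishing its GPT mapping (if necessary) with
+ * the requested protection, without taking a hold on the page.
+ */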
+int
+vmc_fault(vm_client_t *vmc, uintptr_t gpa, int prot)
+{
+ vmspace_t *vms = vmc->vmc_space;
+ int err;
+
+ err = vmc_activate(vmc);
+ if (err == 0) {
+ err = vmspace_lookup_map(vms, gpa & PAGEMASK, prot, NULL, NULL);
+ vmc_deactivate(vmc);
+ }
+
+ return (err);
+}
+
+/*
+ * Allocate an additional vm_client_t, based on an existing one. Only the
+ * association with the vmspace is cloned, not existing holds or any
+ * configured invalidation function.
+ */
+vm_client_t *
+vmc_clone(vm_client_t *vmc)
+{
+ vmspace_t *vms = vmc->vmc_space;
+
+ return (vmspace_client_alloc(vms));
+}
+
+/*
+ * Register a function (and associated data pointer) to be called when an
+ * address range in the vmspace is invalidated.
+ */
+int
+vmc_set_inval_cb(vm_client_t *vmc, vmc_inval_cb_t func, void *data)
+{
+ int err;
+
+ err = vmc_activate(vmc);
+ if (err == 0) {
+ vmc->vmc_inval_func = func;
+ vmc->vmc_inval_data = data;
+ vmc_deactivate(vmc);
+ }
+
+ return (err);
+}
+
+/*
+ * Destroy a vm_client_t instance.
+ *
+ * No pages held through this vm_client_t may be outstanding when performing a
+ * vmc_destroy(). For vCPU clients, the client cannot be on-CPU (a call to
+ * vmc_table_exit() has been made).
+ */
+void
+vmc_destroy(vm_client_t *vmc)
+{
+ mutex_enter(&vmc->vmc_lock);
+
+ VERIFY(list_is_empty(&vmc->vmc_held_pages));
+ VERIFY0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
+
+ if ((vmc->vmc_state & VCS_ORPHANED) == 0) {
+ vmspace_t *vms;
+
+ /*
+ * Deassociation with the parent vmspace must be done carefully:
+ * The vmspace could attempt to orphan this vm_client while we
+ * release vmc_lock in order to take vms_lock (the required
+ * order). The client is marked to indicate that destruction is
+ * under way. Doing so prevents any racing orphan operation
+ * from applying to this client, allowing us to deassociate from
+ * the vmspace safely.
+ */
+ vmc->vmc_state |= VCS_DESTROY;
+ vms = vmc->vmc_space;
+ mutex_exit(&vmc->vmc_lock);
+
+ mutex_enter(&vms->vms_lock);
+ mutex_enter(&vmc->vmc_lock);
+ list_remove(&vms->vms_clients, vmc);
+ /*
+ * If the vmspace began its own destruction operation while we
+ * were navigating the locks, be sure to notify it about this
+ * vm_client being deassociated.
+ */
+ cv_signal(&vms->vms_cv);
+ mutex_exit(&vmc->vmc_lock);
+ mutex_exit(&vms->vms_lock);
+ } else {
+ VERIFY3P(vmc->vmc_space, ==, NULL);
+ mutex_exit(&vmc->vmc_lock);
+ }
+
+ mutex_destroy(&vmc->vmc_lock);
+ cv_destroy(&vmc->vmc_cv);
+ list_destroy(&vmc->vmc_held_pages);
+
+ kmem_free(vmc, sizeof (*vmc));
+}
+
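+/*
+ * Translate a held page into its kernel-virtual address via the kpm segment.
+ */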
+static __inline void *
+vmp_ptr(const vm_page_t *vmp)
+{
+ ASSERT3U(vmp->vmp_pfn, !=, PFN_INVALID);
+
+ const uintptr_t paddr = (vmp->vmp_pfn << PAGESHIFT);
+ return ((void *)((uintptr_t)kpm_vbase + paddr));
+}
+
+/*
+ * Get a readable kernel-virtual pointer for a held page.
+ *
+ * Only legal to call if PROT_READ was specified in `prot` for the vmc_hold()
+ * call to acquire this page reference.
+ */
+const void *
+vmp_get_readable(const vm_page_t *vmp)
+{
+ ASSERT(vmp->vmp_prot & PROT_READ);
+
+ return (vmp_ptr(vmp));
+}
+
+/*
+ * Get a writable kernel-virtual pointer for a held page.
+ *
+ * Only legal to call if PROT_WRITE was specified in `prot` for the vmc_hold()
+ * call to acquire this page reference.
+ */
+void *
+vmp_get_writable(const vm_page_t *vmp)
+{
+ ASSERT(vmp->vmp_prot & PROT_WRITE);
+
+ return (vmp_ptr(vmp));
+}
+
+/*
+ * Get the host-physical PFN for a held page.
+ */
+pfn_t
+vmp_get_pfn(const vm_page_t *vmp)
+{
+ return (vmp->vmp_pfn);
+}
+
+/*
+ * Store a pointer to `to_chain` in the page-chaining slot of `vmp`.
+ */
+void
+vmp_chain(vm_page_t *vmp, vm_page_t *to_chain)
+{
+ ASSERT3P(vmp->vmp_chain, ==, NULL);
+
+ vmp->vmp_chain = to_chain;
+}
+
+/*
+ * Retrieve the pointer from the page-chaining in `vmp`.
+ */
+vm_page_t *
+vmp_next(const vm_page_t *vmp)
+{
+ return (vmp->vmp_chain);
+}
+
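+/*
+ * Common logic for releasing a held page: drop any direct object reference
+ * taken during an unmap, update dirty-tracking state on the leaf PTE for
+ * writable holds, and free the vm_page_t.
+ */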
+static __inline bool
+vmp_release_inner(vm_page_t *vmp, vm_client_t *vmc)
+{
+ ASSERT(MUTEX_HELD(&vmc->vmc_lock));
+
+ bool was_unmapped = false;
+
+ list_remove(&vmc->vmc_held_pages, vmp);
+ if (vmp->vmp_obj_ref != NULL) {
+ ASSERT3P(vmp->vmp_ptep, ==, NULL);
+
+ vm_object_release(vmp->vmp_obj_ref);
+ was_unmapped = true;
+ } else {
+ ASSERT3P(vmp->vmp_ptep, !=, NULL);
+
+ if ((vmp->vmp_prot & PROT_WRITE) != 0 && vmc->vmc_track_dirty) {
+ vmm_gpt_t *gpt = vmc->vmc_space->vms_gpt;
+ vmm_gpt_reset_dirty(gpt, vmp->vmp_ptep, true);
+ }
+ }
+ kmem_free(vmp, sizeof (*vmp));
+ return (was_unmapped);
+}
+
+/*
+ * Release a held page. Returns true if the page resided in a region which was
+ * subsequently unmapped.
+ */
+bool
+vmp_release(vm_page_t *vmp)
+{
+ vm_client_t *vmc = vmp->vmp_client;
+
+ VERIFY(vmc != NULL);
+
+ mutex_enter(&vmc->vmc_lock);
+ const bool was_unmapped = vmp_release_inner(vmp, vmc);
+ mutex_exit(&vmc->vmc_lock);
+ return (was_unmapped);
+}
+
+/*
+ * Release a chain of pages which were associated via vmp_chain() (setting
+ * page-chaining pointer). Returns true if any pages resided upon a region
+ * which was subsequently unmapped.
+ *
+ * All of those pages must have been held through the same vm_client_t.
+ */
+bool
+vmp_release_chain(vm_page_t *vmp)
+{
+ vm_client_t *vmc = vmp->vmp_client;
+ bool any_unmapped = false;
+
+ ASSERT(vmp != NULL);
+
+ mutex_enter(&vmc->vmc_lock);
+ while (vmp != NULL) {
+ vm_page_t *next = vmp->vmp_chain;
+
+ /* We expect all pages in chain to be from same client */
+ ASSERT3P(vmp->vmp_client, ==, vmc);
+
+ if (vmp_release_inner(vmp, vmc)) {
+ any_unmapped = true;
+ }
+ vmp = next;
+ }
+ mutex_exit(&vmc->vmc_lock);
+ return (any_unmapped);
+}
+
+
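+/*
+ * Map a span of a memory segment's backing vm_object into a userspace address
+ * space via the segvmm segment driver.
+ */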
+int
+vm_segmap_obj(struct vm *vm, int segid, off_t segoff, off_t len,
+ struct as *as, caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags)
+{
+ vm_object_t *vmo;
+ int err;
+
+ if (segoff < 0 || len <= 0 ||
+ (segoff & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0) {
+ return (EINVAL);
+ }
+ if ((prot & PROT_USER) == 0) {
+ return (ENOTSUP);
+ }
+ err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
+ if (err != 0) {
+ return (err);
+ }
+
+ VERIFY(segoff >= 0);
+ VERIFY(len <= vmo->vmo_size);
+ VERIFY((len + segoff) <= vmo->vmo_size);
+
+ if (vmo->vmo_type != VMOT_MEM) {
+ /* Only support memory objects for now */
+ return (ENOTSUP);
+ }
+
+ as_rangelock(as);
+
+ err = choose_addr(as, addrp, (size_t)len, 0, ADDR_VACALIGN, flags);
+ if (err == 0) {
+ segvmm_crargs_t svma;
+
+ svma.prot = prot;
+ svma.offset = segoff;
+ svma.vmo = vmo;
+ svma.vmc = NULL;
+
+ err = as_map(as, *addrp, (size_t)len, segvmm_create, &svma);
+ }
+
+ as_rangeunlock(as);
+ return (err);
+}
+
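+/*
+ * Map a span of guest-physical address space into a userspace address space
+ * via the segvmm segment driver, backed by a dedicated vm_client.
+ */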
+int
+vm_segmap_space(struct vm *vm, off_t off, struct as *as, caddr_t *addrp,
+ off_t len, uint_t prot, uint_t maxprot, uint_t flags)
+{
+ const uintptr_t gpa = (uintptr_t)off;
+ const size_t size = (uintptr_t)len;
+ int err;
+
+ if (off < 0 || len <= 0 ||
+ (gpa & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) {
+ return (EINVAL);
+ }
+ if ((prot & PROT_USER) == 0) {
+ return (ENOTSUP);
+ }
+
+ as_rangelock(as);
+
+ err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags);
+ if (err == 0) {
+ segvmm_crargs_t svma;
+
+ svma.prot = prot;
+ svma.offset = gpa;
+ svma.vmo = NULL;
+ svma.vmc = vmspace_client_alloc(vm_get_vmspace(vm));
+
+ err = as_map(as, *addrp, len, segvmm_create, &svma);
+ }
+
+ as_rangeunlock(as);
+ return (err);
+}
diff --git a/usr/src/uts/intel/sys/x86_archext.h b/usr/src/uts/intel/sys/x86_archext.h
index 31b63dfe69..f1241a9183 100644
--- a/usr/src/uts/intel/sys/x86_archext.h
+++ b/usr/src/uts/intel/sys/x86_archext.h
@@ -526,9 +526,21 @@ extern "C" {
#define IA32_VMX_PROCBASED2_VPID (1UL << 5)
#define MSR_IA32_VMX_EPT_VPID_CAP 0x48c
-#define IA32_VMX_EPT_VPID_INVEPT (1UL << 20)
-#define IA32_VMX_EPT_VPID_INVEPT_SINGLE (1UL << 25)
-#define IA32_VMX_EPT_VPID_INVEPT_ALL (1UL << 26)
+#define IA32_VMX_EPT_VPID_EXEC_ONLY (1UL << 0)
+#define IA32_VMX_EPT_VPID_PWL4 (1UL << 6)
+#define IA32_VMX_EPT_VPID_TYPE_UC (1UL << 8)
+#define IA32_VMX_EPT_VPID_TYPE_WB (1UL << 14)
+#define IA32_VMX_EPT_VPID_MAP_2M (1UL << 16)
+#define IA32_VMX_EPT_VPID_MAP_1G (1UL << 17)
+#define IA32_VMX_EPT_VPID_HW_AD (1UL << 21)
+#define IA32_VMX_EPT_VPID_INVEPT (1UL << 20)
+#define IA32_VMX_EPT_VPID_INVEPT_SINGLE (1UL << 25)
+#define IA32_VMX_EPT_VPID_INVEPT_ALL (1UL << 26)
+#define IA32_VMX_EPT_VPID_INVVPID (1UL << 32)
+#define IA32_VMX_EPT_VPID_INVVPID_ADDR (1UL << 40)
+#define IA32_VMX_EPT_VPID_INVVPID_SINGLE (1UL << 41)
+#define IA32_VMX_EPT_VPID_INVVPID_ALL (1UL << 42)
+#define IA32_VMX_EPT_VPID_INVVPID_RETAIN (1UL << 43)
/*
* Intel TSX Control MSRs