author    | Patrick Mooney <pmooney@pfmooney.com>   | 2021-09-05 01:38:39 +0000
committer | Patrick Mooney <pmooney@oxide.computer> | 2021-11-19 23:00:59 +0000
commit    | 0153d828c132fdb1a17c11b99386a3d1b87994cf (patch)
tree      | c670df2f1d9cfceb92709c3cb2862fdd1f97f90a
parent    | d8f839f91e21bea2f5200f95df55608cbecdeeb9 (diff)
download  | illumos-joyent-0153d828c132fdb1a17c11b99386a3d1b87994cf.tar.gz
13896 bhyve VM interfaces should be better fit
13981 bhyve emulation should set dirty bits
Reviewed by: Dan Cross <cross@oxidecomputer.com>
Reviewed by: Joshua M. Clulow <josh@sysmgr.org>
Approved by: Dan McDonald <danmcd@joyent.com>
40 files changed, 2152 insertions, 2960 deletions
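The heart of this change, visible throughout the diff below, is that the VMM backends (SVM and VMX) no longer receive a pmap_t from the generic layer. Each backend instead obtains the nested-table root via vmspace_table_root(vm_get_vmspace(vm)) at vminit time, and each vCPU brackets guest entry with vmc_table_enter()/vmc_table_exit() on its vm_client_t, comparing the returned table generation against a cached copy (nptgen on SVM, eptgen[] on VMX) to decide whether a TLB or EPT flush is needed. The following is a minimal, self-contained sketch of that generation-check pattern; the stub types and functions are stand-ins modeled on the interfaces named in the diff, not the actual kernel implementation.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the opaque per-vCPU client handle (vm_client_t). */
typedef struct vm_client {
	uint64_t tbl_gen;		/* current page-table generation */
} vm_client_t;

/* Modeled on vmc_table_enter(): activate the tables, report their gen. */
static uint64_t
vmc_table_enter(vm_client_t *vmc)
{
	return (vmc->tbl_gen);
}

/* Modeled on vmc_table_exit(): nothing to release in this sketch. */
static void
vmc_table_exit(vm_client_t *vmc)
{
	(void) vmc;
}

/* Per-vCPU cached generation, as in svm_vcpu.nptgen / vmx.eptgen[]. */
static uint64_t cached_gen;

static void
run_vcpu_once(vm_client_t *vmc)
{
	const uint64_t gen = vmc_table_enter(vmc);

	if (gen != cached_gen) {
		/* Mappings may be stale: an invept/ASID flush would go here. */
		printf("gen %llu -> %llu: flush required\n",
		    (unsigned long long)cached_gen, (unsigned long long)gen);
		cached_gen = gen;
	}

	/* ... enter the guest here ... */

	vmc_table_exit(vmc);
}

int
main(void)
{
	vm_client_t vmc = { .tbl_gen = 1 };

	run_vcpu_once(&vmc);	/* first entry: flush */
	run_vcpu_once(&vmc);	/* same generation: no flush */
	vmc.tbl_gen = 2;	/* vmspace was modified elsewhere */
	run_vcpu_once(&vmc);	/* flush again */
	return (0);
}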
diff --git a/usr/src/compat/bhyve/amd64/machine/md_var.h b/usr/src/compat/bhyve/amd64/machine/md_var.h index ed57a8bebc..ca3d68ef95 100644 --- a/usr/src/compat/bhyve/amd64/machine/md_var.h +++ b/usr/src/compat/bhyve/amd64/machine/md_var.h @@ -23,6 +23,4 @@ extern char cpu_vendor[]; /* CPU Origin code */ #include <sys/systm.h> -#define Maxmem (physmax + 1) - #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_MD_VAR_H_ */ diff --git a/usr/src/compat/bhyve/amd64/machine/pmap.h b/usr/src/compat/bhyve/amd64/machine/pmap.h deleted file mode 100644 index 3b94d1b1a9..0000000000 --- a/usr/src/compat/bhyve/amd64/machine/pmap.h +++ /dev/null @@ -1,489 +0,0 @@ -/* - * All rights reserved. This copyright notice is Copyright Management - * Information under 17 USC 1202 and is included to protect this work and - * deter copyright infringement. Removal or alteration of this Copyright - * Management Information without the express written permission from - * Pluribus Networks Inc is prohibited, and any such unauthorized removal - * or alteration will be a violation of federal law. - * - * Copyright (c) 2003 Peter Wemm. - * Copyright (c) 1991 Regents of the University of California. - * All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * the Systems Programming Group of the University of Utah Computer - * Science Department and William Jolitz of UUNET Technologies Inc. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * Derived from hp300 version by Mike Hibler, this version by William - * Jolitz uses a recursive map [a pde points to the page directory] to - * map the page tables using the pagetables themselves. This is done to - * reduce the impact on kernel virtual memory for lots of sparse address - * space, and to reduce the cost of memory to each process. - * - * from: hp300: @(#)pmap.h 7.2 (Berkeley) 12/16/90 - * from: @(#)pmap.h 7.4 (Berkeley) 5/12/91 - * $FreeBSD$ - */ - -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. 
- * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2014 Pluribus Networks Inc. - */ - - -#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_ -#define _COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_ - -/* - * Page-directory and page-table entries follow this format, with a few - * of the fields not present here and there, depending on a lot of things. - */ - /* ---- Intel Nomenclature ---- */ -#define X86_PG_V 0x001 /* P Valid */ -#define X86_PG_RW 0x002 /* R/W Read/Write */ -#define X86_PG_U 0x004 /* U/S User/Supervisor */ -#define X86_PG_NC_PWT 0x008 /* PWT Write through */ -#define X86_PG_NC_PCD 0x010 /* PCD Cache disable */ -#define X86_PG_A 0x020 /* A Accessed */ -#define X86_PG_M 0x040 /* D Dirty */ -#define X86_PG_PS 0x080 /* PS Page size (0=4k,1=2M) */ -#define X86_PG_PTE_PAT 0x080 /* PAT PAT index */ -#define X86_PG_G 0x100 /* G Global */ -#define X86_PG_AVAIL1 0x200 /* / Available for system */ -#define X86_PG_AVAIL2 0x400 /* < programmers use */ -#define X86_PG_AVAIL3 0x800 /* \ */ -#define X86_PG_PDE_PAT 0x1000 /* PAT PAT index */ -#define X86_PG_NX (1ul<<63) /* No-execute */ -#define X86_PG_AVAIL(x) (1ul << (x)) - -/* Page level cache control fields used to determine the PAT type */ -#define X86_PG_PDE_CACHE (X86_PG_PDE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD) -#define X86_PG_PTE_CACHE (X86_PG_PTE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD) - -/* - * Intel extended page table (EPT) bit definitions. - */ -#define EPT_PG_READ 0x001 /* R Read */ -#define EPT_PG_WRITE 0x002 /* W Write */ -#define EPT_PG_EXECUTE 0x004 /* X Execute */ -#define EPT_PG_IGNORE_PAT 0x040 /* IPAT Ignore PAT */ -#define EPT_PG_PS 0x080 /* PS Page size */ -#define EPT_PG_A 0x100 /* A Accessed */ -#define EPT_PG_M 0x200 /* D Dirty */ -#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) /* MT Memory Type */ - -/* - * Define the PG_xx macros in terms of the bits on x86 PTEs. - */ -#define PG_V X86_PG_V -#define PG_RW X86_PG_RW -#define PG_U X86_PG_U -#define PG_NC_PWT X86_PG_NC_PWT -#define PG_NC_PCD X86_PG_NC_PCD -#define PG_A X86_PG_A -#define PG_M X86_PG_M -#define PG_PS X86_PG_PS -#define PG_PTE_PAT X86_PG_PTE_PAT -#define PG_G X86_PG_G -#define PG_AVAIL1 X86_PG_AVAIL1 -#define PG_AVAIL2 X86_PG_AVAIL2 -#define PG_AVAIL3 X86_PG_AVAIL3 -#define PG_PDE_PAT X86_PG_PDE_PAT -#define PG_NX X86_PG_NX -#define PG_PDE_CACHE X86_PG_PDE_CACHE -#define PG_PTE_CACHE X86_PG_PTE_CACHE - -/* Our various interpretations of the above */ -#define PG_W X86_PG_AVAIL3 /* "Wired" pseudoflag */ -#define PG_MANAGED X86_PG_AVAIL2 -#define EPT_PG_EMUL_V X86_PG_AVAIL(52) -#define EPT_PG_EMUL_RW X86_PG_AVAIL(53) -#define PG_PROMOTED X86_PG_AVAIL(54) /* PDE only */ -#define PG_FRAME (0x000ffffffffff000ul) -#define PG_PS_FRAME (0x000fffffffe00000ul) - -/* - * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB - * (PTE) page mappings have identical settings for the following fields: - */ -#define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_CACHE | \ - PG_M | PG_A | PG_U | PG_RW | PG_V) - -/* - * Page Protection Exception bits - */ - -#define PGEX_P 0x01 /* Protection violation vs. 
not present */ -#define PGEX_W 0x02 /* during a Write cycle */ -#define PGEX_U 0x04 /* access from User mode (UPL) */ -#define PGEX_RSV 0x08 /* reserved PTE field is non-zero */ -#define PGEX_I 0x10 /* during an instruction fetch */ - -/* - * undef the PG_xx macros that define bits in the regular x86 PTEs that - * have a different position in nested PTEs. This is done when compiling - * code that needs to be aware of the differences between regular x86 and - * nested PTEs. - * - * The appropriate bitmask will be calculated at runtime based on the pmap - * type. - */ -#ifdef AMD64_NPT_AWARE -#undef PG_AVAIL1 /* X86_PG_AVAIL1 aliases with EPT_PG_M */ -#undef PG_G -#undef PG_A -#undef PG_M -#undef PG_PDE_PAT -#undef PG_PDE_CACHE -#undef PG_PTE_PAT -#undef PG_PTE_CACHE -#undef PG_RW -#undef PG_V -#endif - -/* - * Pte related macros. This is complicated by having to deal with - * the sign extension of the 48th bit. - */ -#define KVADDR(l4, l3, l2, l1) ( \ - ((unsigned long)-1 << 47) | \ - ((unsigned long)(l4) << PML4SHIFT) | \ - ((unsigned long)(l3) << PDPSHIFT) | \ - ((unsigned long)(l2) << PDRSHIFT) | \ - ((unsigned long)(l1) << PAGE_SHIFT)) - -#define UVADDR(l4, l3, l2, l1) ( \ - ((unsigned long)(l4) << PML4SHIFT) | \ - ((unsigned long)(l3) << PDPSHIFT) | \ - ((unsigned long)(l2) << PDRSHIFT) | \ - ((unsigned long)(l1) << PAGE_SHIFT)) - -/* - * Number of kernel PML4 slots. Can be anywhere from 1 to 64 or so, - * but setting it larger than NDMPML4E makes no sense. - * - * Each slot provides .5 TB of kernel virtual space. - */ -#define NKPML4E 4 - -#define NUPML4E (NPML4EPG/2) /* number of userland PML4 pages */ -#define NUPDPE (NUPML4E*NPDPEPG)/* number of userland PDP pages */ -#define NUPDE (NUPDPE*NPDEPG) /* number of userland PD entries */ - -/* - * NDMPML4E is the maximum number of PML4 entries that will be - * used to implement the direct map. It must be a power of two, - * and should generally exceed NKPML4E. The maximum possible - * value is 64; using 128 will make the direct map intrude into - * the recursive page table map. - */ -#define NDMPML4E 8 - -/* - * These values control the layout of virtual memory. The starting address - * of the direct map, which is controlled by DMPML4I, must be a multiple of - * its size. (See the PHYS_TO_DMAP() and DMAP_TO_PHYS() macros.) - * - * Note: KPML4I is the index of the (single) level 4 page that maps - * the KVA that holds KERNBASE, while KPML4BASE is the index of the - * first level 4 page that maps VM_MIN_KERNEL_ADDRESS. If NKPML4E - * is 1, these are the same, otherwise KPML4BASE < KPML4I and extra - * level 4 PDEs are needed to map from VM_MIN_KERNEL_ADDRESS up to - * KERNBASE. - * - * (KPML4I combines with KPDPI to choose where KERNBASE starts. - * Or, in other words, KPML4I provides bits 39..47 of KERNBASE, - * and KPDPI provides bits 30..38.) - */ -#define PML4PML4I (NPML4EPG/2) /* Index of recursive pml4 mapping */ - -#define KPML4BASE (NPML4EPG-NKPML4E) /* KVM at highest addresses */ -#define DMPML4I rounddown(KPML4BASE-NDMPML4E, NDMPML4E) /* Below KVM */ - -#define KPML4I (NPML4EPG-1) -#define KPDPI (NPDPEPG-2) /* kernbase at -2GB */ - -/* - * XXX doesn't really belong here I guess... 
- */ -#define ISA_HOLE_START 0xa0000 -#define ISA_HOLE_LENGTH (0x100000-ISA_HOLE_START) - -#define PMAP_PCID_NONE 0xffffffff -#define PMAP_PCID_KERN 0 -#define PMAP_PCID_OVERMAX 0x1000 - -#ifndef LOCORE - -#ifdef __FreeBSD__ -#include <sys/queue.h> -#include <sys/_cpuset.h> -#include <sys/_lock.h> -#include <sys/_mutex.h> - -#include <vm/_vm_radix.h> -#endif /* __FreeBSD__ */ - -typedef u_int64_t pd_entry_t; -typedef u_int64_t pt_entry_t; -typedef u_int64_t pdp_entry_t; -typedef u_int64_t pml4_entry_t; - -/* - * Address of current address space page table maps and directories. - */ -#ifdef _KERNEL -#define addr_PTmap (KVADDR(PML4PML4I, 0, 0, 0)) -#define addr_PDmap (KVADDR(PML4PML4I, PML4PML4I, 0, 0)) -#define addr_PDPmap (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, 0)) -#define addr_PML4map (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)) -#define addr_PML4pml4e (addr_PML4map + (PML4PML4I * sizeof(pml4_entry_t))) -#define PTmap ((pt_entry_t *)(addr_PTmap)) -#define PDmap ((pd_entry_t *)(addr_PDmap)) -#define PDPmap ((pd_entry_t *)(addr_PDPmap)) -#define PML4map ((pd_entry_t *)(addr_PML4map)) -#define PML4pml4e ((pd_entry_t *)(addr_PML4pml4e)) - -extern int nkpt; /* Initial number of kernel page tables */ -extern u_int64_t KPDPphys; /* physical address of kernel level 3 */ -extern u_int64_t KPML4phys; /* physical address of kernel level 4 */ - -/* - * virtual address to page table entry and - * to physical address. - * Note: these work recursively, thus vtopte of a pte will give - * the corresponding pde that in turn maps it. - */ -pt_entry_t *vtopte(vm_offset_t); -#define vtophys(va) pmap_kextract(((vm_offset_t) (va))) -#ifndef __FreeBSD__ -extern vm_paddr_t pmap_kextract(vm_offset_t); -#endif - -#define pte_load_store(ptep, pte) atomic_swap_long(ptep, pte) -#define pte_load_clear(ptep) atomic_swap_long(ptep, 0) -#define pte_store(ptep, pte) do { \ - *(u_long *)(ptep) = (u_long)(pte); \ -} while (0) -#define pte_clear(ptep) pte_store(ptep, 0) - -#define pde_store(pdep, pde) pte_store(pdep, pde) - -extern pt_entry_t pg_nx; - -#endif /* _KERNEL */ - -#ifdef __FreeBSD__ -/* - * Pmap stuff - */ -struct pv_entry; -struct pv_chunk; - -/* - * Locks - * (p) PV list lock - */ -struct md_page { - TAILQ_HEAD(, pv_entry) pv_list; /* (p) */ - int pv_gen; /* (p) */ - int pat_mode; -}; -#endif /* __FreeBSD__ */ - -enum pmap_type { - PT_X86, /* regular x86 page tables */ - PT_EPT, /* Intel's nested page tables */ - PT_RVI, /* AMD's nested page tables */ -}; - -#ifdef __FreeBSD__ -struct pmap_pcids { - uint32_t pm_pcid; - uint32_t pm_gen; -}; - -/* - * The kernel virtual address (KVA) of the level 4 page table page is always - * within the direct map (DMAP) region. 
- */ -struct pmap { - struct mtx pm_mtx; - pml4_entry_t *pm_pml4; /* KVA of level 4 page table */ - uint64_t pm_cr3; - TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */ - cpuset_t pm_active; /* active on cpus */ - enum pmap_type pm_type; /* regular or nested tables */ - struct pmap_statistics pm_stats; /* pmap statistics */ - struct vm_radix pm_root; /* spare page table pages */ - long pm_eptgen; /* EPT pmap generation id */ - int pm_flags; - struct pmap_pcids pm_pcids[MAXCPU]; -}; -#endif /* __FreeBSD__ */ - -/* flags */ -#define PMAP_NESTED_IPIMASK 0xff -#define PMAP_PDE_SUPERPAGE (1 << 8) /* supports 2MB superpages */ -#define PMAP_EMULATE_AD_BITS (1 << 9) /* needs A/D bits emulation */ -#define PMAP_SUPPORTS_EXEC_ONLY (1 << 10) /* execute only mappings ok */ - -typedef struct pmap *pmap_t; - -#ifdef _KERNEL -extern struct pmap kernel_pmap_store; -#define kernel_pmap (&kernel_pmap_store) - -#define PMAP_LOCK(pmap) mtx_lock(&(pmap)->pm_mtx) -#define PMAP_LOCK_ASSERT(pmap, type) \ - mtx_assert(&(pmap)->pm_mtx, (type)) -#define PMAP_LOCK_DESTROY(pmap) mtx_destroy(&(pmap)->pm_mtx) -#define PMAP_LOCK_INIT(pmap) mtx_init(&(pmap)->pm_mtx, "pmap", \ - NULL, MTX_DEF | MTX_DUPOK) -#define PMAP_LOCKED(pmap) mtx_owned(&(pmap)->pm_mtx) -#define PMAP_MTX(pmap) (&(pmap)->pm_mtx) -#define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx) -#define PMAP_UNLOCK(pmap) mtx_unlock(&(pmap)->pm_mtx) - -int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags); -int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype); -#endif - -#ifdef __FreeBSD__ -/* - * For each vm_page_t, there is a list of all currently valid virtual - * mappings of that page. An entry is a pv_entry_t, the list is pv_list. - */ -typedef struct pv_entry { - vm_offset_t pv_va; /* virtual address for mapping */ - TAILQ_ENTRY(pv_entry) pv_next; -} *pv_entry_t; - -/* - * pv_entries are allocated in chunks per-process. This avoids the - * need to track per-pmap assignments. 
- */ -#define _NPCM 3 -#define _NPCPV 168 -struct pv_chunk { - pmap_t pc_pmap; - TAILQ_ENTRY(pv_chunk) pc_list; - uint64_t pc_map[_NPCM]; /* bitmap; 1 = free */ - TAILQ_ENTRY(pv_chunk) pc_lru; - struct pv_entry pc_pventry[_NPCPV]; -}; - -#ifdef _KERNEL - -extern caddr_t CADDR1; -extern pt_entry_t *CMAP1; -extern vm_paddr_t phys_avail[]; -extern vm_paddr_t dump_avail[]; -extern vm_offset_t virtual_avail; -extern vm_offset_t virtual_end; -extern vm_paddr_t dmaplimit; -extern int pmap_pcid_enabled; -extern int invpcid_works; - -#define pmap_page_get_memattr(m) ((vm_memattr_t)(m)->md.pat_mode) -#define pmap_page_is_write_mapped(m) (((m)->aflags & PGA_WRITEABLE) != 0) -#define pmap_unmapbios(va, sz) pmap_unmapdev((va), (sz)) - -struct thread; - -void pmap_activate_sw(struct thread *); -void pmap_bootstrap(vm_paddr_t *); -int pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde); -int pmap_change_attr(vm_offset_t, vm_size_t, int); -void pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate); -void pmap_init_pat(void); -void pmap_kenter(vm_offset_t va, vm_paddr_t pa); -void *pmap_kenter_temporary(vm_paddr_t pa, int i); -vm_paddr_t pmap_kextract(vm_offset_t); -void pmap_kremove(vm_offset_t); -void *pmap_mapbios(vm_paddr_t, vm_size_t); -void *pmap_mapdev(vm_paddr_t, vm_size_t); -void *pmap_mapdev_attr(vm_paddr_t, vm_size_t, int); -boolean_t pmap_page_is_mapped(vm_page_t m); -void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma); -void pmap_pinit_pml4(vm_page_t); -void pmap_unmapdev(vm_offset_t, vm_size_t); -void pmap_invalidate_page(pmap_t, vm_offset_t); -void pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t); -void pmap_invalidate_all(pmap_t); -void pmap_invalidate_cache(void); -void pmap_invalidate_cache_pages(vm_page_t *pages, int count); -void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, - boolean_t force); -void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num); -boolean_t pmap_map_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); -void pmap_unmap_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); -#endif /* _KERNEL */ - -/* Return various clipped indexes for a given VA */ -static __inline vm_pindex_t -pmap_pte_index(vm_offset_t va) -{ - - return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); -} - -static __inline vm_pindex_t -pmap_pde_index(vm_offset_t va) -{ - - return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); -} - -static __inline vm_pindex_t -pmap_pdpe_index(vm_offset_t va) -{ - - return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); -} - -static __inline vm_pindex_t -pmap_pml4e_index(vm_offset_t va) -{ - - return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); -} - -#endif /* __FreeBSD__ */ -#endif /* !LOCORE */ - -#endif /* !_COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_ */ diff --git a/usr/src/compat/bhyve/amd64/machine/smp.h b/usr/src/compat/bhyve/amd64/machine/smp.h deleted file mode 100644 index 9c4f2d111b..0000000000 --- a/usr/src/compat/bhyve/amd64/machine/smp.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2013 Pluribus Networks Inc. - * Copyright 2018 Joyent, Inc. 
- */ - -#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_SMP_H_ -#define _COMPAT_FREEBSD_AMD64_MACHINE_SMP_H_ - -#ifdef _KERNEL - -/* - * APIC-related functions are replaced with native calls rather than shims - * which attempt to replicate the FreeBSD interfaces. This is empty, but will - * remain present to appease sources which wish to include the path. - */ - -#endif /* _KERNEL */ - -#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_SMP_H_ */ diff --git a/usr/src/compat/bhyve/sys/smp.h b/usr/src/compat/bhyve/sys/smp.h deleted file mode 100644 index 3d6413ce16..0000000000 --- a/usr/src/compat/bhyve/sys/smp.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2014 Pluribus Networks Inc. - * Copyright 2017 Joyent, Inc. - */ - -#ifndef _COMPAT_FREEBSD_SYS_SMP_H_ -#define _COMPAT_FREEBSD_SYS_SMP_H_ - -#include <sys/cpuset.h> - -#define IPI_AST 0 - -void ipi_cpu(int cpu, u_int ipi); - -#endif /* _COMPAT_FREEBSD_SYS_SMP_H_ */ diff --git a/usr/src/contrib/bhyve/amd64/machine/vm.h b/usr/src/contrib/bhyve/amd64/machine/vm.h deleted file mode 100644 index 885c1607ea..0000000000 --- a/usr/src/contrib/bhyve/amd64/machine/vm.h +++ /dev/null @@ -1,45 +0,0 @@ -/*- - * Copyright (c) 2009 Advanced Computing Technologies LLC - * Written by: John H. Baldwin <jhb@FreeBSD.org> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD: head/sys/amd64/include/vm.h 233671 2012-03-29 16:51:22Z jhb $ - */ - -#ifndef _MACHINE_VM_H_ -#define _MACHINE_VM_H_ - -#include <machine/specialreg.h> - -/* Memory attributes. 
*/ -#define VM_MEMATTR_UNCACHEABLE ((vm_memattr_t)PAT_UNCACHEABLE) -#define VM_MEMATTR_WRITE_COMBINING ((vm_memattr_t)PAT_WRITE_COMBINING) -#define VM_MEMATTR_WRITE_THROUGH ((vm_memattr_t)PAT_WRITE_THROUGH) -#define VM_MEMATTR_WRITE_PROTECTED ((vm_memattr_t)PAT_WRITE_PROTECTED) -#define VM_MEMATTR_WRITE_BACK ((vm_memattr_t)PAT_WRITE_BACK) -#define VM_MEMATTR_WEAK_UNCACHEABLE ((vm_memattr_t)PAT_UNCACHED) - -#define VM_MEMATTR_DEFAULT VM_MEMATTR_WRITE_BACK - -#endif /* !_MACHINE_VM_H_ */ diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files index caa660725c..9b83a780a5 100644 --- a/usr/src/uts/i86pc/Makefile.files +++ b/usr/src/uts/i86pc/Makefile.files @@ -247,7 +247,6 @@ VMM_OBJS += vmm.o \ vmm_instruction_emul.o \ vmm_ioport.o \ vmm_lapic.o \ - vmm_mem.o \ vmm_stat.o \ vmm_util.o \ x86.o \ @@ -259,7 +258,6 @@ VMM_OBJS += vmm.o \ vlapic.o \ vrtc.o \ vpmtmr.o \ - ept.o \ vmcs.o \ vmx_msr.o \ vmx.o \ @@ -268,18 +266,17 @@ VMM_OBJS += vmm.o \ vtd_sol.o \ svm.o \ svm_msr.o \ - npt.o \ vmcb.o \ svm_support.o \ amdv.o \ vmm_gpt.o \ seg_vmm.o \ vmm_reservoir.o \ - vmm_sol_vm.o \ vmm_sol_glue.o \ vmm_sol_ept.o \ vmm_sol_rvi.o \ vmm_support.o \ + vmm_vm.o \ vmm_zsd.o VIONA_OBJS += viona_main.o \ diff --git a/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c b/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c index c7b43b85ef..c381e350ed 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c +++ b/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c @@ -37,7 +37,6 @@ __FBSDID("$FreeBSD$"); #include <sys/malloc.h> #include <sys/pcpu.h> #include <sys/rman.h> -#include <sys/smp.h> #include <sys/sysctl.h> #include <dev/pci/pcivar.h> @@ -45,7 +44,6 @@ __FBSDID("$FreeBSD$"); #include <machine/resource.h> #include <machine/vmm.h> -#include <machine/pmap.h> #include <machine/vmparam.h> #include <machine/pci_cfgreg.h> diff --git a/usr/src/uts/i86pc/io/vmm/amd/npt.c b/usr/src/uts/i86pc/io/vmm/amd/npt.c deleted file mode 100644 index 6fc6825242..0000000000 --- a/usr/src/uts/i86pc/io/vmm/amd/npt.c +++ /dev/null @@ -1,77 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice unmodified, this list of conditions, and the following - * disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#include <sys/param.h> -#include <sys/kernel.h> -#include <sys/systm.h> -#include <sys/sysctl.h> - -#include <sys/vmm_vm.h> - -#include "npt.h" - -static int npt_flags; - -#define NPT_IPIMASK 0xFF - -/* - * AMD nested page table init. - */ -int -svm_npt_init(int ipinum) -{ - int enable_superpage = 1; - - npt_flags = ipinum & NPT_IPIMASK; - TUNABLE_INT_FETCH("hw.vmm.npt.enable_superpage", &enable_superpage); - if (enable_superpage) - npt_flags |= PMAP_PDE_SUPERPAGE; - - return (0); -} - -static int -npt_pinit(pmap_t pmap) -{ - return (pmap_pinit_type(pmap, PT_RVI, npt_flags)); -} - -struct vmspace * -svm_npt_alloc(vm_offset_t min, vm_offset_t max) -{ - return (vmspace_alloc(min, max, npt_pinit)); -} - -void -svm_npt_free(struct vmspace *vmspace) -{ - vmspace_free(vmspace); -} diff --git a/usr/src/uts/i86pc/io/vmm/amd/npt.h b/usr/src/uts/i86pc/io/vmm/amd/npt.h deleted file mode 100644 index 95f3fbab9e..0000000000 --- a/usr/src/uts/i86pc/io/vmm/amd/npt.h +++ /dev/null @@ -1,38 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice unmodified, this list of conditions, and the following - * disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - * $FreeBSD$ - */ - -#ifndef _SVM_NPT_H_ -#define _SVM_NPT_H_ - -int svm_npt_init(int ipinum); -struct vmspace *svm_npt_alloc(vm_offset_t min, vm_offset_t max); -void svm_npt_free(struct vmspace *vmspace); - -#endif /* _SVM_NPT_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm.c b/usr/src/uts/i86pc/io/vmm/amd/svm.c index 65fc4c3d0f..8ffc1c6557 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/svm.c +++ b/usr/src/uts/i86pc/io/vmm/amd/svm.c @@ -45,7 +45,6 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> -#include <sys/smp.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/pcpu.h> @@ -60,7 +59,6 @@ __FBSDID("$FreeBSD$"); #include <machine/md_var.h> #include <machine/reg.h> #include <machine/specialreg.h> -#include <machine/smp.h> #include <machine/vmm.h> #include <machine/vmm_dev.h> #include <sys/vmm_instruction_emul.h> @@ -79,7 +77,6 @@ __FBSDID("$FreeBSD$"); #include "svm.h" #include "svm_softc.h" #include "svm_msr.h" -#include "npt.h" SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, @@ -151,12 +148,11 @@ svm_cleanup(void) } static int -svm_init(int ipinum) +svm_init(void) { vmcb_clean &= VMCB_CACHE_DEFAULT; svm_msr_init(); - svm_npt_init(ipinum); return (0); } @@ -425,7 +421,7 @@ vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa, * Initialize a virtual machine. */ static void * -svm_vminit(struct vm *vm, pmap_t pmap) +svm_vminit(struct vm *vm) { struct svm_softc *svm_sc; struct svm_vcpu *vcpu; @@ -447,7 +443,7 @@ svm_vminit(struct vm *vm, pmap_t pmap) panic("contigmalloc of SVM IO bitmap failed"); svm_sc->vm = vm; - svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4); + svm_sc->nptp = vmspace_table_root(vm_get_vmspace(vm)); /* * Intercept read and write accesses to all MSRs. @@ -1776,23 +1772,21 @@ svm_inject_recheck(struct svm_softc *sc, int vcpu, static void -check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, uint_t thiscpu) +check_asid(struct svm_softc *sc, int vcpuid, uint_t thiscpu, uint64_t nptgen) { struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid); struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid); - long eptgen; uint8_t flush; - eptgen = pmap->pm_eptgen; flush = hma_svm_asid_update(&vcpustate->hma_asid, flush_by_asid(), - vcpustate->eptgen != eptgen); + vcpustate->nptgen != nptgen); if (flush != VMCB_TLB_FLUSH_NOTHING) { ctrl->asid = vcpustate->hma_asid.hsa_asid; svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID); } ctrl->tlb_ctrl = flush; - vcpustate->eptgen = eptgen; + vcpustate->nptgen = nptgen; } static void @@ -1810,8 +1804,8 @@ flush_asid(struct svm_softc *sc, int vcpuid) ctrl->tlb_ctrl = flush; svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID); /* - * A potential future optimization: We could choose to update the eptgen - * associated with the vCPU, since any pending eptgen change requiring a + * A potential future optimization: We could choose to update the nptgen + * associated with the vCPU, since any pending nptgen change requiring a * flush will be satisfied by the one which has just now been queued. */ } @@ -1899,7 +1893,7 @@ svm_apply_tsc_adjust(struct svm_softc *svm_sc, int vcpuid) * Start vcpu with specified RIP. 
*/ static int -svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap) +svm_vmrun(void *arg, int vcpu, uint64_t rip) { struct svm_regctx *gctx; struct svm_softc *svm_sc; @@ -1908,6 +1902,7 @@ svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap) struct vmcb_ctrl *ctrl; struct vm_exit *vmexit; struct vlapic *vlapic; + vm_client_t *vmc; struct vm *vm; uint64_t vmcb_pa; int handled; @@ -1921,6 +1916,7 @@ svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap) ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); vmexit = vm_exitinfo(vm, vcpu); vlapic = vm_lapic(vm, vcpu); + vmc = vm_get_vmclient(vm, vcpu); gctx = svm_get_guest_regctx(svm_sc, vcpu); vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa; @@ -1962,6 +1958,7 @@ svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap) do { enum event_inject_state inject_state; + uint64_t nptgen; /* * Initial event injection is complex and may involve mutex @@ -2021,14 +2018,12 @@ svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap) */ ldt_sel = sldt(); - /* Activate the nested pmap on 'curcpu' */ - CPU_SET_ATOMIC_ACQ(curcpu, &pmap->pm_active); - /* - * Check the pmap generation and the ASID generation to - * ensure that the vcpu does not use stale TLB mappings. + * Check the vmspace and ASID generations to ensure that the + * vcpu does not use stale TLB mappings. */ - check_asid(svm_sc, vcpu, pmap, curcpu); + nptgen = vmc_table_enter(vmc); + check_asid(svm_sc, vcpu, curcpu, nptgen); ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty; vcpustate->dirty = 0; @@ -2042,14 +2037,14 @@ svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap) svm_dr_leave_guest(gctx); vcpu_ustate_change(vm, vcpu, VU_EMU_KERN); - CPU_CLR_ATOMIC(curcpu, &pmap->pm_active); - /* Restore host LDTR. */ lldt(ldt_sel); /* #VMEXIT disables interrupts so re-enable them here. 
*/ enable_gintr(); + vmc_table_exit(vmc); + /* Update 'nextrip' */ vcpustate->nextrip = state->rip; @@ -2477,6 +2472,7 @@ struct vmm_ops vmm_ops_amd = { .init = svm_init, .cleanup = svm_cleanup, .resume = svm_restore, + .vminit = svm_vminit, .vmrun = svm_vmrun, .vmcleanup = svm_vmcleanup, @@ -2486,8 +2482,6 @@ struct vmm_ops vmm_ops_amd = { .vmsetdesc = svm_setdesc, .vmgetcap = svm_getcap, .vmsetcap = svm_setcap, - .vmspace_alloc = svm_npt_alloc, - .vmspace_free = svm_npt_free, .vlapic_init = svm_vlapic_init, .vlapic_cleanup = svm_vlapic_cleanup, diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h b/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h index e3ac603e71..adf9bb8ddd 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h @@ -50,7 +50,7 @@ struct svm_vcpu { uint64_t nextrip; /* next instruction to be executed by guest */ int lastcpu; /* host cpu that the vcpu last ran on */ uint32_t dirty; /* state cache bits that must be cleared */ - long eptgen; /* pmap->pm_eptgen when the vcpu last ran */ + uint64_t nptgen; /* page table gen when the vcpu last ran */ hma_svm_asid_t hma_asid; boolean_t loaded; } __aligned(PAGE_SIZE); @@ -61,7 +61,7 @@ struct svm_vcpu { struct svm_softc { uint8_t apic_page[VM_MAXCPU][PAGE_SIZE]; struct svm_vcpu vcpu[VM_MAXCPU]; - vm_offset_t nptp; /* nested page table */ + uint64_t nptp; /* nested page table (host PA) */ uint8_t *iopm_bitmap; /* shared by all vcpus */ uint8_t *msr_bitmap; /* shared by all vcpus */ struct vm *vm; diff --git a/usr/src/uts/i86pc/io/vmm/intel/ept.c b/usr/src/uts/i86pc/io/vmm/intel/ept.c deleted file mode 100644 index 49b01ebd36..0000000000 --- a/usr/src/uts/i86pc/io/vmm/intel/ept.c +++ /dev/null @@ -1,170 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (c) 2011 NetApp, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. 
A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * Copyright 2015 Pluribus Networks Inc. - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#include <sys/param.h> -#include <sys/kernel.h> -#include <sys/types.h> -#include <sys/systm.h> -#include <sys/smp.h> -#include <sys/sysctl.h> -#include <sys/hma.h> - -#include <machine/specialreg.h> -#include <machine/vmm.h> -#include <sys/vmm_vm.h> - -#include "ept.h" - -#define EPT_SUPPORTS_EXEC_ONLY(cap) ((cap) & (1UL << 0)) -#define EPT_PWL4(cap) ((cap) & (1UL << 6)) -#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14)) -#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */ -#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */ -#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20)) -#define AD_BITS_SUPPORTED(cap) ((cap) & (1UL << 21)) -#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32)) - -#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL -#define INVVPID_ALL_TYPES_SUPPORTED(cap) \ - (((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK) - -#define INVEPT_ALL_TYPES_MASK 0x6000000UL -#define INVEPT_ALL_TYPES_SUPPORTED(cap) \ - (((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK) - -#define EPT_PWLEVELS 4 /* page walk levels */ -#define EPT_ENABLE_AD_BITS (1 << 6) - -SYSCTL_DECL(_hw_vmm); -SYSCTL_NODE(_hw_vmm, OID_AUTO, ept, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, - NULL); - -static int ept_enable_ad_bits; - -static int ept_pmap_flags; - -int -ept_init(int ipinum) -{ - int use_hw_ad_bits, use_superpages, use_exec_only; - uint64_t cap; - - cap = rdmsr(MSR_VMX_EPT_VPID_CAP); - - /* - * Verify that: - * - page walk length is 4 steps - * - extended page tables can be laid out in write-back memory - * - invvpid instruction with all possible types is supported - * - invept instruction with all possible types is supported - */ - if (!EPT_PWL4(cap) || - !EPT_MEMORY_TYPE_WB(cap) || - !INVVPID_SUPPORTED(cap) || - !INVVPID_ALL_TYPES_SUPPORTED(cap) || - !INVEPT_SUPPORTED(cap) || - !INVEPT_ALL_TYPES_SUPPORTED(cap)) - return (EINVAL); - - ept_pmap_flags = ipinum & PMAP_NESTED_IPIMASK; - - use_superpages = 1; - TUNABLE_INT_FETCH("hw.vmm.ept.use_superpages", &use_superpages); - if (use_superpages && EPT_PDE_SUPERPAGE(cap)) - ept_pmap_flags |= PMAP_PDE_SUPERPAGE; /* 2MB superpage */ - - use_hw_ad_bits = 1; - TUNABLE_INT_FETCH("hw.vmm.ept.use_hw_ad_bits", &use_hw_ad_bits); - if (use_hw_ad_bits && AD_BITS_SUPPORTED(cap)) - ept_enable_ad_bits = 1; - else - ept_pmap_flags |= PMAP_EMULATE_AD_BITS; - - use_exec_only = 1; - TUNABLE_INT_FETCH("hw.vmm.ept.use_exec_only", &use_exec_only); - if (use_exec_only && EPT_SUPPORTS_EXEC_ONLY(cap)) - ept_pmap_flags |= PMAP_SUPPORTS_EXEC_ONLY; - - return (0); -} - -void -ept_invalidate_mappings(ulong_t eptp) -{ - hma_vmx_invept_allcpus((uintptr_t)eptp); -} - -static int -ept_pinit(pmap_t pmap) -{ - - return (pmap_pinit_type(pmap, PT_EPT, ept_pmap_flags)); -} - -struct vmspace * -ept_vmspace_alloc(vm_offset_t min, vm_offset_t max) -{ - - return (vmspace_alloc(min, max, ept_pinit)); -} - -void -ept_vmspace_free(struct vmspace *vmspace) -{ - - vmspace_free(vmspace); -} - -uint64_t -eptp(uint64_t pml4) -{ - uint64_t eptp_val; - - eptp_val = pml4 | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK; - if (ept_enable_ad_bits) - eptp_val |= EPT_ENABLE_AD_BITS; - - return (eptp_val); -} diff --git a/usr/src/uts/i86pc/io/vmm/intel/ept.h b/usr/src/uts/i86pc/io/vmm/intel/ept.h deleted file mode 100644 index e4a6d6c959..0000000000 --- 
a/usr/src/uts/i86pc/io/vmm/intel/ept.h +++ /dev/null @@ -1,41 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (c) 2011 NetApp, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _EPT_H_ -#define _EPT_H_ - -struct vmx; - -int ept_init(int ipinum); -void ept_invalidate_mappings(ulong_t eptp); -struct vmspace *ept_vmspace_alloc(vm_offset_t min, vm_offset_t max); -void ept_vmspace_free(struct vmspace *vmspace); -uint64_t eptp(uint64_t pml4); -#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/offsets.in b/usr/src/uts/i86pc/io/vmm/intel/offsets.in index d456693573..f467e7b1ca 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/offsets.in +++ b/usr/src/uts/i86pc/io/vmm/intel/offsets.in @@ -19,7 +19,6 @@ #include <sys/systm.h> #include <sys/cpuvar.h> -#include <machine/pmap.h> #include <machine/vmm.h> #include <sys/vmm_vm.h> @@ -43,18 +42,6 @@ vmxctx guest_r15 VMXCTX_GUEST_R15 guest_cr2 VMXCTX_GUEST_CR2 inst_fail_status VMXCTX_INST_FAIL_STATUS - pmap VMXCTX_PMAP - -vmx - eptgen VMX_EPTGEN - eptp VMX_EPTP - -pmap - pm_active PM_ACTIVE - pm_eptgen PM_EPTGEN - -cpu - cpu_id \#define VM_SUCCESS 0 \#define VM_FAIL_INVALID 1 diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.c b/usr/src/uts/i86pc/io/vmm/intel/vmx.c index c58ad471a1..533adcbbf2 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx.c +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.c @@ -48,7 +48,6 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> -#include <sys/smp.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/pcpu.h> @@ -60,13 +59,13 @@ __FBSDID("$FreeBSD$"); #include <sys/smt.h> #include <sys/hma.h> #include <sys/trap.h> +#include <sys/archsystm.h> #include <machine/psl.h> #include <machine/cpufunc.h> #include <machine/md_var.h> #include <machine/reg.h> #include <machine/segments.h> -#include <machine/smp.h> #include <machine/specialreg.h> #include <machine/vmparam.h> #include <sys/vmm_vm.h> @@ -83,7 +82,6 @@ __FBSDID("$FreeBSD$"); #include "vlapic.h" #include "vlapic_priv.h" -#include "ept.h" #include "vmcs.h" #include "vmx.h" #include "vmx_msr.h" @@ -145,6 +143,22 @@ __FBSDID("$FreeBSD$"); (VM_ENTRY_INTO_SMM | \ VM_ENTRY_DEACTIVATE_DUAL_MONITOR) +/* + * Cover the EPT capabilities used by bhyve at present: + * - 
4-level page walks + * - write-back memory type + * - INVEPT operations (all types) + * - INVVPID operations (single-context only) + */ +#define EPT_CAPS_REQUIRED \ + (IA32_VMX_EPT_VPID_PWL4 | \ + IA32_VMX_EPT_VPID_TYPE_WB | \ + IA32_VMX_EPT_VPID_INVEPT | \ + IA32_VMX_EPT_VPID_INVEPT_SINGLE | \ + IA32_VMX_EPT_VPID_INVEPT_ALL | \ + IA32_VMX_EPT_VPID_INVVPID | \ + IA32_VMX_EPT_VPID_INVVPID_SINGLE) + #define HANDLED 1 #define UNHANDLED 0 @@ -448,7 +462,7 @@ vmx_restore(void) } static int -vmx_init(int ipinum) +vmx_init(void) { int error; uint64_t fixed0, fixed1; @@ -587,11 +601,16 @@ vmx_init(int ipinum) } } - /* Initialize EPT */ - error = ept_init(ipinum); - if (error) { - printf("vmx_init: ept initialization failed (%d)\n", error); - return (error); + /* + * Check for necessary EPT capabilities + * + * TODO: Properly handle when IA32_VMX_EPT_VPID_HW_AD is missing and the + * hypervisor intends to utilize dirty page tracking. + */ + uint64_t ept_caps = rdmsr(MSR_IA32_VMX_EPT_VPID_CAP); + if ((ept_caps & EPT_CAPS_REQUIRED) != EPT_CAPS_REQUIRED) { + cmn_err(CE_WARN, "!Inadequate EPT capabilities: %lx", ept_caps); + return (EINVAL); } #ifdef __FreeBSD__ @@ -665,7 +684,7 @@ vmx_trigger_hostintr(int vector) } static void * -vmx_vminit(struct vm *vm, pmap_t pmap) +vmx_vminit(struct vm *vm) { uint16_t vpid[VM_MAXCPU]; int i, error, datasel; @@ -682,7 +701,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap) } vmx->vm = vm; - vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); + vmx->eptp = vmspace_table_root(vm_get_vmspace(vm)); /* * Clean up EPTP-tagged guest physical and combined mappings @@ -693,7 +712,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap) * * Combined mappings for this EP4TA are also invalidated for all VPIDs. */ - ept_invalidate_mappings(vmx->eptp); + hma_vmx_invept_allcpus((uintptr_t)vmx->eptp); vmx_msr_bitmap_initialize(vmx); @@ -805,8 +824,8 @@ vmx_vminit(struct vm *vm, pmap_t pmap) vmcs_write(VMCS_VPID, vpid[i]); if (guest_l1d_flush && !guest_l1d_flush_sw) { - vmcs_write(VMCS_ENTRY_MSR_LOAD, pmap_kextract( - (vm_offset_t)&msr_load_list[0])); + vmcs_write(VMCS_ENTRY_MSR_LOAD, + vtophys(&msr_load_list[0])); vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT, nitems(msr_load_list)); vmcs_write(VMCS_EXIT_MSR_STORE, 0); @@ -860,9 +879,6 @@ vmx_vminit(struct vm *vm, pmap_t pmap) vmx->state[i].nextrip = ~0; vmx->state[i].lastcpu = NOCPU; vmx->state[i].vpid = vpid[i]; - - - vmx->ctx[i].pmap = pmap; } return (vmx); @@ -929,14 +945,16 @@ invvpid(uint64_t type, struct invvpid_desc desc) * Invalidate guest mappings identified by its vpid from the TLB. */ static __inline void -vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running) +vmx_invvpid(struct vmx *vmx, int vcpu, int running) { struct vmxstate *vmxstate; struct invvpid_desc invvpid_desc; + struct vmspace *vms; vmxstate = &vmx->state[vcpu]; if (vmxstate->vpid == 0) return; + vms = vm_get_vmspace(vmx->vm); if (!running) { /* @@ -964,7 +982,7 @@ vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running) * Note also that this will invalidate mappings tagged with 'vpid' * for "all" EP4TAs. 
*/ - if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { + if (vmspace_table_gen(vms) == vmx->eptgen[curcpu]) { invvpid_desc._res1 = 0; invvpid_desc._res2 = 0; invvpid_desc.vpid = vmxstate->vpid; @@ -982,8 +1000,28 @@ vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running) } } +static __inline void +invept(uint64_t type, uint64_t eptp) +{ + int error; + struct invept_desc { + uint64_t eptp; + uint64_t _resv; + } desc = { eptp, 0 }; + + __asm __volatile("invept %[desc], %[type];" + VMX_SET_ERROR_CODE_ASM + : [error] "=r" (error) + : [desc] "m" (desc), [type] "r" (type) + : "memory"); + + if (error != 0) { + panic("invvpid error %d", error); + } +} + static void -vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) +vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) { struct vmxstate *vmxstate; @@ -1014,7 +1052,7 @@ vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); - vmx_invvpid(vmx, vcpu, pmap, 1); + vmx_invvpid(vmx, vcpu, 1); } /* @@ -1582,7 +1620,7 @@ vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual) const uint64_t diff = crval ^ old; /* Flush the TLB if the paging or write-protect bits are changing */ if ((diff & CR0_PG) != 0 || (diff & CR0_WP) != 0) { - vmx_invvpid(vmx, vcpu, vmx->ctx[vcpu].pmap, 1); + vmx_invvpid(vmx, vcpu, 1); } vmcs_write(VMCS_GUEST_CR0, crval); @@ -2558,24 +2596,18 @@ vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit) * clear NMI blocking. */ static __inline void -vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) +vmx_exit_handle_possible_nmi(struct vm_exit *vmexit) { - uint32_t intr_info; - - KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); + ASSERT(!interrupts_enabled()); - if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION) - return; - - intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); - KASSERT((intr_info & VMCS_INTR_VALID) != 0, - ("VM exit interruption info invalid: %x", intr_info)); + if (vmexit->u.vmx.exit_reason == EXIT_REASON_EXCEPTION) { + uint32_t intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); + ASSERT(intr_info & VMCS_INTR_VALID); - if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) { - KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due " - "to NMI has invalid vector: %x", intr_info)); - VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler"); - vmm_call_trap(T_NMIFLT); + if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) { + ASSERT3U(intr_info & 0xff, ==, IDT_NMI); + vmm_call_trap(T_NMIFLT); + } } } @@ -2647,7 +2679,7 @@ vmx_dr_leave_guest(struct vmxctx *vmxctx) } static int -vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap) +vmx_run(void *arg, int vcpu, uint64_t rip) { int rc, handled, launched; struct vmx *vmx; @@ -2658,6 +2690,7 @@ vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap) struct vlapic *vlapic; uint32_t exit_reason; bool tpr_shadow_active; + vm_client_t *vmc; vmx = arg; vm = vmx->vm; @@ -2665,14 +2698,12 @@ vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap) vmxctx = &vmx->ctx[vcpu]; vlapic = vm_lapic(vm, vcpu); vmexit = vm_exitinfo(vm, vcpu); + vmc = vm_get_vmclient(vm, vcpu); launched = 0; tpr_shadow_active = vmx_cap_en(vmx, VMX_CAP_TPR_SHADOW) && !vmx_cap_en(vmx, VMX_CAP_APICV) && (vmx->cap[vcpu].proc_ctls & PROCBASED_USE_TPR_SHADOW) != 0; - KASSERT(vmxctx->pmap == pmap, - ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); - 
vmx_msr_guest_enter(vmx, vcpu); vmcs_load(vmcs_pa); @@ -2691,9 +2722,10 @@ vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap) vmcs_write(VMCS_HOST_CR3, rcr3()); vmcs_write(VMCS_GUEST_RIP, rip); - vmx_set_pcpu_defaults(vmx, vcpu, pmap); + vmx_set_pcpu_defaults(vmx, vcpu); do { enum event_inject_state inject_state; + uint64_t eptgen; KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch " "%lx/%lx", __func__, vmcs_guest_rip(), rip)); @@ -2721,8 +2753,8 @@ vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap) * because interrupts are disabled. The pending interrupt will * be recognized as soon as the guest state is loaded. * - * The same reasoning applies to the IPI generated by - * pmap_invalidate_ept(). + * The same reasoning applies to the IPI generated by vmspace + * invalidation. */ disable_intr(); @@ -2804,10 +2836,28 @@ vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap) vmx_tpr_shadow_enter(vlapic); } + /* + * Indicate activation of vmspace (EPT) table just prior to VMX + * entry, checking for the necessity of an invept invalidation. + */ + eptgen = vmc_table_enter(vmc); + if (vmx->eptgen[vcpu] != eptgen) { + /* + * VMspace generate does not match what was previously + * used for this CPU so all mappings associated with + * this EPTP must be invalidated. + */ + invept(1, vmx->eptp); + vmx->eptgen[vcpu] = eptgen; + } + vmx_run_trace(vmx, vcpu); vcpu_ustate_change(vm, vcpu, VU_RUN); vmx_dr_enter_guest(vmxctx); + + /* Perform VMX entry */ rc = vmx_enter_guest(vmxctx, vmx, launched); + vmx_dr_leave_guest(vmxctx); vcpu_ustate_change(vm, vcpu, VU_EMU_KERN); @@ -2823,16 +2873,18 @@ vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap) vmexit->inst_length = vmexit_instruction_length(); vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); - /* Update 'nextrip' */ vmx->state[vcpu].nextrip = rip; if (rc == VMX_GUEST_VMEXIT) { - vmx_exit_handle_nmi(vmx, vcpu, vmexit); - enable_intr(); + vmx_exit_handle_possible_nmi(vmexit); + } + enable_intr(); + vmc_table_exit(vmc); + + if (rc == VMX_GUEST_VMEXIT) { handled = vmx_exit_process(vmx, vcpu, vmexit); } else { - enable_intr(); vmx_exit_inst_error(vmxctx, rc, vmexit); } DTRACE_PROBE3(vmm__vexit, int, vcpu, uint64_t, rip, @@ -3077,7 +3129,7 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) * XXX the processor retains global mappings when %cr3 * is updated but vmx_invvpid() does not. */ - vmx_invvpid(vmx, vcpu, vmx->ctx[vcpu].pmap, running); + vmx_invvpid(vmx, vcpu, running); break; case VMCS_INVALID_ENCODING: error = EINVAL; @@ -3647,6 +3699,7 @@ struct vmm_ops vmm_ops_intel = { .init = vmx_init, .cleanup = vmx_cleanup, .resume = vmx_restore, + .vminit = vmx_vminit, .vmrun = vmx_run, .vmcleanup = vmx_vmcleanup, @@ -3656,8 +3709,6 @@ struct vmm_ops vmm_ops_intel = { .vmsetdesc = vmx_setdesc, .vmgetcap = vmx_getcap, .vmsetcap = vmx_setcap, - .vmspace_alloc = ept_vmspace_alloc, - .vmspace_free = ept_vmspace_free, .vlapic_init = vmx_vlapic_init, .vlapic_cleanup = vmx_vlapic_cleanup, diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.h b/usr/src/uts/i86pc/io/vmm/intel/vmx.h index c0d1fdd7fb..8ca7d993f7 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx.h +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.h @@ -39,7 +39,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2018 Joyent, Inc. 
- * Copyright 2020 Oxide Computer Company + * Copyright 2021 Oxide Computer Company */ #ifndef _VMX_H_ @@ -47,8 +47,6 @@ #include "vmcs.h" -struct pmap; - struct vmxctx { uint64_t guest_rdi; /* Guest state */ uint64_t guest_rsi; @@ -82,12 +80,6 @@ struct vmxctx { int host_tf; int inst_fail_status; - - /* - * The pmap needs to be deactivated in vmx_enter_guest() - * so keep a copy of the 'pmap' in each vmxctx. - */ - struct pmap *pmap; }; struct vmxcap { @@ -151,7 +143,7 @@ struct vmx { uint64_t eptp; enum vmx_caps vmx_caps; struct vm *vm; - long eptgen[MAXCPU]; /* cached pmap->pm_eptgen */ + uint64_t eptgen[MAXCPU]; /* cached vmspace generation */ }; CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0); CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0); diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s b/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s index aba844e8c3..60f761d652 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s @@ -151,35 +151,7 @@ ENTRY_NP(vmx_enter_guest) movq %rdi, %r12 /* vmxctx */ movq %rsi, %r13 /* vmx */ movl %edx, %r14d /* launch state */ - movq VMXCTX_PMAP(%rdi), %rbx - /* Activate guest pmap on this cpu. */ - leaq PM_ACTIVE(%rbx), %rdi - movl %gs:CPU_ID, %esi - call cpuset_atomic_add - movq %r12, %rdi - - /* - * If 'vmx->eptgen[curcpu]' is not identical to 'pmap->pm_eptgen' - * then we must invalidate all mappings associated with this EPTP. - */ - movq PM_EPTGEN(%rbx), %r10 - movl %gs:CPU_ID, %eax - cmpq %r10, VMX_EPTGEN(%r13, %rax, 8) - je guest_restore - - /* Refresh 'vmx->eptgen[curcpu]' */ - movq %r10, VMX_EPTGEN(%r13, %rax, 8) - - /* Setup the invept descriptor on the host stack */ - pushq $0x0 - pushq VMX_EPTP(%r13) - movl $0x1, %eax /* Single context invalidate */ - invept (%rsp), %rax - leaq 0x10(%rsp), %rsp - jbe invept_error /* Check invept instruction error */ - -guest_restore: /* Write the current %rsp into the VMCS to be restored on vmexit */ movl $VMCS_HOST_RSP, %eax vmwrite %rsp, %rax @@ -217,9 +189,6 @@ do_launch: vmwrite_error: movl $VMX_VMWRITE_ERROR, %eax jmp decode_inst_error -invept_error: - movl $VMX_INVEPT_ERROR, %eax - jmp decode_inst_error decode_inst_error: movl $VM_FAIL_VALID, %r11d jz inst_error @@ -227,13 +196,6 @@ decode_inst_error: inst_error: movl %r11d, VMXCTX_INST_FAIL_STATUS(%rdi) - movq VMXCTX_PMAP(%rdi), %rdi - leaq PM_ACTIVE(%rdi), %rdi - movl %gs:CPU_ID, %esi - movq %rax, %r12 - call cpuset_atomic_del - movq %r12, %rax - movq VMXSTK_RBX(%rsp), %rbx movq VMXSTK_R12(%rsp), %r12 movq VMXSTK_R13(%rsp), %r13 @@ -256,12 +218,6 @@ ALTENTRY(vmx_exit_guest) /* Save guest state that is not automatically saved in the vmcs. */ VMX_GUEST_SAVE - /* Deactivate guest pmap on this cpu. */ - movq VMXCTX_PMAP(%rdi), %rdi - leaq PM_ACTIVE(%rdi), %rdi - movl %gs:CPU_ID, %esi - call cpuset_atomic_del - /* * This will return to the caller of 'vmx_enter_guest()' with a return * value of VMX_GUEST_VMEXIT. @@ -287,12 +243,6 @@ ALTENTRY(vmx_exit_guest_flush_rsb) /* Save guest state that is not automatically saved in the vmcs. */ VMX_GUEST_SAVE - /* Deactivate guest pmap on this cpu. 
*/ - movq VMXCTX_PMAP(%rdi), %rdi - leaq PM_ACTIVE(%rdi), %rdi - movl %gs:CPU_ID, %esi - call cpuset_atomic_del - VMX_GUEST_FLUSH_SCRATCH /* diff --git a/usr/src/uts/i86pc/io/vmm/intel/vtd.c b/usr/src/uts/i86pc/io/vmm/intel/vtd.c index 8784c94b48..a3773b54f0 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vtd.c +++ b/usr/src/uts/i86pc/io/vmm/intel/vtd.c @@ -254,7 +254,7 @@ vtd_wbflush(struct vtdmap *vtdmap) { if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0) - pmap_invalidate_cache(); + invalidate_cache_all(); if (VTD_CAP_RWBF(vtdmap->cap)) { vtdmap->gcr = VTD_GCR_WBF; diff --git a/usr/src/uts/i86pc/io/vmm/io/iommu.c b/usr/src/uts/i86pc/io/vmm/io/iommu.c index 3630c36680..8fec022977 100644 --- a/usr/src/uts/i86pc/io/vmm/io/iommu.c +++ b/usr/src/uts/i86pc/io/vmm/io/iommu.c @@ -48,7 +48,6 @@ __FBSDID("$FreeBSD$"); #include <sys/pci.h> #include "vmm_util.h" -#include "vmm_mem.h" #include "iommu.h" static int iommu_avail; @@ -191,6 +190,12 @@ iommu_find_device(dev_info_t *dip, void *arg) return (DDI_WALK_CONTINUE); } + +static vm_paddr_t +vmm_mem_maxaddr(void) +{ + return (ptoa(physmax + 1)); +} #endif static void diff --git a/usr/src/uts/i86pc/io/vmm/io/ppt.c b/usr/src/uts/i86pc/io/vmm/io/ppt.c index 8f3a276a93..96cc728a74 100644 --- a/usr/src/uts/i86pc/io/vmm/io/ppt.c +++ b/usr/src/uts/i86pc/io/vmm/io/ppt.c @@ -42,7 +42,6 @@ __FBSDID("$FreeBSD$"); #include <sys/module.h> #include <sys/bus.h> #include <sys/pciio.h> -#include <sys/smp.h> #include <sys/sysctl.h> #include <dev/pci/pcivar.h> diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.c b/usr/src/uts/i86pc/io/vmm/io/vlapic.c index 8198ebfce6..06ee46c8e2 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vlapic.c +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.c @@ -52,13 +52,12 @@ __FBSDID("$FreeBSD$"); #include <sys/malloc.h> #include <sys/mutex.h> #include <sys/systm.h> -#include <sys/smp.h> +#include <sys/cpuset.h> #include <x86/specialreg.h> #include <x86/apicreg.h> #include <machine/clock.h> -#include <machine/smp.h> #include <machine/vmm.h> @@ -1602,7 +1601,7 @@ vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, } void -vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum) +vlapic_post_intr(struct vlapic *vlapic, int hostcpu) { /* * Post an interrupt to the vcpu currently running on 'hostcpu'. @@ -1616,7 +1615,7 @@ vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum) if (vlapic->ops.post_intr) (*vlapic->ops.post_intr)(vlapic, hostcpu); else - ipi_cpu(hostcpu, ipinum); + poke_cpu(hostcpu); } bool diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.h b/usr/src/uts/i86pc/io/vmm/io/vlapic.h index f490eff637..a46bae9d34 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vlapic.h +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.h @@ -72,9 +72,9 @@ vcpu_notify_t vlapic_set_intr_ready(struct vlapic *vlapic, int vector, /* * Post an interrupt to the vcpu running on 'hostcpu'. This will use a * hardware assist if available (e.g. Posted Interrupt) or fall back to - * sending an 'ipinum' to interrupt the 'hostcpu'. + * sending an IPI to interrupt the 'hostcpu'. 
*/ -void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum); +void vlapic_post_intr(struct vlapic *vlapic, int hostcpu); void vlapic_fire_cmci(struct vlapic *vlapic); int vlapic_trigger_lvt(struct vlapic *vlapic, int vector); diff --git a/usr/src/uts/i86pc/io/vmm/seg_vmm.c b/usr/src/uts/i86pc/io/vmm/seg_vmm.c index 23a8da3bc5..863b283418 100644 --- a/usr/src/uts/i86pc/io/vmm/seg_vmm.c +++ b/usr/src/uts/i86pc/io/vmm/seg_vmm.c @@ -46,8 +46,9 @@ typedef struct segvmm_data { krwlock_t svmd_lock; - vm_object_t svmd_obj; - uintptr_t svmd_obj_off; + vm_object_t *svmd_vmo; + vm_client_t *svmd_vmc; + uintptr_t svmd_off; uchar_t svmd_prot; size_t svmd_softlockcnt; } segvmm_data_t; @@ -104,9 +105,41 @@ static struct seg_ops segvmm_ops = { .inherit = seg_inherit_notsup }; +/* + * Unload a region from the HAT for A/D tracking. + */ +static void +segvmm_invalidate(void *arg, uintptr_t gpa, size_t sz) +{ + struct seg *seg = arg; + segvmm_data_t *svmd = seg->s_data; + + /* + * Invalidations are only necessary (and configured) for vmspace + * mappings. Direct vm_object mappings are not involved. + */ + ASSERT3P(svmd->svmd_vmo, ==, NULL); + + /* + * The region being invalidated may overlap with all, some, or none of + * this segment. We are only concerned about that overlap. + */ + const uintptr_t start = MAX(gpa, svmd->svmd_off); + const uintptr_t end = MIN(gpa + sz, svmd->svmd_off + seg->s_size); + if (start >= end) { + return; + } + ASSERT(start >= svmd->svmd_off && end <= svmd->svmd_off + seg->s_size); + ASSERT(start >= gpa && end <= gpa + sz); + const caddr_t unload_va = seg->s_base + (start - svmd->svmd_off); + const size_t unload_sz = (end - start); + ASSERT3U(unload_sz, <=, seg->s_size); + + hat_unload(seg->s_as->a_hat, unload_va, unload_sz, HAT_UNLOAD); +} /* - * Create a kernel/user-mapped segment. ->kaddr is the segkvmm mapping. + * Create a VMM-memory-backed segment. 
*/ int segvmm_create(struct seg **segpp, void *argsp) @@ -115,17 +148,35 @@ segvmm_create(struct seg **segpp, void *argsp) segvmm_crargs_t *cra = argsp; segvmm_data_t *data; + VERIFY((cra->vmo == NULL && cra->vmc != NULL) || + (cra->vmo != NULL && cra->vmc == NULL)); + VERIFY(cra->prot & PROT_USER); + VERIFY0(cra->offset & PAGEOFFSET); + data = kmem_zalloc(sizeof (*data), KM_SLEEP); rw_init(&data->svmd_lock, NULL, RW_DEFAULT, NULL); - data->svmd_obj = cra->obj; - data->svmd_obj_off = cra->offset; - data->svmd_prot = cra->prot; - - /* Grab a hold on the VM object for the duration of this seg mapping */ - vm_object_reference(data->svmd_obj); + data->svmd_off = cra->offset; + data->svmd_prot = cra->prot & ~PROT_USER; seg->s_ops = &segvmm_ops; seg->s_data = data; + + if (cra->vmo != NULL) { + data->svmd_vmo = cra->vmo; + /* Grab a hold on the VM object for the lifetime of segment */ + vm_object_reference(data->svmd_vmo); + } else { + int err; + + data->svmd_vmc = cra->vmc; + err = vmc_set_inval_cb(data->svmd_vmc, segvmm_invalidate, seg); + if (err != 0) { + seg->s_ops = NULL; + seg->s_data = NULL; + kmem_free(data, sizeof (*data)); + return (err); + } + } return (0); } @@ -139,15 +190,34 @@ segvmm_dup(struct seg *seg, struct seg *newseg) newsvmd = kmem_zalloc(sizeof (segvmm_data_t), KM_SLEEP); rw_init(&newsvmd->svmd_lock, NULL, RW_DEFAULT, NULL); - newsvmd->svmd_obj = svmd->svmd_obj; - newsvmd->svmd_obj_off = svmd->svmd_obj_off; + newsvmd->svmd_off = svmd->svmd_off; newsvmd->svmd_prot = svmd->svmd_prot; - /* Grab another hold for the duplicate segment */ - vm_object_reference(svmd->svmd_obj); - newseg->s_ops = seg->s_ops; newseg->s_data = newsvmd; + + if (svmd->svmd_vmo != NULL) { + /* Grab another hold for the duplicate segment */ + vm_object_reference(svmd->svmd_vmo); + newsvmd->svmd_vmo = svmd->svmd_vmo; + } else { + int err; + + newsvmd->svmd_vmc = vmc_clone(svmd->svmd_vmc); + /* + * The cloned client does not inherit the invalidation + * configuration, so attempt to set it here for the new segment. + */ + err = vmc_set_inval_cb(newsvmd->svmd_vmc, segvmm_invalidate, + newseg); + if (err != 0) { + newseg->s_ops = NULL; + newseg->s_data = NULL; + kmem_free(newsvmd, sizeof (*newsvmd)); + return (err); + } + } + return (0); } @@ -169,9 +239,6 @@ segvmm_unmap(struct seg *seg, caddr_t addr, size_t len) /* Unconditionally unload the entire segment range. 
*/ hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP); - /* Release the VM object hold this segment possessed */ - vm_object_deallocate(svmd->svmd_obj); - seg_free(seg); return (0); } @@ -179,35 +246,93 @@ segvmm_unmap(struct seg *seg, caddr_t addr, size_t len) static void segvmm_free(struct seg *seg) { - segvmm_data_t *data = seg->s_data; + segvmm_data_t *svmd = seg->s_data; - ASSERT(data != NULL); + ASSERT(svmd != NULL); - rw_destroy(&data->svmd_lock); - VERIFY(data->svmd_softlockcnt == 0); - kmem_free(data, sizeof (*data)); + if (svmd->svmd_vmo != NULL) { + /* Release the VM object hold this segment possessed */ + vm_object_release(svmd->svmd_vmo); + svmd->svmd_vmo = NULL; + } else { + vmc_destroy(svmd->svmd_vmc); + svmd->svmd_vmc = NULL; + } + rw_destroy(&svmd->svmd_lock); + VERIFY(svmd->svmd_softlockcnt == 0); + kmem_free(svmd, sizeof (*svmd)); seg->s_data = NULL; } static int -segvmm_fault_in(struct hat *hat, struct seg *seg, uintptr_t va, size_t len) +segvmm_fault_obj(struct hat *hat, struct seg *seg, uintptr_t va, size_t len) { segvmm_data_t *svmd = seg->s_data; const uintptr_t end = va + len; - const uintptr_t prot = svmd->svmd_prot; + const int prot = svmd->svmd_prot; + const int uprot = prot | PROT_USER; + vm_object_t *vmo = svmd->svmd_vmo; + + ASSERT(vmo != NULL); va &= PAGEMASK; - uintptr_t off = va - (uintptr_t)seg->s_base; + uintptr_t off = va - (uintptr_t)seg->s_base + svmd->svmd_off; do { pfn_t pfn; - pfn = vm_object_pfn(svmd->svmd_obj, off); + pfn = vm_object_pfn(vmo, off); if (pfn == PFN_INVALID) { - return (-1); + return (FC_NOMAP); + } + + /* Ignore any large-page possibilities for now */ + hat_devload(hat, (caddr_t)va, PAGESIZE, pfn, uprot, HAT_LOAD); + va += PAGESIZE; + off += PAGESIZE; + } while (va < end); + + return (0); +} + +static int +segvmm_fault_space(struct hat *hat, struct seg *seg, uintptr_t va, size_t len) +{ + segvmm_data_t *svmd = seg->s_data; + const uintptr_t end = va + len; + const int prot = svmd->svmd_prot; + const int uprot = prot | PROT_USER; + vm_client_t *vmc = svmd->svmd_vmc; + + ASSERT(vmc != NULL); + + va &= PAGEMASK; + uintptr_t off = va - (uintptr_t)seg->s_base + svmd->svmd_off; + + do { + vm_page_t *vmp; + pfn_t pfn; + + vmp = vmc_hold(vmc, off, prot); + if (vmp == NULL) { + return (FC_NOMAP); } + pfn = vmp_get_pfn(vmp); + ASSERT3U(pfn, !=, PFN_INVALID); + /* Ignore any large-page possibilities for now */ - hat_devload(hat, (caddr_t)va, PAGESIZE, pfn, prot, HAT_LOAD); + hat_devload(hat, (caddr_t)va, PAGESIZE, pfn, uprot, HAT_LOAD); + + if (vmp_release(vmp)) { + /* + * Region was unmapped from vmspace while we were + * loading it into this AS. Communicate it as if it + * were a fault. 
+ */ + hat_unload(hat, (caddr_t)va, PAGESIZE, HAT_UNLOAD); + return (FC_NOMAP); + } + va += PAGESIZE; off += PAGESIZE; } while (va < end); @@ -218,7 +343,7 @@ segvmm_fault_in(struct hat *hat, struct seg *seg, uintptr_t va, size_t len) /* ARGSUSED */ static faultcode_t segvmm_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, - enum fault_type type, enum seg_rw tw) + enum fault_type type, enum seg_rw rw) { segvmm_data_t *svmd = seg->s_data; int err = 0; @@ -244,7 +369,11 @@ segvmm_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, VERIFY(type == F_INVAL || type == F_SOFTLOCK); rw_enter(&svmd->svmd_lock, RW_WRITER); - err = segvmm_fault_in(hat, seg, (uintptr_t)addr, len); + if (svmd->svmd_vmo != NULL) { + err = segvmm_fault_obj(hat, seg, (uintptr_t)addr, len); + } else { + err = segvmm_fault_space(hat, seg, (uintptr_t)addr, len); + } if (type == F_SOFTLOCK && err == 0) { size_t nval = svmd->svmd_softlockcnt + btop(len); @@ -426,8 +555,8 @@ segvmm_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) { segvmm_data_t *svmd = seg->s_data; - memidp->val[0] = (uintptr_t)svmd->svmd_obj; - memidp->val[1] = (uintptr_t)(addr - seg->s_base) + svmd->svmd_obj_off; + memidp->val[0] = (uintptr_t)svmd->svmd_vmo; + memidp->val[1] = (uintptr_t)(addr - seg->s_base) + svmd->svmd_off; return (0); } diff --git a/usr/src/uts/i86pc/io/vmm/sys/seg_vmm.h b/usr/src/uts/i86pc/io/vmm/sys/seg_vmm.h index a4f72f816e..5ba0dad5c3 100644 --- a/usr/src/uts/i86pc/io/vmm/sys/seg_vmm.h +++ b/usr/src/uts/i86pc/io/vmm/sys/seg_vmm.h @@ -21,8 +21,9 @@ typedef struct segvmm_crargs { uchar_t prot; /* protection */ - vm_object_t obj; uintptr_t offset; + vm_object_t *vmo; + vm_client_t *vmc; } segvmm_crargs_t; int segvmm_create(struct seg **, void *); diff --git a/usr/src/uts/i86pc/io/vmm/sys/vmm_gpt.h b/usr/src/uts/i86pc/io/vmm/sys/vmm_gpt.h index 554f51bbb6..a425fb53ec 100644 --- a/usr/src/uts/i86pc/io/vmm/sys/vmm_gpt.h +++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_gpt.h @@ -19,20 +19,6 @@ #include <sys/types.h> -typedef struct vmm_pt_ops vmm_pt_ops_t; -struct vmm_pt_ops { - void * (*vpo_init)(uint64_t *); - void (*vpo_free)(void *); - uint64_t (*vpo_wired_cnt)(void *); - int (*vpo_is_wired)(void *, uint64_t, uint_t *); - int (*vpo_map)(void *, uint64_t, pfn_t, uint_t, uint_t, - uint8_t); - uint64_t (*vpo_unmap)(void *, uint64_t, uint64_t); -}; - -extern struct vmm_pt_ops ept_ops; -extern struct vmm_pt_ops rvi_ops; - /* * Constants for the nodes in the GPT radix tree. Note * that, in accordance with hardware page table descriptions, @@ -64,6 +50,8 @@ enum vmm_gpt_node_level { * vpeo_reset_accessed: Resets the accessed bit on the given PTE. If the * second argument is `true`, the bit will be set, otherwise it will be * cleared. Returns non-zero if the previous value of the bit was set. + * vpeo_get_pmtp: Generate a properly formatted PML4 (EPTP/nCR3), given the root + * PFN for the GPT. 
*/ typedef struct vmm_pte_ops vmm_pte_ops_t; struct vmm_pte_ops { @@ -74,30 +62,29 @@ struct vmm_pte_ops { uint_t (*vpeo_pte_prot)(uint64_t); uint_t (*vpeo_reset_dirty)(uint64_t *, bool); uint_t (*vpeo_reset_accessed)(uint64_t *, bool); + uint64_t (*vpeo_get_pmtp)(pfn_t); }; +extern vmm_pte_ops_t ept_pte_ops; +extern vmm_pte_ops_t rvi_pte_ops; + struct vmm_gpt; typedef struct vmm_gpt vmm_gpt_t; -vmm_gpt_t *ept_create(void); -vmm_gpt_t *rvi_create(void); - vmm_gpt_t *vmm_gpt_alloc(vmm_pte_ops_t *); void vmm_gpt_free(vmm_gpt_t *); -void *vmm_gpt_root_kaddr(vmm_gpt_t *); -pfn_t vmm_gpt_root_pfn(vmm_gpt_t *); uint64_t *vmm_gpt_lookup(vmm_gpt_t *, uint64_t); void vmm_gpt_walk(vmm_gpt_t *, uint64_t, uint64_t **, enum vmm_gpt_node_level); -void vmm_gpt_populate_entry(vmm_gpt_t *, uint64_t); void vmm_gpt_populate_region(vmm_gpt_t *, uint64_t, uint64_t); +bool vmm_gpt_map_at(vmm_gpt_t *, uint64_t *, pfn_t, uint_t, uint8_t); void vmm_gpt_vacate_region(vmm_gpt_t *, uint64_t, uint64_t); bool vmm_gpt_map(vmm_gpt_t *, uint64_t, pfn_t, uint_t, uint8_t); bool vmm_gpt_unmap(vmm_gpt_t *, uint64_t); size_t vmm_gpt_unmap_region(vmm_gpt_t *, uint64_t, uint64_t); +uint64_t vmm_gpt_get_pmtp(vmm_gpt_t *); -bool vmm_gpt_is_mapped(vmm_gpt_t *, uint64_t, uint_t *); -size_t vmm_gpt_mapped_count(vmm_gpt_t *); +bool vmm_gpt_is_mapped(vmm_gpt_t *, uint64_t *, pfn_t *, uint_t *); uint_t vmm_gpt_reset_accessed(vmm_gpt_t *, uint64_t *, bool); uint_t vmm_gpt_reset_dirty(vmm_gpt_t *, uint64_t *, bool); diff --git a/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h b/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h index 3a50dafd6d..5f0ba4b875 100644 --- a/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h +++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h @@ -48,6 +48,7 @@ #include <sys/sdt.h> #include <x86/segments.h> +#include <sys/vmm.h> SDT_PROVIDER_DECLARE(vmm); @@ -61,16 +62,15 @@ struct vhpet; struct vioapic; struct vlapic; struct vmspace; +struct vm_client; struct vm_object; struct vm_guest_paging; -struct pmap; -typedef int (*vmm_init_func_t)(int ipinum); +typedef int (*vmm_init_func_t)(void); typedef int (*vmm_cleanup_func_t)(void); typedef void (*vmm_resume_func_t)(void); -typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); -typedef int (*vmi_run_func_t)(void *vmi, int vcpu, uint64_t rip, - struct pmap *pmap); +typedef void * (*vmi_init_func_t)(struct vm *vm); +typedef int (*vmi_run_func_t)(void *vmi, int vcpu, uint64_t rip); typedef void (*vmi_cleanup_func_t)(void *vmi); typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, uint64_t *retval); @@ -82,8 +82,6 @@ typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, const struct seg_desc *desc); typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); -typedef struct vmspace *(*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); -typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); typedef struct vlapic *(*vmi_vlapic_init)(void *vmi, int vcpu); typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic); typedef void (*vmi_savectx)(void *vmi, int vcpu); @@ -103,8 +101,6 @@ struct vmm_ops { vmi_set_desc_t vmsetdesc; vmi_get_cap_t vmgetcap; vmi_set_cap_t vmsetcap; - vmi_vmspace_alloc vmspace_alloc; - vmi_vmspace_free vmspace_free; vmi_vlapic_init vlapic_init; vmi_vlapic_cleanup vlapic_cleanup; @@ -148,9 +144,6 @@ int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, struct vm_object 
**objptr); vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm); -void *vm_gpa_hold(struct vm *, int vcpuid, vm_paddr_t gpa, size_t len, - int prot, void **cookie); -void vm_gpa_release(void *cookie); bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa); int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); @@ -261,6 +254,7 @@ void *vcpu_stats(struct vm *vm, int vcpu); void vcpu_notify_event(struct vm *vm, int vcpuid); void vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t); struct vmspace *vm_get_vmspace(struct vm *vm); +struct vm_client *vm_get_vmclient(struct vm *vm, int vcpuid); struct vatpic *vm_atpic(struct vm *vm); struct vatpit *vm_atpit(struct vm *vm); struct vpmtmr *vm_pmtmr(struct vm *vm); @@ -312,6 +306,7 @@ enum vm_reg_name vm_segment_name(int seg_encoding); struct vm_copyinfo { uint64_t gpa; size_t len; + int prot; void *hva; void *cookie; }; @@ -332,9 +327,9 @@ struct vm_copyinfo { */ int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, - int num_copyinfo, int *is_fault); + uint_t num_copyinfo, int *is_fault); void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, - int num_copyinfo); + uint_t num_copyinfo); void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, size_t len); void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, diff --git a/usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h b/usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h index 76d5fec8b7..a01b909ff6 100644 --- a/usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h +++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h @@ -18,40 +18,64 @@ #ifndef _VMM_VM_H #define _VMM_VM_H -#include <sys/list.h> #include <sys/types.h> -#include <vm/hat_pte.h> -#include <machine/pmap.h> -/* - * vm_map_wire and vm_map_unwire option flags - */ -#define VM_MAP_WIRE_SYSTEM 0 /* wiring in a kernel map */ -#define VM_MAP_WIRE_USER 1 /* wiring in a user map */ - -#define VM_MAP_WIRE_NOHOLES 0 /* region must not have holes */ -#define VM_MAP_WIRE_HOLESOK 2 /* region may have holes */ - -#define VM_MAP_WIRE_WRITE 4 /* Validate writable. */ - -/* - * The following "find_space" options are supported by vm_map_find(). - * - * For VMFS_ALIGNED_SPACE, the desired alignment is specified to - * the macro argument as log base 2 of the desired alignment. 
- */ -#define VMFS_NO_SPACE 0 /* don't find; use the given range */ -#define VMFS_ANY_SPACE 1 /* find range with any alignment */ -#define VMFS_OPTIMAL_SPACE 2 /* find range with optimal alignment */ -#define VMFS_SUPER_SPACE 3 /* find superpage-aligned range */ -#define VMFS_ALIGNED_SPACE(x) ((x) << 8) /* find range with fixed alignment */ +typedef struct vmspace vmspace_t; +typedef struct vm_client vm_client_t; +typedef struct vm_page vm_page_t; +typedef struct vm_object vm_object_t; + +struct vmm_pte_ops; + +typedef void (*vmc_inval_cb_t)(void *, uintptr_t, size_t); + +/* vmspace_t operations */ +vmspace_t *vmspace_alloc(size_t, struct vmm_pte_ops *, bool); +void vmspace_destroy(vmspace_t *); +int vmspace_map(vmspace_t *, vm_object_t *, uintptr_t, uintptr_t, size_t, + uint8_t); +int vmspace_unmap(vmspace_t *, uintptr_t, uintptr_t); +int vmspace_populate(vmspace_t *, uintptr_t, uintptr_t); +vm_client_t *vmspace_client_alloc(vmspace_t *); +uint64_t vmspace_table_root(vmspace_t *); +uint64_t vmspace_table_gen(vmspace_t *); +uint64_t vmspace_resident_count(vmspace_t *); + +/* vm_client_t operations */ +vm_page_t *vmc_hold(vm_client_t *, uintptr_t, int); +uint64_t vmc_table_enter(vm_client_t *); +void vmc_table_exit(vm_client_t *); +int vmc_fault(vm_client_t *, uintptr_t, int); +vm_client_t *vmc_clone(vm_client_t *); +int vmc_set_inval_cb(vm_client_t *, vmc_inval_cb_t, void *); +void vmc_destroy(vm_client_t *); + +/* vm_object_t operations */ +vm_object_t *vm_object_mem_allocate(size_t, bool); +vm_object_t *vmm_mmio_alloc(vmspace_t *, uintptr_t, size_t, uintptr_t); +void vm_object_reference(vm_object_t *); +void vm_object_release(vm_object_t *); +pfn_t vm_object_pfn(vm_object_t *, uintptr_t); + +/* vm_page_t operations */ +const void *vmp_get_readable(const vm_page_t *); +void *vmp_get_writable(const vm_page_t *); +pfn_t vmp_get_pfn(const vm_page_t *); +void vmp_chain(vm_page_t *, vm_page_t *); +vm_page_t *vmp_next(const vm_page_t *); +bool vmp_release(vm_page_t *); +bool vmp_release_chain(vm_page_t *); + +/* seg_vmm mapping */ +struct vm; +int vm_segmap_obj(struct vm *, int, off_t, off_t, struct as *, caddr_t *, + uint_t, uint_t, uint_t); +int vm_segmap_space(struct vm *, off_t, struct as *, caddr_t *, off_t, uint_t, + uint_t, uint_t); -/* - * vm_fault option flags - */ -#define VM_FAULT_NORMAL 0 /* Nothing special */ -#define VM_FAULT_WIRE 1 /* Wire the mapped page */ -#define VM_FAULT_DIRTY 2 /* Dirty the page; use w/PROT_COPY */ +/* Glue functions */ +vm_paddr_t vtophys(void *); +void invalidate_cache_all(void); /* * The VM_MAXUSER_ADDRESS determines the upper size limit of a vmspace. @@ -61,131 +85,4 @@ */ #define VM_MAXUSER_ADDRESS 0x00003ffffffffffful -/* - * Type definitions used in the hypervisor. - */ -typedef uchar_t vm_prot_t; - -/* New type declarations. 
*/ -struct vm; -struct vmspace; -struct pmap; - -struct vm_object; -typedef struct vm_object *vm_object_t; - -struct vmm_pt_ops; - -struct vm_page; -typedef struct vm_page *vm_page_t; - -enum obj_type { OBJT_DEFAULT, OBJT_SWAP, OBJT_VNODE, OBJT_DEVICE, OBJT_PHYS, - OBJT_DEAD, OBJT_SG, OBJT_MGTDEVICE }; -typedef uchar_t objtype_t; - -union vm_map_object; -typedef union vm_map_object vm_map_object_t; - -struct vm_map_entry; -typedef struct vm_map_entry *vm_map_entry_t; - -struct vm_map; -typedef struct vm_map *vm_map_t; - -pmap_t vmspace_pmap(struct vmspace *); - -int vm_map_find(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, - vm_offset_t, int, vm_prot_t, vm_prot_t, int); -int vm_map_remove(vm_map_t, vm_offset_t, vm_offset_t); -int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags); - -long vmspace_resident_count(struct vmspace *vmspace); - -void pmap_invalidate_cache(void); -void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num); -int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype); -long pmap_wired_count(pmap_t pmap); - -struct vm_map { - struct vmspace *vmm_space; -}; - -struct pmap { - void *pm_pml4; - cpuset_t pm_active; - long pm_eptgen; - - /* Implementation private */ - enum pmap_type pm_type; - struct vmm_pt_ops *pm_ops; - void *pm_impl; -}; - -struct vmspace { - struct vm_map vm_map; - - /* Implementation private */ - kmutex_t vms_lock; - boolean_t vms_map_changing; - struct pmap vms_pmap; - uintptr_t vms_size; /* fixed after creation */ - - list_t vms_maplist; -}; - -typedef pfn_t (*vm_pager_fn_t)(vm_object_t, uintptr_t, pfn_t *, uint_t *); - -struct vm_object { - uint_t vmo_refcnt; /* manipulated with atomic ops */ - - /* This group of fields are fixed at creation time */ - objtype_t vmo_type; - size_t vmo_size; - vm_pager_fn_t vmo_pager; - void *vmo_data; - - kmutex_t vmo_lock; /* protects fields below */ - vm_memattr_t vmo_attr; -}; - -struct vm_page { - kmutex_t vmp_lock; - pfn_t vmp_pfn; - struct vm_object *vmp_obj_held; -}; - -/* illumos-specific functions for setup and operation */ -int vm_segmap_obj(vm_object_t, off_t, size_t, struct as *, caddr_t *, uint_t, - uint_t, uint_t); -int vm_segmap_space(struct vmspace *, off_t, struct as *, caddr_t *, off_t, - uint_t, uint_t, uint_t); -void *vmspace_find_kva(struct vmspace *, uintptr_t, size_t); - -typedef int (*pmap_pinit_t)(struct pmap *pmap); - -struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t, pmap_pinit_t); -void vmspace_free(struct vmspace *); - -int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int); -int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len, - vm_prot_t prot, vm_page_t *ma, int max_count); - -struct vm_object *vm_object_allocate(objtype_t, vm_pindex_t, bool); -void vm_object_deallocate(vm_object_t); -void vm_object_reference(vm_object_t); -int vm_object_set_memattr(vm_object_t, vm_memattr_t); -pfn_t vm_object_pfn(vm_object_t, uintptr_t); - -#define VM_OBJECT_WLOCK(vmo) mutex_enter(&(vmo)->vmo_lock) -#define VM_OBJECT_WUNLOCK(vmo) mutex_exit(&(vmo)->vmo_lock) - -#define PQ_ACTIVE 1 - -void vm_page_unwire(vm_page_t, uint8_t); - -#define VM_PAGE_TO_PHYS(page) (mmu_ptob((uintptr_t)((page)->vmp_pfn))) - -vm_object_t vm_pager_allocate(objtype_t, void *, vm_ooffset_t, vm_prot_t, - vm_ooffset_t, void *); - #endif /* _VMM_VM_H */ diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c index f95e415e40..998e483ecf 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm.c +++ b/usr/src/uts/i86pc/io/vmm/vmm.c 
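The vmspace/vm_client/vm_page declarations above replace the FreeBSD-derived pmap and vm_map compatibility shims deleted here. As a rough sketch of the intended access pattern (illustrative only, not part of this change; it assumes the value being read does not straddle a page boundary):

#include <sys/types.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/mman.h>
#include <sys/vmm_vm.h>

/*
 * Illustrative only: read a 32-bit value at guest-physical address 'gpa'
 * through a vm_client_t.
 */
static int
read_guest_u32(vm_client_t *vmc, uintptr_t gpa, uint32_t *valp)
{
	vm_page_t *vmp;
	const uint8_t *va;

	/* Hold the backing page; this may fault it into the vmspace */
	vmp = vmc_hold(vmc, gpa & PAGEMASK, PROT_READ);
	if (vmp == NULL)
		return (EFAULT);

	/* Access the page mapping only while the hold is in place */
	va = vmp_get_readable(vmp);
	*valp = *(const uint32_t *)(va + (gpa & PAGEOFFSET));

	/* Dropping the hold lets the mapping be invalidated again */
	(void) vmp_release(vmp);
	return (0);
}
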
@@ -58,12 +58,10 @@ __FBSDID("$FreeBSD$"); #include <sys/proc.h> #include <sys/rwlock.h> #include <sys/sched.h> -#include <sys/smp.h> #include <sys/systm.h> #include <sys/sunddi.h> #include <machine/pcb.h> -#include <machine/smp.h> #include <machine/md_var.h> #include <x86/psl.h> #include <x86/apicreg.h> @@ -74,11 +72,11 @@ __FBSDID("$FreeBSD$"); #include <machine/vmparam.h> #include <sys/vmm_instruction_emul.h> #include <sys/vmm_vm.h> +#include <sys/vmm_gpt.h> #include "vmm_ioport.h" #include "vmm_ktr.h" #include "vmm_host.h" -#include "vmm_mem.h" #include "vmm_util.h" #include "vatpic.h" #include "vatpit.h" @@ -129,6 +127,7 @@ struct vcpu { struct vm_exit exitinfo; /* (x) exit reason and collateral */ uint64_t nextrip; /* (x) next instruction to execute */ struct vie *vie_ctx; /* (x) instruction emulation context */ + vm_client_t *vmclient; /* (a) VM-system client */ uint64_t tsc_offset; /* (x) offset from host TSC */ enum vcpu_ustate ustate; /* (i) microstate for the vcpu */ @@ -145,7 +144,7 @@ struct vcpu { struct mem_seg { size_t len; bool sysmem; - struct vm_object *object; + vm_object_t *object; }; #define VM_MAX_MEMSEGS 4 @@ -219,8 +218,6 @@ static struct vmm_ops vmm_ops_null = { .vmsetdesc = (vmi_set_desc_t)nullop_panic, .vmgetcap = (vmi_get_cap_t)nullop_panic, .vmsetcap = (vmi_set_cap_t)nullop_panic, - .vmspace_alloc = (vmi_vmspace_alloc)nullop_panic, - .vmspace_free = (vmi_vmspace_free)nullop_panic, .vlapic_init = (vmi_vlapic_init)nullop_panic, .vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic, .vmsavectx = (vmi_savectx)nullop_panic, @@ -228,17 +225,15 @@ static struct vmm_ops vmm_ops_null = { }; static struct vmm_ops *ops = &vmm_ops_null; +static vmm_pte_ops_t *pte_ops = NULL; -#define VMM_INIT(num) ((*ops->init)(num)) +#define VMM_INIT() ((*ops->init)()) #define VMM_CLEANUP() ((*ops->cleanup)()) #define VMM_RESUME() ((*ops->resume)()) -#define VMINIT(vm, pmap) ((*ops->vminit)(vm, pmap)) -#define VMRUN(vmi, vcpu, rip, pmap) \ - ((*ops->vmrun)(vmi, vcpu, rip, pmap)) +#define VMINIT(vm) ((*ops->vminit)(vm)) +#define VMRUN(vmi, vcpu, rip) ((*ops->vmrun)(vmi, vcpu, rip)) #define VMCLEANUP(vmi) ((*ops->vmcleanup)(vmi)) -#define VMSPACE_ALLOC(min, max) ((*ops->vmspace_alloc)(min, max)) -#define VMSPACE_FREE(vmspace) ((*ops->vmspace_free)(vmspace)) #define VMGETREG(vmi, vcpu, num, rv) ((*ops->vmgetreg)(vmi, vcpu, num, rv)) #define VMSETREG(vmi, vcpu, num, val) ((*ops->vmsetreg)(vmi, vcpu, num, val)) @@ -265,9 +260,6 @@ SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, */ static int halt_detection_enabled = 1; -/* IPI vector used for vcpu notifications */ -static int vmm_ipinum; - /* Trap into hypervisor on all guest exceptions and reflect them back */ static int trace_guest_exceptions; @@ -319,6 +311,8 @@ vcpu_cleanup(struct vm *vm, int i, bool destroy) fpu_save_area_free(vcpu->guestfpu); vie_free(vcpu->vie_ctx); vcpu->vie_ctx = NULL; + vmc_destroy(vcpu->vmclient); + vcpu->vmclient = NULL; } } @@ -397,25 +391,19 @@ vm_vie_ctx(struct vm *vm, int cpuid) static int vmm_init(void) { - int error; - vmm_host_state_init(); - /* We use cpu_poke() for IPIs */ - vmm_ipinum = 0; - - error = vmm_mem_init(); - if (error) - return (error); - - if (vmm_is_intel()) + if (vmm_is_intel()) { ops = &vmm_ops_intel; - else if (vmm_is_svm()) + pte_ops = &ept_pte_ops; + } else if (vmm_is_svm()) { ops = &vmm_ops_amd; - else + pte_ops = &rvi_pte_ops; + } else { return (ENXIO); + } - return (VMM_INIT(vmm_ipinum)); + return (VMM_INIT()); } int @@ -453,7 +441,7 @@ vm_init(struct vm *vm, bool 
create) { int i; - vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace)); + vm->cookie = VMINIT(vm); vm->iommu = NULL; vm->vioapic = vioapic_init(vm); vm->vhpet = vhpet_init(vm); @@ -492,6 +480,12 @@ vm_init(struct vm *vm, bool create) uint_t cores_per_package = 1; uint_t threads_per_core = 1; +/* + * Debugging tunable to enable dirty-page-tracking. + * (Remains off by default for now) + */ +bool gpt_track_dirty = false; + int vm_create(const char *name, uint64_t flags, struct vm **retvm) { @@ -508,14 +502,18 @@ vm_create(const char *name, uint64_t flags, struct vm **retvm) /* Name validation has already occurred */ VERIFY3U(strnlen(name, VM_MAX_NAMELEN), <, VM_MAX_NAMELEN); - vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS); + vmspace = vmspace_alloc(VM_MAXUSER_ADDRESS, pte_ops, gpt_track_dirty); if (vmspace == NULL) return (ENOMEM); vm = malloc(sizeof (struct vm), M_VM, M_WAITOK | M_ZERO); strcpy(vm->name, name); + vm->vmspace = vmspace; vm->mem_transient = (flags & VCF_RESERVOIR_MEM) == 0; + for (uint_t i = 0; i < VM_MAXCPU; i++) { + vm->vcpu[i].vmclient = vmspace_client_alloc(vmspace); + } vm->sockets = 1; vm->cores = cores_per_package; /* XXX backwards compatibility */ @@ -621,7 +619,7 @@ vm_cleanup(struct vm *vm, bool destroy) for (i = 0; i < VM_MAX_MEMSEGS; i++) vm_free_memseg(vm, i); - VMSPACE_FREE(vm->vmspace); + vmspace_destroy(vm->vmspace); vm->vmspace = NULL; } } @@ -681,7 +679,7 @@ vm_name(struct vm *vm) int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { - vm_object_t obj; + vm_object_t *obj; if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) return (ENOMEM); @@ -692,7 +690,7 @@ vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) { - return (vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len)); + return (vmspace_unmap(vm->vmspace, gpa, gpa + len)); } /* @@ -730,7 +728,7 @@ int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) { struct mem_seg *seg; - vm_object_t obj; + vm_object_t *obj; if (ident < 0 || ident >= VM_MAX_MEMSEGS) return (EINVAL); @@ -746,8 +744,7 @@ vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) return (EINVAL); } - obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT, - vm->mem_transient); + obj = vm_object_mem_allocate(len, vm->mem_transient); if (obj == NULL) return (ENOMEM); @@ -759,7 +756,7 @@ vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, - vm_object_t *objptr) + vm_object_t **objptr) { struct mem_seg *seg; @@ -786,7 +783,7 @@ vm_free_memseg(struct vm *vm, int ident) seg = &vm->mem_segs[ident]; if (seg->object != NULL) { - vm_object_deallocate(seg->object); + vm_object_release(seg->object); bzero(seg, sizeof (struct mem_seg)); } } @@ -832,18 +829,16 @@ vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, if (map == NULL) return (ENOSPC); - error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa, - len, 0, VMFS_NO_SPACE, prot, prot, 0); + error = vmspace_map(vm->vmspace, seg->object, first, gpa, len, prot); if (error != 0) return (EFAULT); vm_object_reference(seg->object); if ((flags & VM_MEMMAP_F_WIRED) != 0) { - error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len, - VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); + error = vmspace_populate(vm->vmspace, gpa, gpa + len); if (error != 0) { - vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len); + vmspace_unmap(vm->vmspace, gpa, gpa + 
len); return (EFAULT); } } @@ -917,9 +912,9 @@ vm_free_memmap(struct vm *vm, int ident) mm = &vm->mem_maps[ident]; if (mm->len) { - error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa, + error = vmspace_unmap(vm->vmspace, mm->gpa, mm->gpa + mm->len); - KASSERT(error == 0, ("%s: vm_map_remove error %d", + KASSERT(error == 0, ("%s: vmspace_unmap error %d", __func__, error)); bzero(mm, sizeof (struct mem_map)); } @@ -961,12 +956,14 @@ vm_iommu_modify(struct vm *vm, bool map) struct mem_map *mm; #ifdef __FreeBSD__ void *vp, *cookie, *host_domain; -#else - void *vp, *cookie, *host_domain __unused; #endif + vm_client_t *vmc; sz = PAGE_SIZE; +#ifdef __FreeBSD__ host_domain = iommu_host_domain(); +#endif + vmc = vmspace_client_alloc(vm->vmspace); for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; @@ -991,14 +988,13 @@ vm_iommu_modify(struct vm *vm, bool map) gpa = mm->gpa; while (gpa < mm->gpa + mm->len) { - vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, PROT_WRITE, - &cookie); - KASSERT(vp != NULL, ("vm(%s) could not map gpa %lx", - vm_name(vm), gpa)); + vm_page_t *vmp; - vm_gpa_release(cookie); + vmp = vmc_hold(vmc, gpa, PROT_WRITE); + ASSERT(vmp != NULL); + hpa = ((uintptr_t)vmp_get_pfn(vmp) << PAGESHIFT); + vmp_release(vmp); - hpa = DMAP_TO_PHYS((uintptr_t)vp); if (map) { iommu_create_mapping(vm->iommu, gpa, hpa, sz); #ifdef __FreeBSD__ @@ -1014,6 +1010,7 @@ vm_iommu_modify(struct vm *vm, bool map) gpa += PAGE_SIZE; } } + vmc_destroy(vmc); /* * Invalidate the cached translations associated with the domain @@ -1029,9 +1026,6 @@ vm_iommu_modify(struct vm *vm, bool map) #endif } -#define vm_iommu_unmap(vm) vm_iommu_modify((vm), false) -#define vm_iommu_map(vm) vm_iommu_modify((vm), true) - int vm_unassign_pptdev(struct vm *vm, int pptfd) { @@ -1042,7 +1036,7 @@ vm_unassign_pptdev(struct vm *vm, int pptfd) return (error); if (ppt_assigned_devices(vm) == 0) - vm_iommu_unmap(vm); + vm_iommu_modify(vm, false); return (0); } @@ -1061,71 +1055,13 @@ vm_assign_pptdev(struct vm *vm, int pptfd) vm->iommu = iommu_create_domain(maxaddr); if (vm->iommu == NULL) return (ENXIO); - vm_iommu_map(vm); + vm_iommu_modify(vm, true); } error = ppt_assign_device(vm, pptfd); return (error); } -void * -vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot, - void **cookie) -{ - int i, count, pageoff; - struct mem_map *mm; - vm_page_t m; -#ifdef INVARIANTS - /* - * All vcpus are frozen by ioctls that modify the memory map - * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is - * guaranteed if at least one vcpu is in the VCPU_FROZEN state. 
- */ - int state; - KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d", - __func__, vcpuid)); - for (i = 0; i < vm->maxcpus; i++) { - if (vcpuid != -1 && vcpuid != i) - continue; - state = vcpu_get_state(vm, i, NULL); - KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d", - __func__, state)); - } -#endif - pageoff = gpa & PAGE_MASK; - if (len > PAGE_SIZE - pageoff) - panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); - - count = 0; - for (i = 0; i < VM_MAX_MEMMAPS; i++) { - mm = &vm->mem_maps[i]; - if (mm->len == 0) { - continue; - } - if (gpa >= mm->gpa && gpa < mm->gpa + mm->len) { - count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, - trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); - break; - } - } - - if (count == 1) { - *cookie = m; - return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff)); - } else { - *cookie = NULL; - return (NULL); - } -} - -void -vm_gpa_release(void *cookie) -{ - vm_page_t m = cookie; - - vm_page_unwire(m, PQ_ACTIVE); -} - int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) { @@ -1478,13 +1414,10 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled) static int vm_handle_paging(struct vm *vm, int vcpuid) { + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + vm_client_t *vmc = vcpu->vmclient; + struct vm_exit *vme = &vcpu->exitinfo; int rv, ftype; - struct vm_map *map; - struct vcpu *vcpu; - struct vm_exit *vme; - - vcpu = &vm->vcpu[vcpuid]; - vme = &vcpu->exitinfo; KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", __func__, vme->inst_length)); @@ -1494,26 +1427,13 @@ vm_handle_paging(struct vm *vm, int vcpuid) ftype == PROT_WRITE || ftype == PROT_EXEC, ("vm_handle_paging: invalid fault_type %d", ftype)); - if (ftype == PROT_READ || ftype == PROT_WRITE) { - rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), - vme->u.paging.gpa, ftype); - if (rv == 0) { - VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %lx", - ftype == PROT_READ ? 
"accessed" : "dirty", - vme->u.paging.gpa); - goto done; - } - } - - map = &vm->vmspace->vm_map; - rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL); + rv = vmc_fault(vmc, vme->u.paging.gpa, ftype); VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %lx, " "ftype = %d", rv, vme->u.paging.gpa, ftype); if (rv != 0) return (EFAULT); -done: return (0); } @@ -2221,7 +2141,6 @@ vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry) struct vcpu *vcpu; struct vm_exit *vme; bool intr_disabled; - pmap_t pmap; vm_thread_ctx_t vtc; int affinity_type = CPU_CURRENT; @@ -2230,7 +2149,6 @@ vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry) if (!CPU_ISSET(vcpuid, &vm->active_cpus)) return (EINVAL); - pmap = vmspace_pmap(vm->vmspace); vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; @@ -2266,9 +2184,6 @@ restart: affinity_type = CPU_CURRENT; critical_enter(); - KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), - ("vm_run: absurd pm_active")); - /* Force a trip through update_sregs to reload %fs/%gs and friends */ PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb); @@ -2279,7 +2194,7 @@ restart: vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL; vcpu_require_state(vm, vcpuid, VCPU_RUNNING); - error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap); + error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip); vcpu_require_state(vm, vcpuid, VCPU_FROZEN); /* @@ -3355,10 +3270,9 @@ vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype) KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); if (hostcpu != curcpu) { if (ntype == VCPU_NOTIFY_APIC) { - vlapic_post_intr(vcpu->vlapic, hostcpu, - vmm_ipinum); + vlapic_post_intr(vcpu->vlapic, hostcpu); } else { - ipi_cpu(hostcpu, vmm_ipinum); + poke_cpu(hostcpu); } } else { /* @@ -3427,6 +3341,12 @@ vm_get_vmspace(struct vm *vm) return (vm->vmspace); } +struct vm_client * +vm_get_vmclient(struct vm *vm, int vcpuid) +{ + return (vm->vcpu[vcpuid].vmclient); +} + int vm_apicid2vcpuid(struct vm *vm, int apicid) { @@ -3481,13 +3401,12 @@ vm_segment_name(int seg) void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, - int num_copyinfo) + uint_t num_copyinfo) { - int idx; - - for (idx = 0; idx < num_copyinfo; idx++) { - if (copyinfo[idx].cookie != NULL) - vm_gpa_release(copyinfo[idx].cookie); + for (uint_t idx = 0; idx < num_copyinfo; idx++) { + if (copyinfo[idx].cookie != NULL) { + vmp_release((vm_page_t *)copyinfo[idx].cookie); + } } bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo)); } @@ -3495,24 +3414,26 @@ vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, - int num_copyinfo, int *fault) + uint_t num_copyinfo, int *fault) { - int error, idx, nused; + uint_t idx, nused; size_t n, off, remaining; - void *hva, *cookie; - uint64_t gpa; + vm_client_t *vmc = vm_get_vmclient(vm, vcpuid); bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo); nused = 0; remaining = len; while (remaining > 0) { + uint64_t gpa; + int error; + KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault); if (error || *fault) return (error); - off = gpa & PAGE_MASK; - n = min(remaining, PAGE_SIZE - off); + off = gpa & PAGEOFFSET; + n = min(remaining, PAGESIZE - off); copyinfo[nused].gpa = gpa; copyinfo[nused].len = n; remaining -= n; @@ -3521,12 +3442,21 @@ vm_copy_setup(struct vm *vm, int vcpuid, struct 
vm_guest_paging *paging, } for (idx = 0; idx < nused; idx++) { - hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa, - copyinfo[idx].len, prot, &cookie); - if (hva == NULL) + vm_page_t *vmp; + caddr_t hva; + + vmp = vmc_hold(vmc, copyinfo[idx].gpa & PAGEMASK, prot); + if (vmp == NULL) { break; - copyinfo[idx].hva = hva; - copyinfo[idx].cookie = cookie; + } + if ((prot & PROT_WRITE) != 0) { + hva = (caddr_t)vmp_get_writable(vmp); + } else { + hva = (caddr_t)vmp_get_readable(vmp); + } + copyinfo[idx].hva = hva + (copyinfo[idx].gpa & PAGEOFFSET); + copyinfo[idx].cookie = vmp; + copyinfo[idx].prot = prot; } if (idx != nused) { @@ -3548,6 +3478,8 @@ vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, dst = kaddr; idx = 0; while (len > 0) { + ASSERT(copyinfo[idx].prot & PROT_READ); + bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); len -= copyinfo[idx].len; dst += copyinfo[idx].len; @@ -3565,6 +3497,8 @@ vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, src = kaddr; idx = 0; while (len > 0) { + ASSERT(copyinfo[idx].prot & PROT_WRITE); + bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); len -= copyinfo[idx].len; src += copyinfo[idx].len; @@ -3577,30 +3511,17 @@ vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, * these are global stats, only return the values with for vCPU 0 */ VMM_STAT_DECLARE(VMM_MEM_RESIDENT); -VMM_STAT_DECLARE(VMM_MEM_WIRED); static void vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) { - if (vcpu == 0) { vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT, PAGE_SIZE * vmspace_resident_count(vm->vmspace)); } } -static void -vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) -{ - - if (vcpu == 0) { - vmm_stat_set(vm, vcpu, VMM_MEM_WIRED, - PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace))); - } -} - VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); -VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt); int vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port, diff --git a/usr/src/uts/i86pc/io/vmm/vmm_gpt.c b/usr/src/uts/i86pc/io/vmm/vmm_gpt.c index 9f6cc44aac..146ad958a8 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_gpt.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_gpt.c @@ -92,7 +92,8 @@ struct vmm_gpt_node { vmm_gpt_node_t *vgn_children; vmm_gpt_node_t *vgn_siblings; uint64_t *vgn_entries; - uint64_t _vgn_pad[2]; + uint64_t vgn_gpa; + uint64_t _vgn_pad; }; /* @@ -107,7 +108,6 @@ struct vmm_gpt_node { struct vmm_gpt { vmm_gpt_node_t *vgpt_root; vmm_pte_ops_t *vgpt_pte_ops; - uint64_t vgpt_mapped_page_count; }; /* @@ -153,24 +153,6 @@ vmm_gpt_alloc(vmm_pte_ops_t *pte_ops) } /* - * Retrieves the host kernel address of the GPT root. - */ -void * -vmm_gpt_root_kaddr(vmm_gpt_t *gpt) -{ - return (gpt->vgpt_root->vgn_entries); -} - -/* - * Retrieves the host PFN of the GPT root. - */ -uint64_t -vmm_gpt_root_pfn(vmm_gpt_t *gpt) -{ - return (gpt->vgpt_root->vgn_host_pfn); -} - -/* * Frees the given node, first nulling out all of its links to other nodes in * the tree, adjusting its parents reference count, and unlinking itself from * its parents page table. 
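For context on the vm_copy_setup()/vm_copyin() rework above: a caller translates a guest-linear range, takes per-page holds through the vcpu's vm_client, copies, and then tears the holds down. The wrapper below is a hedged sketch rather than part of this change; it assumes 'len' spans at most two pages and that <sys/vmm_kernel.h> is the include path for the declarations shown earlier.

#include <sys/types.h>
#include <sys/mman.h>
#include <sys/vmm_kernel.h>	/* assumed include path */

/*
 * Illustrative only: copy 'len' bytes from guest-linear address 'gla'.
 */
static int
copy_from_guest_va(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, void *buf, size_t len)
{
	struct vm_copyinfo copyinfo[2];
	int error, fault;

	/* Translate the GLA and take per-page holds via the vcpu's client */
	error = vm_copy_setup(vm, vcpuid, paging, gla, len, PROT_READ,
	    copyinfo, 2, &fault);
	if (error != 0 || fault != 0) {
		/*
		 * On a guest fault the exception has already been injected,
		 * so the caller just resumes the vcpu; otherwise propagate
		 * the error.
		 */
		return (error);
	}

	/* Copy out of the held pages, then drop the holds */
	vm_copyin(vm, vcpuid, copyinfo, buf, len);
	vm_copy_teardown(vm, vcpuid, copyinfo, 2);
	return (0);
}
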
@@ -310,11 +292,18 @@ vmm_gpt_add_child(vmm_gpt_t *gpt, vmm_gpt_node_t *parent, vmm_gpt_node_t *child, ASSERT(gpt->vgpt_pte_ops != NULL); ASSERT(parent != NULL); ASSERT(child != NULL); + ASSERT3U(parent->vgn_level, <, LEVEL1); + const uint64_t gpa_mask[3] = { + [LEVEL4] = 0xffffff8000000000ul, /* entries cover 512G */ + [LEVEL3] = 0xffffffffc0000000ul, /* entries cover 1G */ + [LEVEL2] = 0xffffffffffe00000ul, /* entries cover 2M */ + }; const int index = vmm_gpt_node_index(gpa, parent->vgn_level); child->vgn_index = index; child->vgn_level = parent->vgn_level + 1; child->vgn_parent = parent; + child->vgn_gpa = gpa & gpa_mask[parent->vgn_level]; parent_entries = parent->vgn_entries; entry = gpt->vgpt_pte_ops->vpeo_map_table(child->vgn_host_pfn); parent_entries[index] = entry; @@ -338,12 +327,14 @@ vmm_gpt_add_child(vmm_gpt_t *gpt, vmm_gpt_node_t *parent, vmm_gpt_node_t *child, * that this does not actually map the entry, but simply ensures that the * entries exist. */ -void +static void vmm_gpt_populate_entry(vmm_gpt_t *gpt, uint64_t gpa) { vmm_gpt_node_t *node, *child; ASSERT(gpt != NULL); + ASSERT0(gpa & PAGEOFFSET); + node = gpt->vgpt_root; for (uint_t i = 0; i < LEVEL1; i++) { ASSERT(node != NULL); @@ -364,41 +355,53 @@ vmm_gpt_populate_entry(vmm_gpt_t *gpt, uint64_t gpa) void vmm_gpt_populate_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end) { + ASSERT0(start & PAGEOFFSET); + ASSERT0(end & PAGEOFFSET); + for (uint64_t page = start; page < end; page += PAGESIZE) { vmm_gpt_populate_entry(gpt, page); } } /* - * Inserts an entry for a given GPA into the table. The caller must - * ensure that the entry is not currently mapped, though note that this - * can race with another thread inserting the same page into the tree. - * If we lose the race, we ensure that the page we thought we were - * inserting is the page that was inserted. + * Format a PTE and install it in the provided PTE-pointer. */ bool -vmm_gpt_map(vmm_gpt_t *gpt, uint64_t gpa, pfn_t pfn, uint_t prot, uint8_t attr) +vmm_gpt_map_at(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t pfn, uint_t prot, + uint8_t attr) { - uint64_t *entries[MAX_GPT_LEVEL], entry, old_entry; - - ASSERT(gpt != NULL); - vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL); - ASSERT(entries[LEVEL1] != NULL); + uint64_t entry, old_entry; entry = gpt->vgpt_pte_ops->vpeo_map_page(pfn, prot, attr); - old_entry = atomic_cas_64(entries[LEVEL1], 0, entry); + old_entry = atomic_cas_64(ptep, 0, entry); if (old_entry != 0) { - ASSERT3U(gpt->vgpt_pte_ops->vpeo_pte_pfn(entry), - ==, + ASSERT3U(gpt->vgpt_pte_ops->vpeo_pte_pfn(entry), ==, gpt->vgpt_pte_ops->vpeo_pte_pfn(old_entry)); return (false); } - gpt->vgpt_mapped_page_count++; return (true); } /* + * Inserts an entry for a given GPA into the table. The caller must + * ensure that a conflicting PFN is not mapped at the requested location. + * Racing operations to map the same PFN at one location is acceptable and + * properly handled. + */ +bool +vmm_gpt_map(vmm_gpt_t *gpt, uint64_t gpa, pfn_t pfn, uint_t prot, uint8_t attr) +{ + uint64_t *entries[MAX_GPT_LEVEL]; + + ASSERT(gpt != NULL); + vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL); + ASSERT(entries[LEVEL1] != NULL); + + return (vmm_gpt_map_at(gpt, entries[LEVEL1], pfn, prot, attr)); +} + +/* * Removes a child node from its parent's list of children, and then frees * the now-orphaned child. */ @@ -421,9 +424,8 @@ vmm_gpt_node_remove_child(vmm_gpt_node_t *parent, vmm_gpt_node_t *child) } /* - * Cleans up unused inner nodes in the GPT. 
Asserts that the - * leaf corresponding to the entry does not map any additional - * pages. + * Cleans up unused inner nodes in the GPT. Asserts that the leaf corresponding + * to the entry does not map any additional pages. */ static void vmm_gpt_vacate_entry(vmm_gpt_t *gpt, uint64_t gpa) @@ -450,27 +452,28 @@ vmm_gpt_vacate_entry(vmm_gpt_t *gpt, uint64_t gpa) } /* - * Cleans up the unused inner nodes in the GPT for a region of guest - * physical address space bounded by [start..end). The region must - * map no pages. + * Cleans up the unused inner nodes in the GPT for a region of guest physical + * address space of [start, end). The region must map no pages. */ void vmm_gpt_vacate_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end) { + ASSERT0(start & PAGEOFFSET); + ASSERT0(end & PAGEOFFSET); + for (uint64_t page = start; page < end; page += PAGESIZE) { vmm_gpt_vacate_entry(gpt, page); } } /* - * Remove a mapping from the table. Returns false if the page was not - * mapped, otherwise returns true. + * Remove a mapping from the table. Returns false if the page was not mapped, + * otherwise returns true. */ bool vmm_gpt_unmap(vmm_gpt_t *gpt, uint64_t gpa) { uint64_t *entries[MAX_GPT_LEVEL], entry; - bool was_mapped; ASSERT(gpt != NULL); vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL); @@ -479,28 +482,27 @@ vmm_gpt_unmap(vmm_gpt_t *gpt, uint64_t gpa) entry = *entries[LEVEL1]; *entries[LEVEL1] = 0; - was_mapped = gpt->vgpt_pte_ops->vpeo_pte_is_present(entry); - if (was_mapped) - gpt->vgpt_mapped_page_count--; - - return (was_mapped); + return (gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)); } /* - * Un-maps the region of guest physical address space bounded by - * [start..end). Returns the number of pages that are unmapped. + * Un-maps the region of guest physical address space bounded by [start..end). + * Returns the number of pages that are unmapped. */ size_t vmm_gpt_unmap_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end) { - size_t n = 0; + ASSERT0(start & PAGEOFFSET); + ASSERT0(end & PAGEOFFSET); + size_t num_unmapped = 0; for (uint64_t page = start; page < end; page += PAGESIZE) { - if (vmm_gpt_unmap(gpt, page) != 0) - n++; + if (vmm_gpt_unmap(gpt, page) != 0) { + num_unmapped++; + } } - return (n); + return (num_unmapped); } /* @@ -509,31 +511,23 @@ vmm_gpt_unmap_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end) * bits of the entry. Otherwise, it will be ignored. */ bool -vmm_gpt_is_mapped(vmm_gpt_t *gpt, uint64_t gpa, uint_t *protp) +vmm_gpt_is_mapped(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t *pfnp, uint_t *protp) { - uint64_t *entries[MAX_GPT_LEVEL], entry; + uint64_t entry; - vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL); - if (entries[LEVEL1] == NULL) + if (ptep == NULL) { return (false); - entry = *entries[LEVEL1]; - if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) + } + entry = *ptep; + if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) { return (false); + } + *pfnp = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry); *protp = gpt->vgpt_pte_ops->vpeo_pte_prot(entry); - return (true); } /* - * Returns the number of pages that are mapped in by this GPT. - */ -size_t -vmm_gpt_mapped_count(vmm_gpt_t *gpt) -{ - return (gpt->vgpt_mapped_page_count); -} - -/* * Resets the accessed bit on the page table entry pointed to be `entry`. * If `on` is true, the bit will be set, otherwise it will be cleared. * The old value of the bit is returned. 
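The accessed/dirty helpers above are what the new gpt_track_dirty support builds on. A speculative sketch of harvesting dirty state over a GPA range follows; the bitmap format is an assumption, and a real consumer would also need to invalidate cached EPT/RVI translations after clearing dirty bits, which this sketch omits.

#include <sys/types.h>
#include <sys/param.h>
#include <sys/vmm_gpt.h>

/*
 * Speculative sketch: collect and clear per-page dirty state for the GPA
 * range [start, end).  'bitmap' must cover (end - start) / PAGESIZE bits.
 */
static void
harvest_dirty_pages(vmm_gpt_t *gpt, uint64_t start, uint64_t end,
    uint8_t *bitmap)
{
	for (uint64_t gpa = start; gpa < end; gpa += PAGESIZE) {
		uint64_t *ptep = vmm_gpt_lookup(gpt, gpa);

		/* Skip GPAs whose leaf entry was never populated */
		if (ptep == NULL)
			continue;

		/* Clear the dirty bit, recording its previous state */
		if (vmm_gpt_reset_dirty(gpt, ptep, false) != 0) {
			const uint64_t pageno = (gpa - start) / PAGESIZE;
			bitmap[pageno / 8] |= (uint8_t)(1 << (pageno % 8));
		}
	}
}
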
@@ -556,3 +550,12 @@ vmm_gpt_reset_dirty(vmm_gpt_t *gpt, uint64_t *entry, bool on) ASSERT(entry != NULL); return (gpt->vgpt_pte_ops->vpeo_reset_dirty(entry, on)); } + +/* + * Get properly formatted PML4 (EPTP/nCR3) for GPT. + */ +uint64_t +vmm_gpt_get_pmtp(vmm_gpt_t *gpt) +{ + return (gpt->vgpt_pte_ops->vpeo_get_pmtp(gpt->vgpt_root->vgn_host_pfn)); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c b/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c index 1dc2616599..d2a790ec03 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c @@ -373,6 +373,27 @@ static const struct vie_op one_byte_opcodes[256] = { #define GB (1024 * 1024 * 1024) + +/* + * Paging defines, previously pulled in from machine/pmap.h + */ +#define PG_V (1 << 0) /* Present */ +#define PG_RW (1 << 1) /* Read/Write */ +#define PG_U (1 << 2) /* User/Supervisor */ +#define PG_A (1 << 5) /* Accessed */ +#define PG_M (1 << 6) /* Dirty */ +#define PG_PS (1 << 7) /* Largepage */ + +/* + * Paging except defines, previously pulled in from machine/pmap.h + */ +#define PGEX_P (1 << 0) /* Non-present/Protection */ +#define PGEX_W (1 << 1) /* Read/Write */ +#define PGEX_U (1 << 2) /* User/Supervisor */ +#define PGEX_RSV (1 << 3) /* (Non-)Reserved */ +#define PGEX_I (1 << 4) /* Instruction */ + + static enum vm_reg_name gpr_map[16] = { VM_REG_GUEST_RAX, VM_REG_GUEST_RCX, @@ -2875,43 +2896,48 @@ pf_error_code(int usermode, int prot, int rsvd, uint64_t pte) } static void -ptp_release(void **cookie) +ptp_release(vm_page_t **vmp) { - if (*cookie != NULL) { - vm_gpa_release(*cookie); - *cookie = NULL; + if (*vmp != NULL) { + vmp_release(*vmp); + *vmp = NULL; } } static void * -ptp_hold(struct vm *vm, int vcpu, vm_paddr_t ptpphys, size_t len, void **cookie) +ptp_hold(struct vm *vm, int vcpu, uintptr_t gpa, size_t len, vm_page_t **vmp) { - void *ptr; + vm_client_t *vmc = vm_get_vmclient(vm, vcpu); + const uintptr_t hold_gpa = gpa & PAGEMASK; + + /* Hold must not cross a page boundary */ + VERIFY3U(gpa + len, <=, hold_gpa + PAGESIZE); - ptp_release(cookie); - ptr = vm_gpa_hold(vm, vcpu, ptpphys, len, PROT_READ | PROT_WRITE, - cookie); + if (*vmp != NULL) { + vmp_release(*vmp); + } + + *vmp = vmc_hold(vmc, hold_gpa, PROT_READ | PROT_WRITE); + if (*vmp == NULL) { + return (NULL); + } - return (ptr); + return ((caddr_t)vmp_get_writable(*vmp) + (gpa - hold_gpa)); } static int _vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only) { - int nlevels, pfcode, retval, usermode, writable; + int nlevels, pfcode; int ptpshift = 0, ptpindex = 0; uint64_t ptpphys; uint64_t *ptpbase = NULL, pte = 0, pgsize = 0; - uint32_t *ptpbase32, pte32; - void *cookie; + vm_page_t *cookie = NULL; + const bool usermode = paging->cpl == 3; + const bool writable = (prot & PROT_WRITE) != 0; *guest_fault = 0; - - usermode = (paging->cpl == 3 ? 1 : 0); - writable = prot & PROT_WRITE; - cookie = NULL; - retval = 0; restart: ptpphys = paging->cr3; /* root of the page tables */ ptp_release(&cookie); @@ -2923,15 +2949,18 @@ restart: */ if (!check_only) vm_inject_gp(vm, vcpuid); - goto fault; + *guest_fault = 1; + return (0); } if (paging->paging_mode == PAGING_MODE_FLAT) { *gpa = gla; - goto done; + return (0); } if (paging->paging_mode == PAGING_MODE_32) { + uint32_t *ptpbase32, pte32; + nlevels = 2; while (--nlevels >= 0) { /* Zero out the lower 12 bits. 
*/ @@ -2940,8 +2969,9 @@ restart: ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie); - if (ptpbase32 == NULL) - goto error; + if (ptpbase32 == NULL) { + return (EFAULT); + } ptpshift = PAGE_SHIFT + nlevels * 10; ptpindex = (gla >> ptpshift) & 0x3FF; @@ -2957,7 +2987,10 @@ restart: 0, pte32); vm_inject_pf(vm, vcpuid, pfcode, gla); } - goto fault; + + ptp_release(&cookie); + *guest_fault = 1; + return (0); } /* @@ -2992,7 +3025,8 @@ restart: /* Zero out the lower 'ptpshift' bits */ pte32 >>= ptpshift; pte32 <<= ptpshift; *gpa = pte32 | (gla & (pgsize - 1)); - goto done; + ptp_release(&cookie); + return (0); } if (paging->paging_mode == PAGING_MODE_PAE) { @@ -3001,8 +3035,9 @@ restart: ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof (*ptpbase) * 4, &cookie); - if (ptpbase == NULL) - goto error; + if (ptpbase == NULL) { + return (EFAULT); + } ptpindex = (gla >> 30) & 0x3; @@ -3013,21 +3048,27 @@ restart: pfcode = pf_error_code(usermode, prot, 0, pte); vm_inject_pf(vm, vcpuid, pfcode, gla); } - goto fault; + + ptp_release(&cookie); + *guest_fault = 1; + return (0); } ptpphys = pte; nlevels = 2; - } else + } else { nlevels = 4; + } + while (--nlevels >= 0) { /* Zero out the lower 12 bits and the upper 12 bits */ - ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12; + ptpphys &= 0x000ffffffffff000UL; ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie); - if (ptpbase == NULL) - goto error; + if (ptpbase == NULL) { + return (EFAULT); + } ptpshift = PAGE_SHIFT + nlevels * 9; ptpindex = (gla >> ptpshift) & 0x1FF; @@ -3042,7 +3083,10 @@ restart: pfcode = pf_error_code(usermode, prot, 0, pte); vm_inject_pf(vm, vcpuid, pfcode, gla); } - goto fault; + + ptp_release(&cookie); + *guest_fault = 1; + return (0); } /* Set the accessed bit in the page table entry */ @@ -3060,7 +3104,10 @@ restart: 1, pte); vm_inject_pf(vm, vcpuid, pfcode, gla); } - goto fault; + + ptp_release(&cookie); + *guest_fault = 1; + return (0); } break; } @@ -3073,21 +3120,12 @@ restart: if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0) goto restart; } + ptp_release(&cookie); /* Zero out the lower 'ptpshift' bits and the upper 12 bits */ pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12; *gpa = pte | (gla & (pgsize - 1)); -done: - ptp_release(&cookie); - KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d", - __func__, retval)); - return (retval); -error: - retval = EFAULT; - goto done; -fault: - *guest_fault = 1; - goto done; + return (0); } int diff --git a/usr/src/uts/i86pc/io/vmm/vmm_lapic.c b/usr/src/uts/i86pc/io/vmm/vmm_lapic.c index a5118c15af..e95f444051 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_lapic.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_lapic.c @@ -46,7 +46,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> -#include <sys/smp.h> +#include <sys/cpuset.h> #include <x86/specialreg.h> #include <x86/apicreg.h> diff --git a/usr/src/uts/i86pc/io/vmm/vmm_mem.c b/usr/src/uts/i86pc/io/vmm/vmm_mem.c deleted file mode 100644 index 4ffe5bf509..0000000000 --- a/usr/src/uts/i86pc/io/vmm/vmm_mem.c +++ /dev/null @@ -1,113 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (c) 2011 NetApp, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/malloc.h> -#include <sys/mman.h> -#include <sys/sglist.h> -#include <sys/lock.h> -#include <sys/rwlock.h> - -#include <machine/md_var.h> -#include <machine/vm.h> -#include <sys/vmm_vm.h> - -#include "vmm_mem.h" - -int -vmm_mem_init(void) -{ - - return (0); -} - -vm_object_t -vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len, - vm_paddr_t hpa) -{ - int error; - vm_object_t obj; - struct sglist *sg; - - sg = sglist_alloc(1, M_WAITOK); - error = sglist_append_phys(sg, hpa, len); - KASSERT(error == 0, ("error %d appending physaddr to sglist", error)); - - const int prot = PROT_READ | PROT_WRITE; - obj = vm_pager_allocate(OBJT_SG, sg, len, prot, 0, NULL); - if (obj != NULL) { - /* - * VT-x ignores the MTRR settings when figuring out the - * memory type for translations obtained through EPT. - * - * Therefore we explicitly force the pages provided by - * this object to be mapped as uncacheable. - */ - VM_OBJECT_WLOCK(obj); - error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE); - VM_OBJECT_WUNLOCK(obj); - if (error != 0) { - panic("vmm_mmio_alloc: vm_object_set_memattr error %d", - error); - } - error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0, - VMFS_NO_SPACE, prot, prot, 0); - if (error != 0) { - vm_object_deallocate(obj); - obj = NULL; - } - } - - /* - * Drop the reference on the sglist. - * - * If the scatter/gather object was successfully allocated then it - * has incremented the reference count on the sglist. Dropping the - * initial reference count ensures that the sglist will be freed - * when the object is deallocated. - * - * If the object could not be allocated then we end up freeing the - * sglist. - */ - sglist_free(sg); - - return (obj); -} - -vm_paddr_t -vmm_mem_maxaddr(void) -{ - - return (ptoa(Maxmem)); -} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_mem.h b/usr/src/uts/i86pc/io/vmm/vmm_mem.h deleted file mode 100644 index b27501eef2..0000000000 --- a/usr/src/uts/i86pc/io/vmm/vmm_mem.h +++ /dev/null @@ -1,54 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (c) 2011 NetApp, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * Copyright 2013 Pluribus Networks Inc. - */ - -#ifndef _VMM_MEM_H_ -#define _VMM_MEM_H_ - -struct vmspace; -struct vm_object; - -int vmm_mem_init(void); -struct vm_object *vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len, - vm_paddr_t hpa); -vm_paddr_t vmm_mem_maxaddr(void); - -#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c index 92d1494e04..823097b285 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c @@ -100,6 +100,7 @@ struct vmm_hold { struct vmm_lease { list_node_t vml_node; struct vm *vml_vm; + vm_client_t *vml_vmclient; boolean_t vml_expired; boolean_t vml_break_deferred; boolean_t (*vml_expire_func)(void *); @@ -444,7 +445,6 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, lock_type = LOCK_WRITE_HOLD; break; - case VM_GET_GPA_PMAP: case VM_GET_MEMSEG: case VM_MMAP_GETNEXT: case VM_LAPIC_IRQ: @@ -465,6 +465,7 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, lock_type = LOCK_READ_HOLD; break; + case VM_GET_GPA_PMAP: case VM_IOAPIC_PINCOUNT: case VM_SUSPEND: default: @@ -1127,18 +1128,11 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, break; } case VM_GET_GPA_PMAP: { - struct vm_gpa_pte gpapte; - - if (ddi_copyin(datap, &gpapte, sizeof (gpapte), md)) { - error = EFAULT; - break; - } -#ifdef __FreeBSD__ - /* XXXJOY: add function? 
*/ - pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vmm_vm)), - gpapte.gpa, gpapte.pte, &gpapte.ptenum); -#endif - error = 0; + /* + * Until there is a necessity to leak EPT/RVI PTE values to + * userspace, this will remain unimplemented + */ + error = EINVAL; break; } case VM_GET_HPET_CAPABILITIES: { @@ -1690,6 +1684,7 @@ vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg) lease->vml_hold = hold; /* cache the VM pointer for one less pointer chase */ lease->vml_vm = sc->vmm_vm; + lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm)); mutex_enter(&sc->vmm_lease_lock); while (sc->vmm_lease_blocker != 0) { @@ -1709,6 +1704,7 @@ vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease) list_remove(&sc->vmm_lease_list, lease); vmm_read_unlock(sc); + vmc_destroy(lease->vml_vmclient); kmem_free(lease, sizeof (*lease)); } @@ -1841,9 +1837,30 @@ vmm_drv_lease_expired(vmm_lease_t *lease) void * vmm_drv_gpa2kva(vmm_lease_t *lease, uintptr_t gpa, size_t sz) { + vm_page_t *vmp; + void *res = NULL; + ASSERT(lease != NULL); + ASSERT3U(sz, ==, PAGESIZE); + ASSERT0(gpa & PAGEOFFSET); + + vmp = vmc_hold(lease->vml_vmclient, gpa, PROT_READ | PROT_WRITE); + /* + * Break the rules for now and just extract the pointer. This is + * nominally safe, since holding a driver lease on the VM read-locks it. + * + * A pointer which would otherwise be at risk of being a use-after-free + * vector is made safe since actions such as vmspace_unmap() require + * acquisition of the VM write-lock, (causing all driver leases to be + * broken) allowing the consumers to cease their access prior to + * modification of the vmspace. + */ + if (vmp != NULL) { + res = vmp_get_writable(vmp); + vmp_release(vmp); + } - return (vmspace_find_kva(vm_get_vmspace(lease->vml_vm), gpa, sz)); + return (res); } int @@ -2191,6 +2208,14 @@ vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp) minor_t minor; vmm_softc_t *sc; + /* + * Forbid running bhyve in a 32-bit process until it has been tested and + * verified to be safe. + */ + if (curproc->p_model != DATAMODEL_LP64) { + return (EFBIG); + } + minor = getminor(*devp); if (minor == VMM_CTL_MINOR) { /* @@ -2330,6 +2355,14 @@ vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, vmm_softc_t *sc; minor_t minor; + /* + * Forbid running bhyve in a 32-bit process until it has been tested and + * verified to be safe. 
+ */ + if (curproc->p_model != DATAMODEL_LP64) { + return (EFBIG); + } + /* The structs in bhyve ioctls assume a 64-bit datamodel */ if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) { return (ENOTSUP); @@ -2356,10 +2389,7 @@ vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len, { vmm_softc_t *sc; const minor_t minor = getminor(dev); - struct vm *vm; int err; - vm_object_t vmo = NULL; - struct vmspace *vms; if (minor == VMM_CTL_MINOR) { return (ENODEV); @@ -2380,31 +2410,23 @@ vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len, /* Grab read lock on the VM to prevent any changes to the memory map */ vmm_read_lock(sc); - vm = sc->vmm_vm; - vms = vm_get_vmspace(vm); if (off >= VM_DEVMEM_START) { int segid; - off_t map_off = 0; + off_t segoff; /* Mapping a devmem "device" */ - if (!vmmdev_devmem_segid(sc, off, len, &segid, &map_off)) { + if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) { err = ENODEV; - goto out; - } - err = vm_get_memseg(vm, segid, NULL, NULL, &vmo); - if (err != 0) { - goto out; + } else { + err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as, + addrp, prot, maxprot, flags); } - err = vm_segmap_obj(vmo, map_off, len, as, addrp, prot, maxprot, - flags); } else { /* Mapping a part of the guest physical space */ - err = vm_segmap_space(vms, off, as, addrp, len, prot, maxprot, - flags); + err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot, + maxprot, flags); } - -out: vmm_read_unlock(sc); return (err); } diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c index 3d357f37d2..fde4a030ce 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c @@ -21,17 +21,12 @@ #include <sys/kmem.h> #include <sys/machsystm.h> #include <sys/mman.h> +#include <sys/x86_archext.h> +#include <vm/hat_pte.h> #include <sys/vmm_gpt.h> #include <sys/vmm_vm.h> - -typedef struct ept_map ept_map_t; -struct ept_map { - vmm_gpt_t *em_gpt; - kmutex_t em_lock; -}; - #define EPT_R (1 << 0) #define EPT_W (1 << 1) #define EPT_X (1 << 2) @@ -42,6 +37,9 @@ struct ept_map { #define EPT_PA_MASK (0x000ffffffffff000ull) +#define EPT_MAX_LEVELS 4 +CTASSERT(EPT_MAX_LEVELS <= MAX_GPT_LEVEL); + CTASSERT(EPT_R == PROT_READ); CTASSERT(EPT_W == PROT_WRITE); CTASSERT(EPT_X == PROT_EXEC); @@ -121,7 +119,15 @@ ept_reset_accessed(uint64_t *entry, bool on) on ? 
EPT_ACCESSED : 0)); } -static vmm_pte_ops_t ept_pte_ops = { +static uint64_t +ept_get_pmtp(pfn_t root_pfn) +{ + /* TODO: enable AD tracking when required */ + return ((root_pfn << PAGESHIFT | + (EPT_MAX_LEVELS - 1) << 3 | MTRR_TYPE_WB)); +} + +vmm_pte_ops_t ept_pte_ops = { .vpeo_map_table = ept_map_table, .vpeo_map_page = ept_map_page, .vpeo_pte_pfn = ept_pte_pfn, @@ -129,100 +135,5 @@ static vmm_pte_ops_t ept_pte_ops = { .vpeo_pte_prot = ept_pte_prot, .vpeo_reset_dirty = ept_reset_dirty, .vpeo_reset_accessed = ept_reset_accessed, -}; - -vmm_gpt_t * -ept_create(void) -{ - return (vmm_gpt_alloc(&ept_pte_ops)); -} - -static void * -ept_ops_create(uintptr_t *root_kaddr) -{ - ept_map_t *map; - - map = kmem_zalloc(sizeof (*map), KM_SLEEP); - mutex_init(&map->em_lock, NULL, MUTEX_DEFAULT, NULL); - map->em_gpt = ept_create(); - *root_kaddr = (uintptr_t)vmm_gpt_root_kaddr(map->em_gpt); - - return (map); -} - -static void -ept_ops_destroy(void *arg) -{ - ept_map_t *map = arg; - - if (map != NULL) { - vmm_gpt_free(map->em_gpt); - mutex_destroy(&map->em_lock); - kmem_free(map, sizeof (*map)); - } -} - -static uint64_t -ept_ops_wired_count(void *arg) -{ - ept_map_t *map = arg; - uint64_t res; - - mutex_enter(&map->em_lock); - res = vmm_gpt_mapped_count(map->em_gpt); - mutex_exit(&map->em_lock); - - return (res); -} - -static int -ept_ops_is_wired(void *arg, uint64_t gpa, uint_t *protp) -{ - ept_map_t *map = arg; - bool mapped; - - mutex_enter(&map->em_lock); - mapped = vmm_gpt_is_mapped(map->em_gpt, gpa, protp); - mutex_exit(&map->em_lock); - - return (mapped ? 0 : -1); -} - -static int -ept_ops_map(void *arg, uint64_t gpa, pfn_t pfn, uint_t _lvl, uint_t prot, - uint8_t attr) -{ - ept_map_t *map = arg; - - ASSERT((prot & EPT_RWX) != 0 && (prot & ~EPT_RWX) == 0); - - mutex_enter(&map->em_lock); - vmm_gpt_populate_entry(map->em_gpt, gpa); - (void) vmm_gpt_map(map->em_gpt, gpa, pfn, prot, attr); - mutex_exit(&map->em_lock); - - return (0); -} - -static uint64_t -ept_ops_unmap(void *arg, uint64_t start, uint64_t end) -{ - ept_map_t *map = arg; - size_t unmapped = 0; - - mutex_enter(&map->em_lock); - unmapped = vmm_gpt_unmap_region(map->em_gpt, start, end); - vmm_gpt_vacate_region(map->em_gpt, start, end); - mutex_exit(&map->em_lock); - - return ((uint64_t)unmapped); -} - -struct vmm_pt_ops ept_ops = { - .vpo_init = ept_ops_create, - .vpo_free = ept_ops_destroy, - .vpo_wired_cnt = ept_ops_wired_count, - .vpo_is_wired = ept_ops_is_wired, - .vpo_map = ept_ops_map, - .vpo_unmap = ept_ops_unmap, + .vpeo_get_pmtp = ept_get_pmtp, }; diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c index afd686f197..f78db731d6 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c @@ -60,7 +60,6 @@ #include <machine/cpufunc.h> #include <machine/fpu.h> #include <machine/md_var.h> -#include <machine/pmap.h> #include <machine/specialreg.h> #include <machine/vmm.h> #include <machine/vmparam.h> @@ -95,7 +94,7 @@ uint8_t const bin2bcd_data[] = { }; void -pmap_invalidate_cache(void) +invalidate_cache_all(void) { cpuset_t cpuset; @@ -108,7 +107,7 @@ pmap_invalidate_cache(void) } vm_paddr_t -pmap_kextract(vm_offset_t va) +vtophys(void *va) { pfn_t pfn; @@ -411,18 +410,6 @@ vmm_glue_callout_localize(struct callout *c) mutex_exit(&cpu_lock); } -void -ipi_cpu(int cpu, uint_t ipi) -{ - /* - * This was previously implemented as an invocation of asynchronous - * no-op crosscalls to interrupt the target CPU. 
Since even nowait - * crosscalls can block in certain circumstances, a direct poke_cpu() - * is safer when called from delicate contexts. - */ - poke_cpu(cpu); -} - uint_t cpu_high; /* Highest arg to CPUID */ uint_t cpu_exthigh; /* Highest arg to extended CPUID */ uint_t cpu_id; /* Stepping ID */ diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c index c66a4e7962..8b45782d25 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c @@ -28,12 +28,6 @@ #include <sys/vmm_gpt.h> #include <sys/vmm_vm.h> -typedef struct rvi_map rvi_map_t; -struct rvi_map { - vmm_gpt_t *rm_gpt; - kmutex_t rm_lock; -}; - static inline uint64_t rvi_prot(uint_t prot) { @@ -145,7 +139,13 @@ rvi_reset_accessed(uint64_t *entry, bool on) return (rvi_reset_bits(entry, (PT_MOD | PT_REF), on ? PT_REF : 0)); } -static vmm_pte_ops_t rvi_pte_ops = { +static uint64_t +rvi_get_pmtp(pfn_t root_pfn) +{ + return (root_pfn << PAGESHIFT); +} + +vmm_pte_ops_t rvi_pte_ops = { .vpeo_map_table = rvi_map_table, .vpeo_map_page = rvi_map_page, .vpeo_pte_pfn = rvi_pte_pfn, @@ -153,101 +153,5 @@ static vmm_pte_ops_t rvi_pte_ops = { .vpeo_pte_prot = rvi_pte_prot, .vpeo_reset_dirty = rvi_reset_dirty, .vpeo_reset_accessed = rvi_reset_accessed, -}; - -vmm_gpt_t * -rvi_create(void) -{ - return (vmm_gpt_alloc(&rvi_pte_ops)); -} - -static void * -rvi_ops_create(uintptr_t *root_kaddr) -{ - rvi_map_t *map; - - map = kmem_zalloc(sizeof (*map), KM_SLEEP); - mutex_init(&map->rm_lock, NULL, MUTEX_DEFAULT, NULL); - map->rm_gpt = rvi_create(); - *root_kaddr = (uintptr_t)vmm_gpt_root_kaddr(map->rm_gpt); - - return (map); -} - -static void -rvi_ops_destroy(void *arg) -{ - rvi_map_t *map = arg; - - if (map != NULL) { - vmm_gpt_free(map->rm_gpt); - mutex_destroy(&map->rm_lock); - kmem_free(map, sizeof (*map)); - } -} - -static uint64_t -rvi_ops_wired_count(void *arg) -{ - rvi_map_t *map = arg; - uint64_t res; - - mutex_enter(&map->rm_lock); - res = vmm_gpt_mapped_count(map->rm_gpt); - mutex_exit(&map->rm_lock); - - return (res); -} - -static int -rvi_ops_is_wired(void *arg, uint64_t gpa, uint_t *protp) -{ - rvi_map_t *map = arg; - bool mapped; - - mutex_enter(&map->rm_lock); - mapped = vmm_gpt_is_mapped(map->rm_gpt, gpa, protp); - mutex_exit(&map->rm_lock); - - return (mapped ? 
0 : -1); -} - -static int -rvi_ops_map(void *arg, uint64_t gpa, pfn_t pfn, uint_t _lvl, uint_t prot, - uint8_t attr) -{ - rvi_map_t *map = arg; - - ASSERT((prot & PROT_READ) != 0); - ASSERT3U((prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)), ==, 0); - - mutex_enter(&map->rm_lock); - vmm_gpt_populate_entry(map->rm_gpt, gpa); - (void) vmm_gpt_map(map->rm_gpt, gpa, pfn, prot, attr); - mutex_exit(&map->rm_lock); - - return (0); -} - -static uint64_t -rvi_ops_unmap(void *arg, uint64_t start, uint64_t end) -{ - rvi_map_t *map = arg; - size_t unmapped = 0; - - mutex_enter(&map->rm_lock); - unmapped = vmm_gpt_unmap_region(map->rm_gpt, start, end); - vmm_gpt_vacate_region(map->rm_gpt, start, end); - mutex_exit(&map->rm_lock); - - return ((uint64_t)unmapped); -} - -struct vmm_pt_ops rvi_ops = { - .vpo_init = rvi_ops_create, - .vpo_free = rvi_ops_destroy, - .vpo_wired_cnt = rvi_ops_wired_count, - .vpo_is_wired = rvi_ops_is_wired, - .vpo_map = rvi_ops_map, - .vpo_unmap = rvi_ops_unmap, + .vpeo_get_pmtp = rvi_get_pmtp, }; diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c deleted file mode 100644 index bd1f1890d4..0000000000 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c +++ /dev/null @@ -1,932 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ -/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */ - -/* - * Copyright 2019 Joyent, Inc. - * Copyright 2021 Oxide Computer Company - * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. - */ - -#include <sys/param.h> -#include <sys/kmem.h> -#include <sys/thread.h> -#include <sys/list.h> -#include <sys/mman.h> -#include <sys/types.h> -#include <sys/ddi.h> -#include <sys/sysmacros.h> -#include <sys/machsystm.h> -#include <sys/vmsystm.h> -#include <sys/malloc.h> -#include <sys/x86_archext.h> -#include <vm/as.h> -#include <vm/hat_i86.h> -#include <vm/seg_vn.h> -#include <vm/seg_kmem.h> - -#include <machine/vm.h> -#include <sys/vmm_gpt.h> -#include <sys/vmm_vm.h> -#include <sys/seg_vmm.h> -#include <sys/vmm_reservoir.h> - -#define PMAP_TO_VMMAP(pm) ((vm_map_t) \ - ((caddr_t)(pm) - offsetof(struct vmspace, vms_pmap))) -#define VMMAP_TO_VMSPACE(vmmap) ((struct vmspace *) \ - ((caddr_t)(vmmap) - offsetof(struct vmspace, vm_map))) - - -struct vmspace_mapping { - list_node_t vmsm_node; - vm_object_t vmsm_object; - uintptr_t vmsm_addr; - size_t vmsm_len; - off_t vmsm_offset; - uint_t vmsm_prot; -}; -typedef struct vmspace_mapping vmspace_mapping_t; - -#define VMSM_OFFSET(vmsm, addr) ( \ - (vmsm)->vmsm_offset + \ - ((addr) - (uintptr_t)(vmsm)->vmsm_addr)) - - -/* Private glue interfaces */ -static void pmap_free(pmap_t); -static vmspace_mapping_t *vm_mapping_find(struct vmspace *, uintptr_t, size_t, - boolean_t); -static void vm_mapping_remove(struct vmspace *, vmspace_mapping_t *); - -struct vmspace * -vmspace_alloc(vm_offset_t start, vm_offset_t end, pmap_pinit_t pinit) -{ - struct vmspace *vms; - const uintptr_t size = end + 1; - - /* - * This whole mess is built on the assumption that a 64-bit address - * space is available to work with for the various pagetable tricks. 
- */ - VERIFY(ttoproc(curthread)->p_model == DATAMODEL_LP64); - VERIFY(start == 0 && size > 0 && (size & PAGEOFFSET) == 0 && - size <= (uintptr_t)USERLIMIT); - - vms = kmem_zalloc(sizeof (*vms), KM_SLEEP); - vms->vms_size = size; - list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t), - offsetof(vmspace_mapping_t, vmsm_node)); - - if (pinit(&vms->vms_pmap) == 0) { - kmem_free(vms, sizeof (*vms)); - return (NULL); - } - - return (vms); -} - -void -vmspace_free(struct vmspace *vms) -{ - VERIFY(list_is_empty(&vms->vms_maplist)); - - pmap_free(&vms->vms_pmap); - kmem_free(vms, sizeof (*vms)); -} - -pmap_t -vmspace_pmap(struct vmspace *vms) -{ - return (&vms->vms_pmap); -} - -long -vmspace_resident_count(struct vmspace *vms) -{ - /* XXXJOY: finish */ - return (0); -} - -void * -vmspace_find_kva(struct vmspace *vms, uintptr_t addr, size_t size) -{ - vmspace_mapping_t *vmsm; - void *result = NULL; - - /* - * Since vmspace_find_kva is provided so that vmm_drv consumers can do - * GPA2KVA translations, it is expected to be called when there is a - * read lock preventing vmspace alterations. As such, it can do the - * lockless vm_mapping_find() lookup. - */ - vmsm = vm_mapping_find(vms, addr, size, B_TRUE); - if (vmsm != NULL) { - struct vm_object *vmo = vmsm->vmsm_object; - - switch (vmo->vmo_type) { - case OBJT_DEFAULT: - result = vmmr_region_mem_at( - (vmmr_region_t *)vmo->vmo_data, - VMSM_OFFSET(vmsm, addr) & PAGEMASK); - break; - default: - break; - } - } - - return (result); -} - -static int -vmspace_pmap_iswired(struct vmspace *vms, uintptr_t addr, uint_t *prot) -{ - pmap_t pmap = &vms->vms_pmap; - int rv; - - ASSERT(MUTEX_HELD(&vms->vms_lock)); - - rv = pmap->pm_ops->vpo_is_wired(pmap->pm_impl, addr, prot); - return (rv); -} - -static void -pmap_free(pmap_t pmap) -{ - void *pmi = pmap->pm_impl; - struct vmm_pt_ops *ops = pmap->pm_ops; - - pmap->pm_pml4 = NULL; - pmap->pm_impl = NULL; - pmap->pm_ops = NULL; - - ops->vpo_free(pmi); -} - -int -pmap_pinit_type(pmap_t pmap, enum pmap_type type, int flags) -{ - /* For use in vmm only */ - pmap->pm_type = type; - switch (type) { - case PT_EPT: { - struct vmm_pt_ops *ops = &ept_ops; - void *pml4, *pmi; - - pmi = ops->vpo_init((uintptr_t *)&pml4); - - pmap->pm_ops = ops; - pmap->pm_impl = pmi; - pmap->pm_pml4 = pml4; - return (1); - } - case PT_RVI: { - struct vmm_pt_ops *ops = &rvi_ops; - void *pml4, *pmi; - - pmi = ops->vpo_init((uintptr_t *)&pml4); - - pmap->pm_ops = ops; - pmap->pm_impl = pmi; - pmap->pm_pml4 = pml4; - return (1); - } - default: - panic("unsupported pmap type: %x", type); - break; - } - - return (1); -} - -long -pmap_wired_count(pmap_t pmap) -{ - long val; - - val = pmap->pm_ops->vpo_wired_cnt(pmap->pm_impl); - VERIFY3S(val, >=, 0); - - return (val); -} - -int -pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) -{ - /* Allow the fallback to vm_fault to handle this */ - return (-1); -} - - - -struct sglist_ent { - vm_paddr_t sge_pa; - size_t sge_len; -}; -struct sglist { - kmutex_t sg_lock; - uint_t sg_refcnt; - uint_t sg_len; - uint_t sg_next; - struct sglist_ent sg_entries[]; -}; - -#define SG_SIZE(cnt) (sizeof (struct sglist) + \ - (sizeof (struct sglist_ent) * (cnt))) - -struct sglist * -sglist_alloc(int nseg, int flags) -{ - const size_t sz = SG_SIZE(nseg); - const int flag = (flags & M_WAITOK) ? 
KM_SLEEP : KM_NOSLEEP; - struct sglist *sg; - - ASSERT(nseg > 0); - - sg = kmem_zalloc(sz, flag); - if (sg != NULL) { - sg->sg_len = nseg; - sg->sg_refcnt = 1; - } - return (sg); -} - -void -sglist_free(struct sglist *sg) -{ - size_t sz; - - mutex_enter(&sg->sg_lock); - if (sg->sg_refcnt > 1) { - sg->sg_refcnt--; - mutex_exit(&sg->sg_lock); - return; - } - - VERIFY(sg->sg_refcnt == 1); - sg->sg_refcnt = 0; - sz = SG_SIZE(sg->sg_len); - mutex_exit(&sg->sg_lock); - kmem_free(sg, sz); -} - -int -sglist_append_phys(struct sglist *sg, vm_paddr_t pa, size_t len) -{ - uint_t idx; - struct sglist_ent *ent; - - /* Restrict to page-aligned entries */ - if ((pa & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0 || len == 0) { - return (EINVAL); - } - - mutex_enter(&sg->sg_lock); - idx = sg->sg_next; - if (idx >= sg->sg_len) { - mutex_exit(&sg->sg_lock); - return (ENOSPC); - } - - ent = &sg->sg_entries[idx]; - ASSERT(ent->sge_pa == 0 && ent->sge_len == 0); - ent->sge_pa = pa; - ent->sge_len = len; - sg->sg_next++; - - mutex_exit(&sg->sg_lock); - return (0); -} - - -static pfn_t -vm_object_pager_none(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl) -{ - panic("bad vm_object pager"); - return (PFN_INVALID); -} - -static pfn_t -vm_object_pager_reservoir(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, - uint_t *lvl) -{ - vmmr_region_t *region; - pfn_t pfn; - - ASSERT(vmo->vmo_type == OBJT_DEFAULT); - - region = vmo->vmo_data; - pfn = vmmr_region_pfn_at(region, off & PAGEMASK); - - /* TODO: handle large pages */ - if (lpfn != NULL) { - *lpfn = pfn; - } - if (lvl != NULL) { - *lvl = 0; - } - return (pfn); -} - -static pfn_t -vm_object_pager_sg(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl) -{ - const uintptr_t aoff = ALIGN2PAGE(off); - uint_t level = 0; - uintptr_t pos = 0; - struct sglist *sg; - struct sglist_ent *ent; - pfn_t pfn = PFN_INVALID; - - ASSERT(vmo->vmo_type == OBJT_SG); - ASSERT(off < vmo->vmo_size); - - sg = vmo->vmo_data; - if (sg == NULL) { - return (PFN_INVALID); - } - - ent = &sg->sg_entries[0]; - for (uint_t i = 0; i < sg->sg_next; i++, ent++) { - if (aoff >= pos && aoff < (pos + ent->sge_len)) { - /* XXXJOY: Punt on large pages for now */ - level = 0; - pfn = mmu_btop(ent->sge_pa + (aoff - pos)); - break; - } - pos += ent->sge_len; - } - - if (lpfn != 0) { - *lpfn = pfn; - } - if (lvl != 0) { - *lvl = level; - } - return (pfn); -} - -vm_object_t -vm_object_allocate(objtype_t type, vm_pindex_t psize, bool transient) -{ - vm_object_t vmo; - const size_t size = ptob((size_t)psize); - - vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP); - mutex_init(&vmo->vmo_lock, NULL, MUTEX_DEFAULT, NULL); - - /* For now, these are to stay fixed after allocation */ - vmo->vmo_type = type; - vmo->vmo_size = size; - vmo->vmo_attr = VM_MEMATTR_DEFAULT; - - switch (type) { - case OBJT_DEFAULT: { - - /* TODO: opt-in to larger pages? 
*/ - int err; - vmmr_region_t *region = NULL; - - err = vmmr_alloc(size, transient, ®ion); - if (err != 0) { - mutex_destroy(&vmo->vmo_lock); - kmem_free(vmo, sizeof (*vmo)); - return (NULL); - } - vmo->vmo_data = region; - vmo->vmo_pager = vm_object_pager_reservoir; - } - break; - case OBJT_SG: - vmo->vmo_data = NULL; - vmo->vmo_pager = vm_object_pager_sg; - break; - default: - panic("Unsupported vm_object type"); - break; - } - - vmo->vmo_refcnt = 1; - return (vmo); -} - -vm_object_t -vm_pager_allocate(objtype_t type, void *handle, vm_ooffset_t size, - vm_prot_t prot, vm_ooffset_t off, void *cred) -{ - struct vm_object *vmo; - struct sglist *sg = (struct sglist *)handle; - - /* XXXJOY: be very restrictive for now */ - VERIFY(type == OBJT_SG); - VERIFY(off == 0); - - vmo = vm_object_allocate(type, size, false); - vmo->vmo_data = sg; - - mutex_enter(&sg->sg_lock); - VERIFY(sg->sg_refcnt++ >= 1); - mutex_exit(&sg->sg_lock); - - return (vmo); -} - -void -vm_object_deallocate(vm_object_t vmo) -{ - ASSERT(vmo != NULL); - - uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt); - /* underflow would be a deadly serious mistake */ - VERIFY3U(ref, !=, UINT_MAX); - if (ref != 0) { - return; - } - - switch (vmo->vmo_type) { - case OBJT_DEFAULT: - vmmr_free((vmmr_region_t *)vmo->vmo_data); - break; - case OBJT_SG: - sglist_free((struct sglist *)vmo->vmo_data); - break; - default: - panic("Unsupported vm_object type"); - break; - } - - vmo->vmo_pager = vm_object_pager_none; - vmo->vmo_data = NULL; - vmo->vmo_size = 0; - mutex_destroy(&vmo->vmo_lock); - kmem_free(vmo, sizeof (*vmo)); -} - -CTASSERT(VM_MEMATTR_UNCACHEABLE == MTRR_TYPE_UC); -CTASSERT(VM_MEMATTR_WRITE_BACK == MTRR_TYPE_WB); -int -vm_object_set_memattr(vm_object_t vmo, vm_memattr_t attr) -{ - ASSERT(MUTEX_HELD(&vmo->vmo_lock)); - - switch (attr) { - case VM_MEMATTR_UNCACHEABLE: - case VM_MEMATTR_WRITE_BACK: - vmo->vmo_attr = attr; - return (0); - default: - break; - } - return (EINVAL); -} - -void -vm_object_reference(vm_object_t vmo) -{ - ASSERT(vmo != NULL); - - uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt); - /* overflow would be a deadly serious mistake */ - VERIFY3U(ref, !=, 0); -} - -pfn_t -vm_object_pfn(vm_object_t vmo, uintptr_t off) -{ - /* This is expected to be used only on reservoir-backed memory */ - if (vmo->vmo_type != OBJT_DEFAULT) { - return (PFN_INVALID); - } - - return (vmo->vmo_pager(vmo, off, NULL, NULL)); -} - -static vmspace_mapping_t * -vm_mapping_find(struct vmspace *vms, uintptr_t addr, size_t size, - boolean_t no_lock) -{ - vmspace_mapping_t *vmsm; - list_t *ml = &vms->vms_maplist; - const uintptr_t range_end = addr + size; - - ASSERT(addr <= range_end); - - if (no_lock) { - /* - * This check should be superflous with the protections - * promised by the bhyve logic which calls into the VM shim. - * All the same, it is cheap to be paranoid. 
- */ - VERIFY(!vms->vms_map_changing); - } else { - VERIFY(MUTEX_HELD(&vms->vms_lock)); - } - - if (addr >= vms->vms_size) { - return (NULL); - } - for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) { - const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len; - - if (addr >= vmsm->vmsm_addr && addr < seg_end) { - if (range_end <= seg_end) { - return (vmsm); - } else { - return (NULL); - } - } - } - return (NULL); -} - -static boolean_t -vm_mapping_gap(struct vmspace *vms, uintptr_t addr, size_t size) -{ - vmspace_mapping_t *vmsm; - list_t *ml = &vms->vms_maplist; - const uintptr_t range_end = addr + size - 1; - - ASSERT(MUTEX_HELD(&vms->vms_lock)); - ASSERT(size > 0); - - for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) { - const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len - 1; - - /* - * The two ranges do not overlap if the start of either of - * them is after the end of the other. - */ - if (vmsm->vmsm_addr > range_end || addr > seg_end) - continue; - return (B_FALSE); - } - return (B_TRUE); -} - -static void -vm_mapping_remove(struct vmspace *vms, vmspace_mapping_t *vmsm) -{ - list_t *ml = &vms->vms_maplist; - - ASSERT(MUTEX_HELD(&vms->vms_lock)); - ASSERT(vms->vms_map_changing); - - list_remove(ml, vmsm); - vm_object_deallocate(vmsm->vmsm_object); - kmem_free(vmsm, sizeof (*vmsm)); -} - -int -vm_fault(vm_map_t map, vm_offset_t off, vm_prot_t type, int flag) -{ - struct vmspace *vms = VMMAP_TO_VMSPACE(map); - pmap_t pmap = &vms->vms_pmap; - void *pmi = pmap->pm_impl; - const uintptr_t addr = off; - vmspace_mapping_t *vmsm; - struct vm_object *vmo; - uint_t prot, map_lvl; - pfn_t pfn; - uintptr_t map_addr; - - mutex_enter(&vms->vms_lock); - if (vmspace_pmap_iswired(vms, addr, &prot) == 0) { - int err = 0; - - /* - * It is possible that multiple vCPUs will race to fault-in a - * given address. In such cases, the race loser(s) will - * encounter the already-mapped page, needing to do nothing - * more than consider it a success. - * - * If the fault exceeds protection, it is an obvious error. - */ - if ((prot & type) != type) { - err = FC_PROT; - } - - mutex_exit(&vms->vms_lock); - return (err); - } - - /* Try to wire up the address */ - if ((vmsm = vm_mapping_find(vms, addr, 0, B_FALSE)) == NULL) { - mutex_exit(&vms->vms_lock); - return (FC_NOMAP); - } - vmo = vmsm->vmsm_object; - prot = vmsm->vmsm_prot; - - /* XXXJOY: punt on large pages for now */ - pfn = vmo->vmo_pager(vmo, VMSM_OFFSET(vmsm, addr), NULL, NULL); - map_lvl = 0; - map_addr = P2ALIGN((uintptr_t)addr, LEVEL_SIZE(map_lvl)); - VERIFY(pfn != PFN_INVALID); - - /* - * If pmap failure is to be handled, the previously acquired page locks - * would need to be released. - */ - VERIFY0(pmap->pm_ops->vpo_map(pmi, map_addr, pfn, map_lvl, prot, - vmo->vmo_attr)); - pmap->pm_eptgen++; - - mutex_exit(&vms->vms_lock); - return (0); -} - -int -vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len, - vm_prot_t prot, vm_page_t *ma, int max_count) -{ - struct vmspace *vms = VMMAP_TO_VMSPACE(map); - const uintptr_t vaddr = addr; - vmspace_mapping_t *vmsm; - struct vm_object *vmo; - vm_page_t vmp; - - ASSERT0(addr & PAGEOFFSET); - ASSERT(len == PAGESIZE); - ASSERT(max_count == 1); - - /* - * Unlike practically all of the other logic that queries or - * manipulates vmspace objects, vm_fault_quick_hold_pages() does so - * without holding vms_lock. 
This is safe because bhyve ensures that - * changes to the vmspace map occur only when all other threads have - * been excluded from running. - * - * Since this task can count on vms_maplist remaining static and does - * not need to modify the pmap (like vm_fault might), it can proceed - * without the lock. The vm_object has independent refcount and lock - * protection, while the vmo_pager methods do not rely on vms_lock for - * safety. - * - * Performing this work without locks is critical in cases where - * multiple vCPUs require simultaneous instruction emulation, such as - * for frequent guest APIC accesses on a host that lacks hardware - * acceleration for that behavior. - */ - if ((vmsm = vm_mapping_find(vms, vaddr, PAGESIZE, B_TRUE)) == NULL || - (prot & ~vmsm->vmsm_prot) != 0) { - return (-1); - } - - vmp = kmem_zalloc(sizeof (struct vm_page), KM_SLEEP); - - vmo = vmsm->vmsm_object; - vm_object_reference(vmo); - vmp->vmp_obj_held = vmo; - vmp->vmp_pfn = vmo->vmo_pager(vmo, VMSM_OFFSET(vmsm, vaddr), NULL, - NULL); - - *ma = vmp; - return (1); -} - -/* - * Find a suitable location for a mapping (and install it). - */ -int -vm_map_find(vm_map_t map, vm_object_t vmo, vm_ooffset_t off, vm_offset_t *addr, - vm_size_t len, vm_offset_t max_addr, int find_flags, vm_prot_t prot, - vm_prot_t prot_max, int cow) -{ - struct vmspace *vms = VMMAP_TO_VMSPACE(map); - const size_t size = (size_t)len; - const uintptr_t uoff = (uintptr_t)off; - uintptr_t base = *addr; - vmspace_mapping_t *vmsm; - int res = 0; - - /* For use in vmm only */ - VERIFY(find_flags == VMFS_NO_SPACE); /* essentially MAP_FIXED */ - VERIFY(max_addr == 0); - - if (size == 0 || off < 0 || - uoff >= (uoff + size) || vmo->vmo_size < (uoff + size)) { - return (EINVAL); - } - - if (*addr >= vms->vms_size) { - return (ENOMEM); - } - - vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP); - - mutex_enter(&vms->vms_lock); - vms->vms_map_changing = B_TRUE; - if (!vm_mapping_gap(vms, base, size)) { - res = ENOMEM; - goto out; - } - - if (res == 0) { - vmsm->vmsm_object = vmo; - vmsm->vmsm_addr = base; - vmsm->vmsm_len = len; - vmsm->vmsm_offset = (off_t)uoff; - vmsm->vmsm_prot = prot; - list_insert_tail(&vms->vms_maplist, vmsm); - - /* Communicate out the chosen address. 
*/ - *addr = (vm_offset_t)base; - } -out: - vms->vms_map_changing = B_FALSE; - mutex_exit(&vms->vms_lock); - if (res != 0) { - kmem_free(vmsm, sizeof (*vmsm)); - } - return (res); -} - -int -vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end) -{ - struct vmspace *vms = VMMAP_TO_VMSPACE(map); - pmap_t pmap = &vms->vms_pmap; - void *pmi = pmap->pm_impl; - const uintptr_t addr = start; - const size_t size = (size_t)(end - start); - vmspace_mapping_t *vmsm; - - ASSERT(start < end); - - mutex_enter(&vms->vms_lock); - vms->vms_map_changing = B_TRUE; - /* expect to match existing mapping exactly */ - if ((vmsm = vm_mapping_find(vms, addr, size, B_FALSE)) == NULL || - vmsm->vmsm_addr != addr || vmsm->vmsm_len != size) { - vms->vms_map_changing = B_FALSE; - mutex_exit(&vms->vms_lock); - return (ENOENT); - } - - (void) pmap->pm_ops->vpo_unmap(pmi, addr, end); - pmap->pm_eptgen++; - - vm_mapping_remove(vms, vmsm); - vms->vms_map_changing = B_FALSE; - mutex_exit(&vms->vms_lock); - return (0); -} - -int -vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) -{ - struct vmspace *vms = VMMAP_TO_VMSPACE(map); - pmap_t pmap = &vms->vms_pmap; - void *pmi = pmap->pm_impl; - const uintptr_t addr = start; - const size_t size = end - start; - vmspace_mapping_t *vmsm; - struct vm_object *vmo; - uint_t prot; - - mutex_enter(&vms->vms_lock); - - /* For the time being, only exact-match mappings are expected */ - if ((vmsm = vm_mapping_find(vms, addr, size, B_FALSE)) == NULL) { - mutex_exit(&vms->vms_lock); - return (FC_NOMAP); - } - vmo = vmsm->vmsm_object; - prot = vmsm->vmsm_prot; - - for (uintptr_t pos = addr; pos < end; ) { - pfn_t pfn; - uintptr_t pg_size, map_addr; - uint_t map_lvl = 0; - - /* XXXJOY: punt on large pages for now */ - pfn = vmo->vmo_pager(vmo, VMSM_OFFSET(vmsm, pos), NULL, NULL); - pg_size = LEVEL_SIZE(map_lvl); - map_addr = P2ALIGN(pos, pg_size); - VERIFY(pfn != PFN_INVALID); - - VERIFY0(pmap->pm_ops->vpo_map(pmi, map_addr, pfn, map_lvl, - prot, vmo->vmo_attr)); - vms->vms_pmap.pm_eptgen++; - - pos += pg_size; - } - - mutex_exit(&vms->vms_lock); - - return (0); -} - -/* Provided custom for bhyve 'devmem' segment mapping */ -int -vm_segmap_obj(vm_object_t vmo, off_t map_off, size_t size, struct as *as, - caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags) -{ - int err; - - VERIFY(map_off >= 0); - VERIFY(size <= vmo->vmo_size); - VERIFY((size + map_off) <= vmo->vmo_size); - - if (vmo->vmo_type != OBJT_DEFAULT) { - /* Only support default objects for now */ - return (ENOTSUP); - } - - as_rangelock(as); - - err = choose_addr(as, addrp, size, 0, ADDR_VACALIGN, flags); - if (err == 0) { - segvmm_crargs_t svma; - - svma.obj = vmo; - svma.offset = map_off; - svma.prot = prot; - - err = as_map(as, *addrp, size, segvmm_create, &svma); - } - - as_rangeunlock(as); - return (err); -} - -int -vm_segmap_space(struct vmspace *vms, off_t off, struct as *as, caddr_t *addrp, - off_t len, uint_t prot, uint_t maxprot, uint_t flags) -{ - const uintptr_t addr = (uintptr_t)off; - const size_t size = (uintptr_t)len; - vmspace_mapping_t *vmsm; - vm_object_t vmo; - int err; - - if (off < 0 || len <= 0 || - (addr & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) { - return (EINVAL); - } - - mutex_enter(&vms->vms_lock); - if ((vmsm = vm_mapping_find(vms, addr, size, B_FALSE)) == NULL) { - mutex_exit(&vms->vms_lock); - return (ENXIO); - } - if ((prot & ~(vmsm->vmsm_prot | PROT_USER)) != 0) { - mutex_exit(&vms->vms_lock); - return (EACCES); - } - vmo = vmsm->vmsm_object; - if 
(vmo->vmo_type != OBJT_DEFAULT) { - /* Only support default objects for now */ - mutex_exit(&vms->vms_lock); - return (ENOTSUP); - } - - as_rangelock(as); - - err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags); - if (err == 0) { - segvmm_crargs_t svma; - const uintptr_t addroff = addr - vmsm->vmsm_addr; - const uintptr_t mapoff = addroff + vmsm->vmsm_offset; - - VERIFY(addroff < vmsm->vmsm_len); - VERIFY((vmsm->vmsm_len - addroff) >= size); - VERIFY(mapoff < vmo->vmo_size); - VERIFY((mapoff + size) <= vmo->vmo_size); - - svma.obj = vmo; - svma.offset = mapoff; - svma.prot = prot; - - err = as_map(as, *addrp, len, segvmm_create, &svma); - } - - as_rangeunlock(as); - mutex_exit(&vms->vms_lock); - return (err); -} - -void -vm_page_unwire(vm_page_t vmp, uint8_t nqueue __unused) -{ - ASSERT(!MUTEX_HELD(&vmp->vmp_lock)); - mutex_enter(&vmp->vmp_lock); - - VERIFY(vmp->vmp_pfn != PFN_INVALID); - - vm_object_deallocate(vmp->vmp_obj_held); - vmp->vmp_obj_held = NULL; - vmp->vmp_pfn = PFN_INVALID; - - mutex_exit(&vmp->vmp_lock); - - mutex_destroy(&vmp->vmp_lock); - kmem_free(vmp, sizeof (*vmp)); -} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_vm.c b/usr/src/uts/i86pc/io/vmm/vmm_vm.c new file mode 100644 index 0000000000..debeec605a --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_vm.c @@ -0,0 +1,1430 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */ + +/* + * Copyright 2019 Joyent, Inc. + * Copyright 2021 Oxide Computer Company + * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. + */ + +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/thread.h> +#include <sys/list.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/ddi.h> +#include <sys/sysmacros.h> +#include <sys/machsystm.h> +#include <sys/vmsystm.h> +#include <sys/malloc.h> +#include <sys/x86_archext.h> +#include <vm/as.h> +#include <vm/hat_i86.h> +#include <vm/seg_vn.h> +#include <vm/seg_kmem.h> + +#include <sys/vmm_vm.h> +#include <sys/seg_vmm.h> +#include <sys/vmm_kernel.h> +#include <sys/vmm_reservoir.h> +#include <sys/vmm_gpt.h> + + +/* + * VMM Virtual Memory + * + * History + * + * When bhyve was ported to illumos, one significant hole was handling guest + * memory and memory accesses. In the original Pluribus port, bhyve itself + * manually handled the EPT structures for guest memory. The updated sources + * (from FreeBSD 11) took a different approach, using the native FreeBSD VM + * system for memory allocations and management of the EPT structures. Keeping + * source differences to a minimum was a priority, so illumos-bhyve implemented + * a makeshift "VM shim" which exposed the bare minimum of those interfaces to + * boot and run guests. + * + * While the VM shim was successful in getting illumos-bhyve to a functional + * state on Intel (and later AMD) gear, the FreeBSD-specific nature of the + * compatibility interfaces made it awkward to use. 
As source differences with + * the upstream kernel code became less of a concern, and upcoming features + * (such as live migration) would demand more of those VM interfaces, it became + * clear that an overhaul was prudent. + * + * Design + * + * The new VM system for bhyve retains a number of the same concepts as what it + * replaces: + * + * - `vmspace_t` is the top-level entity for a guest memory space + * - `vm_object_t` represents a memory object which can be mapped into a vmspace + * - `vm_page_t` represents a page hold within a given vmspace, providing access + * to the underlying memory page + * + * Unlike the old code, where most of the involved structures were exposed via + * public definitions, this replacement VM interface keeps all involved + * structures opaque to consumers. Furthermore, there is a clear delineation + * between infrequent administrative operations (such as mapping/unmapping + * regions) and common data-path operations (attempting a page hold at a given + * guest-physical address). Those administrative operations are performed + * directly against the vmspace, whereas the data-path operations are performed + * through a `vm_client_t` handle. That VM client abstraction is meant to + * reduce contention and overhead for frequent access operations and provide + * debugging insight into how different subcomponents are accessing the vmspace. + * A VM client is allocated for each vCPU, each viona ring (via the vmm_drv + * interface) and each VMM userspace segment mapping. + * + * Exclusion + * + * Making changes to the vmspace (such as mapping or unmapping regions) requires + * other accessors be excluded while the change is underway to prevent them from + * observing invalid intermediate states. A simple approach could use a mutex + * or rwlock to achieve this, but that risks contention when the rate of access + * to the vmspace is high. + * + * Since vmspace changes (map/unmap) are rare, we can instead do the exclusion + * at a per-vm_client_t basis. While this raises the cost for vmspace changes, + * it means that the much more common page accesses through the vm_client can + * normally proceed unimpeded and independently. + * + * When a change to the vmspace is required, the caller will put the vmspace in + * a 'hold' state, iterating over all associated vm_client instances, waiting + * for them to complete any in-flight lookup (indicated by VCS_ACTIVE) before + * setting VCS_HOLD in their state flag fields. With VCS_HOLD set, any call on + * the vm_client which would access the vmspace state (vmc_hold or vmc_fault) + * will block until the hold condition is cleared. Once the hold is asserted + * for all clients, the vmspace change can proceed with confidence. Upon + * completion of that operation, VCS_HOLD is cleared from the clients, and they + * are released to resume vmspace accesses. + * + * vCPU Consumers + * + * Access to the vmspace for vCPUs running in guest context is different from + * emulation-related vm_client activity: they solely rely on the contents of the + * page tables. Furthermore, the existing VCS_HOLD mechanism used to exclude + * client access is not feasible when entering guest context, since interrupts + * are disabled, making it impossible to block entry. This is not a concern as + * long as vmspace modifications never place the page tables in invalid states + * (either intermediate, or final). 
The vm_client hold mechanism does provide + * the means to IPI vCPU consumers which will trigger a notification once they + * report their exit from guest context. This can be used to ensure that page + * table modifications are made visible to those vCPUs within a certain + * time frame. + */ + +typedef struct vmspace_mapping { + list_node_t vmsm_node; + vm_object_t *vmsm_object; /* object backing this mapping */ + uintptr_t vmsm_addr; /* start addr in vmspace for mapping */ + size_t vmsm_len; /* length (in bytes) of mapping */ + off_t vmsm_offset; /* byte offset into object */ + uint_t vmsm_prot; +} vmspace_mapping_t; + +#define VMSM_OFFSET(vmsm, addr) ( \ + (vmsm)->vmsm_offset + \ + ((addr) - (uintptr_t)(vmsm)->vmsm_addr)) + +typedef enum vm_client_state { + VCS_IDLE = 0, + /* currently accessing vmspace for client operation (hold or fault) */ + VCS_ACTIVE = (1 << 0), + /* client hold requested/asserted */ + VCS_HOLD = (1 << 1), + /* vCPU is accessing page tables in guest context */ + VCS_ON_CPU = (1 << 2), + /* client has been orphaned (no more access to vmspace) */ + VCS_ORPHANED = (1 << 3), + /* client undergoing destroy operation */ + VCS_DESTROY = (1 << 4), +} vm_client_state_t; + +struct vmspace { + kmutex_t vms_lock; + kcondvar_t vms_cv; + bool vms_held; + uintptr_t vms_size; /* immutable after creation */ + + /* (nested) page table state */ + vmm_gpt_t *vms_gpt; + uint64_t vms_pt_gen; + uint64_t vms_pages_mapped; + bool vms_track_dirty; + + list_t vms_maplist; + list_t vms_clients; +}; + +struct vm_client { + vmspace_t *vmc_space; + list_node_t vmc_node; + + kmutex_t vmc_lock; + kcondvar_t vmc_cv; + vm_client_state_t vmc_state; + int vmc_cpu_active; + uint64_t vmc_cpu_gen; + bool vmc_track_dirty; + vmc_inval_cb_t vmc_inval_func; + void *vmc_inval_data; + + list_t vmc_held_pages; +}; + +typedef enum vm_object_type { + VMOT_NONE, + VMOT_MEM, + VMOT_MMIO, +} vm_object_type_t; + +struct vm_object { + uint_t vmo_refcnt; /* manipulated with atomic ops */ + + /* Fields below are fixed at creation time */ + vm_object_type_t vmo_type; + size_t vmo_size; + void *vmo_data; + uint8_t vmo_attr; +}; + +struct vm_page { + vm_client_t *vmp_client; + list_node_t vmp_node; + vm_page_t *vmp_chain; + uintptr_t vmp_gpa; + pfn_t vmp_pfn; + uint64_t *vmp_ptep; + vm_object_t *vmp_obj_ref; + int vmp_prot; +}; + +#define VMC_IS_ACTIVE(vmc) (((vmc)->vmc_state & VCS_ACTIVE) != 0) + +static vmspace_mapping_t *vm_mapping_find(vmspace_t *, uintptr_t, size_t); +static void vmc_space_hold(vm_client_t *); +static void vmc_space_release(vm_client_t *, bool); +static void vmc_space_invalidate(vm_client_t *, uintptr_t, size_t, uint64_t); +static void vmc_space_unmap(vm_client_t *, uintptr_t, size_t, vm_object_t *); +static vm_client_t *vmc_space_orphan(vm_client_t *, vmspace_t *); + + +/* + * Create a new vmspace with a maximum address of `end`. + */ +vmspace_t * +vmspace_alloc(size_t end, vmm_pte_ops_t *pte_ops, bool track_dirty) +{ + vmspace_t *vms; + const uintptr_t size = end + 1; + + /* + * This whole mess is built on the assumption that a 64-bit address + * space is available to work with for the various pagetable tricks. 
+ */ + VERIFY(size > 0 && (size & PAGEOFFSET) == 0 && + size <= (uintptr_t)USERLIMIT); + + vms = kmem_zalloc(sizeof (*vms), KM_SLEEP); + vms->vms_size = size; + list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t), + offsetof(vmspace_mapping_t, vmsm_node)); + list_create(&vms->vms_clients, sizeof (vm_client_t), + offsetof(vm_client_t, vmc_node)); + + vms->vms_gpt = vmm_gpt_alloc(pte_ops); + vms->vms_pt_gen = 1; + vms->vms_track_dirty = track_dirty; + + return (vms); +} + +/* + * Destroy a vmspace. All regions in the space must be unmapped. Any remaining + * clients will be orphaned. + */ +void +vmspace_destroy(vmspace_t *vms) +{ + mutex_enter(&vms->vms_lock); + VERIFY(list_is_empty(&vms->vms_maplist)); + + if (!list_is_empty(&vms->vms_clients)) { + vm_client_t *vmc = list_head(&vms->vms_clients); + while (vmc != NULL) { + vmc = vmc_space_orphan(vmc, vms); + } + /* + * Wait for any clients which were in the process of destroying + * themselves to disappear. + */ + while (!list_is_empty(&vms->vms_clients)) { + cv_wait(&vms->vms_cv, &vms->vms_lock); + } + } + VERIFY(list_is_empty(&vms->vms_clients)); + + vmm_gpt_free(vms->vms_gpt); + mutex_exit(&vms->vms_lock); + + mutex_destroy(&vms->vms_lock); + cv_destroy(&vms->vms_cv); + list_destroy(&vms->vms_maplist); + list_destroy(&vms->vms_clients); + + kmem_free(vms, sizeof (*vms)); +} + +/* + * Retrieve the count of resident (mapped into the page tables) pages. + */ +uint64_t +vmspace_resident_count(vmspace_t *vms) +{ + return (vms->vms_pages_mapped); +} + +static pfn_t +vm_object_pager_reservoir(vm_object_t *vmo, uintptr_t off) +{ + vmmr_region_t *region; + pfn_t pfn; + + ASSERT3U(vmo->vmo_type, ==, VMOT_MEM); + + region = vmo->vmo_data; + pfn = vmmr_region_pfn_at(region, off); + + return (pfn); +} + +static pfn_t +vm_object_pager_mmio(vm_object_t *vmo, uintptr_t off) +{ + pfn_t pfn; + + ASSERT3U(vmo->vmo_type, ==, VMOT_MMIO); + ASSERT3P(vmo->vmo_data, !=, NULL); + ASSERT3U(off, <, vmo->vmo_size); + + pfn = ((uintptr_t)vmo->vmo_data + off) >> PAGESHIFT; + + return (pfn); +} + +/* + * Allocate a VM object backed by VMM reservoir memory. + */ +vm_object_t * +vm_object_mem_allocate(size_t size, bool transient) +{ + int err; + vmmr_region_t *region = NULL; + vm_object_t *vmo; + + ASSERT3U(size, !=, 0); + ASSERT3U(size & PAGEOFFSET, ==, 0); + + err = vmmr_alloc(size, transient, ®ion); + if (err != 0) { + return (NULL); + } + + vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP); + + /* For now, these are to stay fixed after allocation */ + vmo->vmo_type = VMOT_MEM; + vmo->vmo_size = size; + vmo->vmo_attr = MTRR_TYPE_WB; + vmo->vmo_data = region; + vmo->vmo_refcnt = 1; + + return (vmo); +} + +static vm_object_t * +vm_object_mmio_allocate(size_t size, uintptr_t hpa) +{ + vm_object_t *vmo; + + ASSERT3U(size, !=, 0); + ASSERT3U(size & PAGEOFFSET, ==, 0); + ASSERT3U(hpa & PAGEOFFSET, ==, 0); + + vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP); + + /* For now, these are to stay fixed after allocation */ + vmo->vmo_type = VMOT_MMIO; + vmo->vmo_size = size; + vmo->vmo_attr = MTRR_TYPE_UC; + vmo->vmo_data = (void *)hpa; + vmo->vmo_refcnt = 1; + + return (vmo); +} + +/* + * Allocate a VM object backed by an existing range of physical memory. 
+ */ +vm_object_t * +vmm_mmio_alloc(vmspace_t *vmspace, uintptr_t gpa, size_t len, uintptr_t hpa) +{ + int error; + vm_object_t *obj; + + obj = vm_object_mmio_allocate(len, hpa); + if (obj != NULL) { + error = vmspace_map(vmspace, obj, 0, gpa, len, + PROT_READ | PROT_WRITE); + if (error != 0) { + vm_object_release(obj); + obj = NULL; + } + } + + return (obj); +} + +/* + * Release a vm_object reference + */ +void +vm_object_release(vm_object_t *vmo) +{ + ASSERT(vmo != NULL); + + uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt); + /* underflow would be a deadly serious mistake */ + VERIFY3U(ref, !=, UINT_MAX); + if (ref != 0) { + return; + } + + switch (vmo->vmo_type) { + case VMOT_MEM: + vmmr_free((vmmr_region_t *)vmo->vmo_data); + break; + case VMOT_MMIO: + break; + default: + panic("unexpected object type %u", vmo->vmo_type); + break; + } + + vmo->vmo_data = NULL; + vmo->vmo_size = 0; + kmem_free(vmo, sizeof (*vmo)); +} + +/* + * Increase refcount for vm_object reference + */ +void +vm_object_reference(vm_object_t *vmo) +{ + ASSERT(vmo != NULL); + + uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt); + /* overflow would be a deadly serious mistake */ + VERIFY3U(ref, !=, 0); +} + +/* + * Get the host-physical PFN for a given offset into a vm_object. + * + * The provided `off` must be within the allocated size of the vm_object. + */ +pfn_t +vm_object_pfn(vm_object_t *vmo, uintptr_t off) +{ + const uintptr_t aligned_off = off & PAGEMASK; + + switch (vmo->vmo_type) { + case VMOT_MEM: + return (vm_object_pager_reservoir(vmo, aligned_off)); + case VMOT_MMIO: + return (vm_object_pager_mmio(vmo, aligned_off)); + case VMOT_NONE: + break; + } + panic("unexpected object type %u", vmo->vmo_type); +} + +static vmspace_mapping_t * +vm_mapping_find(vmspace_t *vms, uintptr_t addr, size_t size) +{ + vmspace_mapping_t *vmsm; + list_t *ml = &vms->vms_maplist; + const uintptr_t range_end = addr + size; + + ASSERT3U(addr, <=, range_end); + + if (addr >= vms->vms_size) { + return (NULL); + } + for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) { + const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len; + + if (addr >= vmsm->vmsm_addr && addr < seg_end) { + if (range_end <= seg_end) { + return (vmsm); + } else { + return (NULL); + } + } + } + return (NULL); +} + +/* + * Check to see if any mappings reside within [addr, addr + size) span in the + * vmspace, returning true if that span is indeed empty. + */ +static bool +vm_mapping_gap(vmspace_t *vms, uintptr_t addr, size_t size) +{ + vmspace_mapping_t *vmsm; + list_t *ml = &vms->vms_maplist; + const uintptr_t range_end = addr + size - 1; + + ASSERT(MUTEX_HELD(&vms->vms_lock)); + ASSERT(size > 0); + + for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) { + const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len - 1; + + /* + * The two ranges do not overlap if the start of either of + * them is after the end of the other. + */ + if (vmsm->vmsm_addr > range_end || addr > seg_end) + continue; + return (false); + } + return (true); +} + +static void +vm_mapping_remove(vmspace_t *vms, vmspace_mapping_t *vmsm) +{ + list_t *ml = &vms->vms_maplist; + + ASSERT(MUTEX_HELD(&vms->vms_lock)); + ASSERT(vms->vms_held); + + list_remove(ml, vmsm); + vm_object_release(vmsm->vmsm_object); + kmem_free(vmsm, sizeof (*vmsm)); +} + +/* + * Enter a hold state on the vmspace. 
This ensures that all VM clients + * associated with the vmspace are excluded from establishing new page holds, + * or any other actions which would require accessing vmspace state subject to + * potential change. + * + * Returns with vmspace_t`vms_lock held. + */ +static void +vmspace_hold_enter(vmspace_t *vms) +{ + mutex_enter(&vms->vms_lock); + VERIFY(!vms->vms_held); + + vm_client_t *vmc = list_head(&vms->vms_clients); + for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) { + vmc_space_hold(vmc); + } + vms->vms_held = true; +} + +/* + * Exit a hold state on the vmspace. This releases all VM clients associated + * with the vmspace to be able to establish new page holds, and partake in other + * actions which require accessing changed vmspace state. If `kick_on_cpu` is + * true, then any CPUs actively using the page tables will be IPIed, and the + * call will block until they have acknowledged being ready to use the latest + * state of the tables. + * + * Requires vmspace_t`vms_lock be held, which is released as part of the call. + */ +static void +vmspace_hold_exit(vmspace_t *vms, bool kick_on_cpu) +{ + ASSERT(MUTEX_HELD(&vms->vms_lock)); + VERIFY(vms->vms_held); + + vm_client_t *vmc = list_head(&vms->vms_clients); + for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) { + vmc_space_release(vmc, kick_on_cpu); + } + vms->vms_held = false; + mutex_exit(&vms->vms_lock); +} + +/* + * Attempt to map a vm_object span into the vmspace. + * + * Requirements: + * - `obj_off`, `addr`, and `len` must be page-aligned + * - `obj_off` cannot be greater than the allocated size of the object + * - [`obj_off`, `obj_off` + `len`) span cannot extend beyond the allocated + * size of the object + * - [`addr`, `addr` + `len`) span cannot reside beyond the maximum address + * of the vmspace + */ +int +vmspace_map(vmspace_t *vms, vm_object_t *vmo, uintptr_t obj_off, uintptr_t addr, + size_t len, uint8_t prot) +{ + vmspace_mapping_t *vmsm; + int res = 0; + + if (len == 0 || (addr + len) < addr || + obj_off >= (obj_off + len) || vmo->vmo_size < (obj_off + len)) { + return (EINVAL); + } + if ((addr + len) >= vms->vms_size) { + return (ENOMEM); + } + + vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP); + + vmspace_hold_enter(vms); + if (!vm_mapping_gap(vms, addr, len)) { + kmem_free(vmsm, sizeof (*vmsm)); + res = ENOMEM; + } else { + vmsm->vmsm_object = vmo; + vmsm->vmsm_addr = addr; + vmsm->vmsm_len = len; + vmsm->vmsm_offset = (off_t)obj_off; + vmsm->vmsm_prot = prot; + list_insert_tail(&vms->vms_maplist, vmsm); + + /* + * Make sure the GPT has tables ready for leaf entries across + * the entire new mapping. + */ + vmm_gpt_populate_region(vms->vms_gpt, addr, addr + len); + } + vmspace_hold_exit(vms, false); + return (res); +} + +/* + * Unmap a region of the vmspace. + * + * Presently the [start, end) span must equal a region previously mapped by a + * call to vmspace_map(). + */ +int +vmspace_unmap(vmspace_t *vms, uintptr_t start, uintptr_t end) +{ + const size_t size = (size_t)(end - start); + vmspace_mapping_t *vmsm; + vm_client_t *vmc; + uint64_t gen = 0; + + ASSERT(start < end); + + vmspace_hold_enter(vms); + /* expect to match existing mapping exactly */ + if ((vmsm = vm_mapping_find(vms, start, size)) == NULL || + vmsm->vmsm_addr != start || vmsm->vmsm_len != size) { + vmspace_hold_exit(vms, false); + return (ENOENT); + } + + /* Prepare clients (and their held pages) for the unmap. 
*/ + for (vmc = list_head(&vms->vms_clients); vmc != NULL; + vmc = list_next(&vms->vms_clients, vmc)) { + vmc_space_unmap(vmc, start, size, vmsm->vmsm_object); + } + + /* Clear all PTEs for region */ + if (vmm_gpt_unmap_region(vms->vms_gpt, start, end) != 0) { + vms->vms_pt_gen++; + gen = vms->vms_pt_gen; + } + /* ... and the intermediate (directory) PTEs as well */ + vmm_gpt_vacate_region(vms->vms_gpt, start, end); + + /* + * If pages were actually unmapped from the GPT, provide clients with + * an invalidation notice. + */ + if (gen != 0) { + for (vmc = list_head(&vms->vms_clients); vmc != NULL; + vmc = list_next(&vms->vms_clients, vmc)) { + vmc_space_invalidate(vmc, start, size, vms->vms_pt_gen); + } + } + + vm_mapping_remove(vms, vmsm); + vmspace_hold_exit(vms, true); + return (0); +} + +static int +vmspace_lookup_map(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp, + uint64_t **ptepp) +{ + vmm_gpt_t *gpt = vms->vms_gpt; + uint64_t *entries[MAX_GPT_LEVEL], *leaf; + pfn_t pfn = PFN_INVALID; + uint_t prot; + + ASSERT0(gpa & PAGEOFFSET); + ASSERT((req_prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) != PROT_NONE); + + vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL); + leaf = entries[LEVEL1]; + if (leaf == NULL) { + /* + * Since we populated the intermediate tables for any regions + * mapped in the GPT, an empty leaf entry indicates there is no + * mapping, populated or not, at this GPT. + */ + return (FC_NOMAP); + } + + if (vmm_gpt_is_mapped(gpt, leaf, &pfn, &prot)) { + if ((req_prot & prot) != req_prot) { + return (FC_PROT); + } + } else { + vmspace_mapping_t *vmsm; + vm_object_t *vmo; + + /* + * Because of the prior leaf check, we should be confident that + * _some_ mapping covers this GPA + */ + vmsm = vm_mapping_find(vms, gpa, PAGESIZE); + VERIFY(vmsm != NULL); + + if ((req_prot & vmsm->vmsm_prot) != req_prot) { + return (FC_PROT); + } + vmo = vmsm->vmsm_object; + pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa)); + VERIFY(pfn != PFN_INVALID); + + if (vmm_gpt_map_at(gpt, leaf, pfn, vmsm->vmsm_prot, + vmo->vmo_attr)) { + atomic_inc_64(&vms->vms_pages_mapped); + } + } + + ASSERT(pfn != PFN_INVALID && leaf != NULL); + if (pfnp != NULL) { + *pfnp = pfn; + } + if (ptepp != NULL) { + *ptepp = leaf; + } + return (0); +} + +/* + * Populate (make resident in the page tables) a region of the vmspace. + * + * Presently the [start, end) span must equal a region previously mapped by a + * call to vmspace_map(). + */ +int +vmspace_populate(vmspace_t *vms, uintptr_t start, uintptr_t end) +{ + const size_t size = end - start; + vmspace_mapping_t *vmsm; + + mutex_enter(&vms->vms_lock); + + /* For the time being, only exact-match mappings are expected */ + if ((vmsm = vm_mapping_find(vms, start, size)) == NULL) { + mutex_exit(&vms->vms_lock); + return (FC_NOMAP); + } + + vm_object_t *vmo = vmsm->vmsm_object; + const int prot = vmsm->vmsm_prot; + const uint8_t attr = vmo->vmo_attr; + size_t populated = 0; + for (uintptr_t gpa = start & PAGEMASK; gpa < end; gpa += PAGESIZE) { + const pfn_t pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa)); + VERIFY(pfn != PFN_INVALID); + + if (vmm_gpt_map(vms->vms_gpt, gpa, pfn, prot, attr)) { + populated++; + } + } + atomic_add_64(&vms->vms_pages_mapped, populated); + + mutex_exit(&vms->vms_lock); + return (0); +} + +/* + * Allocate a client from a given vmspace. 
+ */
+vm_client_t *
+vmspace_client_alloc(vmspace_t *vms)
+{
+	vm_client_t *vmc;
+
+	vmc = kmem_zalloc(sizeof (vm_client_t), KM_SLEEP);
+	vmc->vmc_space = vms;
+	mutex_init(&vmc->vmc_lock, NULL, MUTEX_DRIVER, NULL);
+	cv_init(&vmc->vmc_cv, NULL, CV_DRIVER, NULL);
+	vmc->vmc_state = VCS_IDLE;
+	vmc->vmc_cpu_active = -1;
+	list_create(&vmc->vmc_held_pages, sizeof (vm_page_t),
+	    offsetof(vm_page_t, vmp_node));
+	vmc->vmc_track_dirty = vms->vms_track_dirty;
+
+	mutex_enter(&vms->vms_lock);
+	list_insert_tail(&vms->vms_clients, vmc);
+	mutex_exit(&vms->vms_lock);
+
+	return (vmc);
+}
+
+/*
+ * Get the nested page table root pointer (EPTP/NCR3) value.
+ */
+uint64_t
+vmspace_table_root(vmspace_t *vms)
+{
+	return (vmm_gpt_get_pmtp(vms->vms_gpt));
+}
+
+/*
+ * Get the current generation number of the nested page table.
+ */
+uint64_t
+vmspace_table_gen(vmspace_t *vms)
+{
+	return (vms->vms_pt_gen);
+}
+
+/*
+ * Mark a vm_client as active. This will block if/while the client is held by
+ * the vmspace. On success, it returns with vm_client_t`vmc_lock held. It will
+ * fail if the vm_client has been orphaned.
+ */
+static int
+vmc_activate(vm_client_t *vmc)
+{
+	mutex_enter(&vmc->vmc_lock);
+	VERIFY0(vmc->vmc_state & VCS_ACTIVE);
+	if ((vmc->vmc_state & VCS_ORPHANED) != 0) {
+		mutex_exit(&vmc->vmc_lock);
+		return (ENXIO);
+	}
+	while ((vmc->vmc_state & VCS_HOLD) != 0) {
+		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
+	}
+	vmc->vmc_state |= VCS_ACTIVE;
+	return (0);
+}
+
+/*
+ * Mark a vm_client as no longer active. It must be called with
+ * vm_client_t`vmc_lock already held, and will return with it released.
+ */
+static void
+vmc_deactivate(vm_client_t *vmc)
+{
+	ASSERT(MUTEX_HELD(&vmc->vmc_lock));
+	VERIFY(vmc->vmc_state & VCS_ACTIVE);
+
+	vmc->vmc_state ^= VCS_ACTIVE;
+	if ((vmc->vmc_state & VCS_HOLD) != 0) {
+		cv_broadcast(&vmc->vmc_cv);
+	}
+	mutex_exit(&vmc->vmc_lock);
+}
+
+/*
+ * Indicate that a CPU will be utilizing the nested page tables through this VM
+ * client. Interrupts (and/or the GIF) are expected to be disabled when calling
+ * this function. Returns the generation number of the nested page table (to be
+ * used for TLB invalidations).
+ */
+uint64_t
+vmc_table_enter(vm_client_t *vmc)
+{
+	vmspace_t *vms = vmc->vmc_space;
+	uint64_t gen;
+
+	ASSERT0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
+	ASSERT3S(vmc->vmc_cpu_active, ==, -1);
+
+	/*
+	 * Since the NPT activation occurs with interrupts disabled, this must
+	 * be done without taking vmc_lock like normal.
+	 */
+	gen = vms->vms_pt_gen;
+	vmc->vmc_cpu_active = CPU->cpu_id;
+	vmc->vmc_cpu_gen = gen;
+	atomic_or_uint(&vmc->vmc_state, VCS_ON_CPU);
+
+	return (gen);
+}
+
+/*
+ * Indicate that this VM client is no longer (directly) using the underlying
+ * page tables. Interrupts (and/or the GIF) must be enabled prior to calling
+ * this function.
+ */
+void
+vmc_table_exit(vm_client_t *vmc)
+{
+	mutex_enter(&vmc->vmc_lock);
+
+	ASSERT(vmc->vmc_state & VCS_ON_CPU);
+	vmc->vmc_state ^= VCS_ON_CPU;
+	vmc->vmc_cpu_active = -1;
+	if ((vmc->vmc_state & VCS_HOLD) != 0) {
+		cv_broadcast(&vmc->vmc_cv);
+	}
+
+	mutex_exit(&vmc->vmc_lock);
+}
+
+static void
+vmc_space_hold(vm_client_t *vmc)
+{
+	mutex_enter(&vmc->vmc_lock);
+	VERIFY0(vmc->vmc_state & VCS_HOLD);
+
+	/*
+	 * Because vmc_table_enter() alters vmc_state from a context where
+	 * interrupts are disabled, it cannot pay heed to vmc_lock, so setting
+	 * VCS_HOLD must be done atomically here.
+	 */
+	atomic_or_uint(&vmc->vmc_state, VCS_HOLD);
+
+	/* Wait for client to go inactive */
+	while ((vmc->vmc_state & VCS_ACTIVE) != 0) {
+		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
+	}
+	mutex_exit(&vmc->vmc_lock);
+}
+
+static void
+vmc_space_release(vm_client_t *vmc, bool kick_on_cpu)
+{
+	mutex_enter(&vmc->vmc_lock);
+	VERIFY(vmc->vmc_state & VCS_HOLD);
+
+	if (kick_on_cpu && (vmc->vmc_state & VCS_ON_CPU) != 0) {
+		poke_cpu(vmc->vmc_cpu_active);
+
+		while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
+			cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
+		}
+	}
+
+	/*
+	 * Because vmc_table_enter() alters vmc_state from a context where
+	 * interrupts are disabled, it cannot pay heed to vmc_lock, so clearing
+	 * VCS_HOLD must be done atomically here.
+	 */
+	atomic_and_uint(&vmc->vmc_state, ~VCS_HOLD);
+	mutex_exit(&vmc->vmc_lock);
+}
+
+static void
+vmc_space_invalidate(vm_client_t *vmc, uintptr_t addr, size_t size,
+    uint64_t gen)
+{
+	mutex_enter(&vmc->vmc_lock);
+	VERIFY(vmc->vmc_state & VCS_HOLD);
+	if ((vmc->vmc_state & VCS_ON_CPU) != 0) {
+		/*
+		 * Wait for clients using an old generation of the page tables
+		 * to exit guest context, where they subsequently flush the TLB
+		 * for the new generation.
+		 */
+		if (vmc->vmc_cpu_gen < gen) {
+			poke_cpu(vmc->vmc_cpu_active);
+
+			while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
+				cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
+			}
+		}
+	}
+	if (vmc->vmc_inval_func != NULL) {
+		vmc_inval_cb_t func = vmc->vmc_inval_func;
+		void *data = vmc->vmc_inval_data;
+
+		/*
+		 * Perform the actual invalidation call outside vmc_lock to
+		 * avoid lock ordering issues in the consumer. Since the client
+		 * is under VCS_HOLD, this is safe.
+		 */
+		mutex_exit(&vmc->vmc_lock);
+		func(data, addr, size);
+		mutex_enter(&vmc->vmc_lock);
+	}
+	mutex_exit(&vmc->vmc_lock);
+}
+
+static void
+vmc_space_unmap(vm_client_t *vmc, uintptr_t addr, size_t size,
+    vm_object_t *vmo)
+{
+	mutex_enter(&vmc->vmc_lock);
+	VERIFY(vmc->vmc_state & VCS_HOLD);
+
+	/*
+	 * With the current vCPU exclusion invariants in place, we do not expect
+	 * a vCPU to be in guest context during an unmap.
+	 */
+	VERIFY0(vmc->vmc_state & VCS_ON_CPU);
+
+	/*
+	 * Any holds against the unmapped region need to establish their own
+	 * reference to the underlying object to avoid a potential
+	 * use-after-free.
+	 */
+	for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
+	    vmp != NULL;
+	    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
+		if (vmp->vmp_gpa < addr ||
+		    vmp->vmp_gpa >= (addr + size)) {
+			/* Hold outside region in question */
+			continue;
+		}
+		if (vmp->vmp_obj_ref == NULL) {
+			vm_object_reference(vmo);
+			vmp->vmp_obj_ref = vmo;
+			/* For an unmapped region, PTE is now meaningless */
+			vmp->vmp_ptep = NULL;
+		} else {
+			/*
+			 * Object could have gone through cycle of
+			 * unmap-map-unmap before the hold was released.
+			 */
+			VERIFY3P(vmp->vmp_ptep, ==, NULL);
+		}
+	}
+	mutex_exit(&vmc->vmc_lock);
+}
+
+static vm_client_t *
+vmc_space_orphan(vm_client_t *vmc, vmspace_t *vms)
+{
+	vm_client_t *next;
+
+	ASSERT(MUTEX_HELD(&vms->vms_lock));
+
+	mutex_enter(&vmc->vmc_lock);
+	VERIFY3P(vmc->vmc_space, ==, vms);
+	VERIFY0(vmc->vmc_state & VCS_ORPHANED);
+	if (vmc->vmc_state & VCS_DESTROY) {
+		/*
+		 * This vm_client is currently undergoing destruction, so it
+		 * does not need to be orphaned. Let it proceed with its own
+		 * clean-up task.
+		 */
+		next = list_next(&vms->vms_clients, vmc);
+	} else {
+		/*
+		 * Clients are only orphaned when the containing vmspace is
+		 * being torn down. All mappings from the vmspace should
+		 * already be gone, meaning any remaining held pages should have
+		 * direct references to the object.
+		 */
+		for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
+		    vmp != NULL;
+		    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
+			ASSERT3P(vmp->vmp_ptep, ==, NULL);
+			ASSERT3P(vmp->vmp_obj_ref, !=, NULL);
+		}
+
+		/*
+		 * After this point, the client will be orphaned, unable to
+		 * establish new page holds (or access any vmspace-related
+		 * resources) and is in charge of cleaning up after itself.
+		 */
+		vmc->vmc_state |= VCS_ORPHANED;
+		next = list_next(&vms->vms_clients, vmc);
+		list_remove(&vms->vms_clients, vmc);
+		vmc->vmc_space = NULL;
+	}
+	mutex_exit(&vmc->vmc_lock);
+	return (next);
+}
+
+/*
+ * Attempt to hold a page at `gpa` inside the referenced vmspace.
+ */
+vm_page_t *
+vmc_hold(vm_client_t *vmc, uintptr_t gpa, int prot)
+{
+	vmspace_t *vms = vmc->vmc_space;
+	vm_page_t *vmp;
+	pfn_t pfn = PFN_INVALID;
+	uint64_t *ptep = NULL;
+
+	ASSERT0(gpa & PAGEOFFSET);
+	ASSERT((prot & (PROT_READ | PROT_WRITE)) != PROT_NONE);
+
+	vmp = kmem_alloc(sizeof (*vmp), KM_SLEEP);
+	if (vmc_activate(vmc) != 0) {
+		kmem_free(vmp, sizeof (*vmp));
+		return (NULL);
+	}
+
+	if (vmspace_lookup_map(vms, gpa, prot, &pfn, &ptep) != 0) {
+		vmc_deactivate(vmc);
+		kmem_free(vmp, sizeof (*vmp));
+		return (NULL);
+	}
+	ASSERT(pfn != PFN_INVALID && ptep != NULL);
+
+	vmp->vmp_client = vmc;
+	vmp->vmp_chain = NULL;
+	vmp->vmp_gpa = gpa;
+	vmp->vmp_pfn = pfn;
+	vmp->vmp_ptep = ptep;
+	vmp->vmp_obj_ref = NULL;
+	vmp->vmp_prot = prot;
+	list_insert_tail(&vmc->vmc_held_pages, vmp);
+	vmc_deactivate(vmc);
+
+	return (vmp);
+}
+
+int
+vmc_fault(vm_client_t *vmc, uintptr_t gpa, int prot)
+{
+	vmspace_t *vms = vmc->vmc_space;
+	int err;
+
+	err = vmc_activate(vmc);
+	if (err == 0) {
+		err = vmspace_lookup_map(vms, gpa & PAGEMASK, prot, NULL, NULL);
+		vmc_deactivate(vmc);
+	}
+
+	return (err);
+}
+
+/*
+ * Allocate an additional vm_client_t, based on an existing one. Only the
+ * association with the vmspace is cloned, not existing holds or any
+ * configured invalidation function.
+ */
+vm_client_t *
+vmc_clone(vm_client_t *vmc)
+{
+	vmspace_t *vms = vmc->vmc_space;
+
+	return (vmspace_client_alloc(vms));
+}
+
+/*
+ * Register a function (and associated data pointer) to be called when an
+ * address range in the vmspace is invalidated.
+ */
+int
+vmc_set_inval_cb(vm_client_t *vmc, vmc_inval_cb_t func, void *data)
+{
+	int err;
+
+	err = vmc_activate(vmc);
+	if (err == 0) {
+		vmc->vmc_inval_func = func;
+		vmc->vmc_inval_data = data;
+		vmc_deactivate(vmc);
+	}
+
+	return (err);
+}
+
+/*
+ * Destroy a vm_client_t instance.
+ *
+ * No pages held through this vm_client_t may be outstanding when performing a
+ * vmc_destroy(). For vCPU clients, the client cannot be on-CPU (a call to
+ * vmc_table_exit() has been made).
+ */
+void
+vmc_destroy(vm_client_t *vmc)
+{
+	mutex_enter(&vmc->vmc_lock);
+
+	VERIFY(list_is_empty(&vmc->vmc_held_pages));
+	VERIFY0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
+
+	if ((vmc->vmc_state & VCS_ORPHANED) == 0) {
+		vmspace_t *vms;
+
+		/*
+		 * Deassociation with the parent vmspace must be done carefully:
+		 * The vmspace could attempt to orphan this vm_client while we
+		 * release vmc_lock in order to take vms_lock (the required
+		 * order). The client is marked to indicate that destruction is
+		 * under way. Doing so prevents any racing orphan operation
+		 * from applying to this client, allowing us to deassociate from
+		 * the vmspace safely.
+ */ + vmc->vmc_state |= VCS_DESTROY; + vms = vmc->vmc_space; + mutex_exit(&vmc->vmc_lock); + + mutex_enter(&vms->vms_lock); + mutex_enter(&vmc->vmc_lock); + list_remove(&vms->vms_clients, vmc); + /* + * If the vmspace began its own destruction operation while we + * were navigating the locks, be sure to notify it about this + * vm_client being deassociated. + */ + cv_signal(&vms->vms_cv); + mutex_exit(&vmc->vmc_lock); + mutex_exit(&vms->vms_lock); + } else { + VERIFY3P(vmc->vmc_space, ==, NULL); + mutex_exit(&vmc->vmc_lock); + } + + mutex_destroy(&vmc->vmc_lock); + cv_destroy(&vmc->vmc_cv); + list_destroy(&vmc->vmc_held_pages); + + kmem_free(vmc, sizeof (*vmc)); +} + +static __inline void * +vmp_ptr(const vm_page_t *vmp) +{ + ASSERT3U(vmp->vmp_pfn, !=, PFN_INVALID); + + const uintptr_t paddr = (vmp->vmp_pfn << PAGESHIFT); + return ((void *)((uintptr_t)kpm_vbase + paddr)); +} + +/* + * Get a readable kernel-virtual pointer for a held page. + * + * Only legal to call if PROT_READ was specified in `prot` for the vmc_hold() + * call to acquire this page reference. + */ +const void * +vmp_get_readable(const vm_page_t *vmp) +{ + ASSERT(vmp->vmp_prot & PROT_READ); + + return (vmp_ptr(vmp)); +} + +/* + * Get a writable kernel-virtual pointer for a held page. + * + * Only legal to call if PROT_WRITE was specified in `prot` for the vmc_hold() + * call to acquire this page reference. + */ +void * +vmp_get_writable(const vm_page_t *vmp) +{ + ASSERT(vmp->vmp_prot & PROT_WRITE); + + return (vmp_ptr(vmp)); +} + +/* + * Get the host-physical PFN for a held page. + */ +pfn_t +vmp_get_pfn(const vm_page_t *vmp) +{ + return (vmp->vmp_pfn); +} + +/* + * Store a pointer to `to_chain` in the page-chaining slot of `vmp`. + */ +void +vmp_chain(vm_page_t *vmp, vm_page_t *to_chain) +{ + ASSERT3P(vmp->vmp_chain, ==, NULL); + + vmp->vmp_chain = to_chain; +} + +/* + * Retrieve the pointer from the page-chaining in `vmp`. + */ +vm_page_t * +vmp_next(const vm_page_t *vmp) +{ + return (vmp->vmp_chain); +} + +static __inline bool +vmp_release_inner(vm_page_t *vmp, vm_client_t *vmc) +{ + ASSERT(MUTEX_HELD(&vmc->vmc_lock)); + + bool was_unmapped = false; + + list_remove(&vmc->vmc_held_pages, vmp); + if (vmp->vmp_obj_ref != NULL) { + ASSERT3P(vmp->vmp_ptep, ==, NULL); + + vm_object_release(vmp->vmp_obj_ref); + was_unmapped = true; + } else { + ASSERT3P(vmp->vmp_ptep, !=, NULL); + + if ((vmp->vmp_prot & PROT_WRITE) != 0 && vmc->vmc_track_dirty) { + vmm_gpt_t *gpt = vmc->vmc_space->vms_gpt; + vmm_gpt_reset_dirty(gpt, vmp->vmp_ptep, true); + } + } + kmem_free(vmp, sizeof (*vmp)); + return (was_unmapped); +} + +/* + * Release held page. Returns true if page resided on region which was + * subsequently unmapped. + */ +bool +vmp_release(vm_page_t *vmp) +{ + vm_client_t *vmc = vmp->vmp_client; + + VERIFY(vmc != NULL); + + mutex_enter(&vmc->vmc_lock); + const bool was_unmapped = vmp_release_inner(vmp, vmc); + mutex_exit(&vmc->vmc_lock); + return (was_unmapped); +} + +/* + * Release a chain of pages which were associated via vmp_chain() (setting + * page-chaining pointer). Returns true if any pages resided upon a region + * which was subsequently unmapped. + * + * All of those pages must have been held through the same vm_client_t. 
+ */ +bool +vmp_release_chain(vm_page_t *vmp) +{ + vm_client_t *vmc = vmp->vmp_client; + bool any_unmapped = false; + + ASSERT(vmp != NULL); + + mutex_enter(&vmc->vmc_lock); + while (vmp != NULL) { + vm_page_t *next = vmp->vmp_chain; + + /* We expect all pages in chain to be from same client */ + ASSERT3P(vmp->vmp_client, ==, vmc); + + if (vmp_release_inner(vmp, vmc)) { + any_unmapped = true; + } + vmp = next; + } + mutex_exit(&vmc->vmc_lock); + return (any_unmapped); +} + + +int +vm_segmap_obj(struct vm *vm, int segid, off_t segoff, off_t len, + struct as *as, caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags) +{ + vm_object_t *vmo; + int err; + + if (segoff < 0 || len <= 0 || + (segoff & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0) { + return (EINVAL); + } + if ((prot & PROT_USER) == 0) { + return (ENOTSUP); + } + err = vm_get_memseg(vm, segid, NULL, NULL, &vmo); + if (err != 0) { + return (err); + } + + VERIFY(segoff >= 0); + VERIFY(len <= vmo->vmo_size); + VERIFY((len + segoff) <= vmo->vmo_size); + + if (vmo->vmo_type != VMOT_MEM) { + /* Only support memory objects for now */ + return (ENOTSUP); + } + + as_rangelock(as); + + err = choose_addr(as, addrp, (size_t)len, 0, ADDR_VACALIGN, flags); + if (err == 0) { + segvmm_crargs_t svma; + + svma.prot = prot; + svma.offset = segoff; + svma.vmo = vmo; + svma.vmc = NULL; + + err = as_map(as, *addrp, (size_t)len, segvmm_create, &svma); + } + + as_rangeunlock(as); + return (err); +} + +int +vm_segmap_space(struct vm *vm, off_t off, struct as *as, caddr_t *addrp, + off_t len, uint_t prot, uint_t maxprot, uint_t flags) +{ + + const uintptr_t gpa = (uintptr_t)off; + const size_t size = (uintptr_t)len; + int err; + + if (off < 0 || len <= 0 || + (gpa & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) { + return (EINVAL); + } + if ((prot & PROT_USER) == 0) { + return (ENOTSUP); + } + + as_rangelock(as); + + err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags); + if (err == 0) { + segvmm_crargs_t svma; + + svma.prot = prot; + svma.offset = gpa; + svma.vmo = NULL; + svma.vmc = vmspace_client_alloc(vm_get_vmspace(vm)); + + err = as_map(as, *addrp, len, segvmm_create, &svma); + } + + as_rangeunlock(as); + return (err); +} diff --git a/usr/src/uts/intel/sys/x86_archext.h b/usr/src/uts/intel/sys/x86_archext.h index 31b63dfe69..f1241a9183 100644 --- a/usr/src/uts/intel/sys/x86_archext.h +++ b/usr/src/uts/intel/sys/x86_archext.h @@ -526,9 +526,21 @@ extern "C" { #define IA32_VMX_PROCBASED2_VPID (1UL << 5) #define MSR_IA32_VMX_EPT_VPID_CAP 0x48c -#define IA32_VMX_EPT_VPID_INVEPT (1UL << 20) -#define IA32_VMX_EPT_VPID_INVEPT_SINGLE (1UL << 25) -#define IA32_VMX_EPT_VPID_INVEPT_ALL (1UL << 26) +#define IA32_VMX_EPT_VPID_EXEC_ONLY (1UL << 0) +#define IA32_VMX_EPT_VPID_PWL4 (1UL << 6) +#define IA32_VMX_EPT_VPID_TYPE_UC (1UL << 8) +#define IA32_VMX_EPT_VPID_TYPE_WB (1UL << 14) +#define IA32_VMX_EPT_VPID_MAP_2M (1UL << 16) +#define IA32_VMX_EPT_VPID_MAP_1G (1UL << 17) +#define IA32_VMX_EPT_VPID_HW_AD (1UL << 21) +#define IA32_VMX_EPT_VPID_INVEPT (1UL << 20) +#define IA32_VMX_EPT_VPID_INVEPT_SINGLE (1UL << 25) +#define IA32_VMX_EPT_VPID_INVEPT_ALL (1UL << 26) +#define IA32_VMX_EPT_VPID_INVVPID (1UL << 32) +#define IA32_VMX_EPT_VPID_INVVPID_ADDR (1UL << 40) +#define IA32_VMX_EPT_VPID_INVVPID_SINGLE (1UL << 41) +#define IA32_VMX_EPT_VPID_INVVPID_ALL (1UL << 42) +#define IA32_VMX_EPT_VPID_INVVPID_RETAIN (1UL << 43) /* * Intel TSX Control MSRs |
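The IA32_VMX_EPT_VPID_* additions above name capability bits reported through MSR_IA32_VMX_EPT_VPID_CAP. As a rough, hypothetical sketch (not code from this commit), a VMX backend could gate EPT dirty-bit tracking on those bits along these lines; the helper name vmx_ept_dirty_tracking_ok() is invented here, and the availability of rdmsr() via the included headers is assumed:

	/*
	 * Hypothetical sketch, not part of this change: consult the EPT/VPID
	 * capability MSR for baseline EPT support plus hardware accessed/dirty
	 * bit support before enabling dirty tracking.
	 */
	#include <sys/types.h>
	#include <sys/archsystm.h>	/* rdmsr() -- assumed location */
	#include <sys/x86_archext.h>

	static boolean_t
	vmx_ept_dirty_tracking_ok(void)
	{
		const uint64_t cap = rdmsr(MSR_IA32_VMX_EPT_VPID_CAP);

		/* Require 4-level walks, write-back paging structures, INVEPT */
		if ((cap & IA32_VMX_EPT_VPID_PWL4) == 0 ||
		    (cap & IA32_VMX_EPT_VPID_TYPE_WB) == 0 ||
		    (cap & IA32_VMX_EPT_VPID_INVEPT) == 0) {
			return (B_FALSE);
		}

		/* Hardware A/D bits are what make EPT dirty tracking possible */
		return ((cap & IA32_VMX_EPT_VPID_HW_AD) != 0);
	}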
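For the vm_client/vm_page interfaces added earlier in this change, a consumer follows a hold/access/release pattern. The sketch below is illustrative only: guest_write_page() is an invented name, the usual bhyve kernel-module headers are assumed to be in scope, and error handling is minimal.

	/*
	 * Hypothetical usage sketch, not part of this change: hold a
	 * page-aligned guest physical page for writing, copy data into it
	 * through the pointer returned by vmp_get_writable(), then release
	 * the hold.
	 */
	static int
	guest_write_page(vm_client_t *vmc, uintptr_t gpa, const void *src,
	    size_t len)
	{
		vm_page_t *vmp;

		ASSERT0(gpa & PAGEOFFSET);
		ASSERT3U(len, <=, PAGESIZE);

		vmp = vmc_hold(vmc, gpa, PROT_WRITE);
		if (vmp == NULL) {
			return (EFAULT);
		}
		bcopy(src, vmp_get_writable(vmp), len);
		(void) vmp_release(vmp);
		return (0);
	}

Note that when vmc_track_dirty is set, vmp_release() (via vmp_release_inner()) handles the dirty-bit bookkeeping for a PROT_WRITE hold, so a caller like this does not touch the GPT directly.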