| field | value | date |
|---|---|---|
| author | Dan McDonald <danmcd@joyent.com> | 2021-07-27 01:17:26 -0400 |
| committer | Dan McDonald <danmcd@joyent.com> | 2021-07-27 01:17:26 -0400 |
| commit | 359c7fef8f9958b51fa2670e2e52e3d3c1eb9bf4 (patch) | |
| tree | 2aedc787e8af163a45c617e337cf0538e8268289 /usr/src | |
| parent | bf777172e611181d7a2838b4e6b59d72913aa3ff (diff) | |
| parent | b4ceea05088ba1b5fae1914544a808623516aa80 (diff) | |
| download | illumos-joyent-359c7fef8f9958b51fa2670e2e52e3d3c1eb9bf4.tar.gz | |
[illumos-gate merge]
commit b4ceea05088ba1b5fae1914544a808623516aa80
13932 improve bhyve second level page table support
13862 EPT/RVI supports resetting A/D bits
Diffstat (limited to 'usr/src')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | usr/src/uts/i86pc/Makefile.files | 3 |
| -rw-r--r-- | usr/src/uts/i86pc/io/vmm/sys/vmm_gpt.h | 104 |
| -rw-r--r-- | usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h | 12 |
| -rw-r--r-- | usr/src/uts/i86pc/io/vmm/vmm_gpt.c | 558 |
| -rw-r--r-- | usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c | 321 |
| -rw-r--r-- | usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c | 369 |
| -rw-r--r-- | usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c | 1 |
| -rw-r--r-- | usr/src/uts/i86pc/os/gipt.c | 568 |
| -rw-r--r-- | usr/src/uts/i86pc/sys/gipt.h | 92 |
9 files changed, 965 insertions, 1063 deletions
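Before reading the diff: the heart of change 13862 is resetting the hardware-maintained accessed/dirty (A/D) bits in a PTE while the CPU may be setting them concurrently, which both new backends do with a compare-and-swap loop (`ept_reset_bits()`/`rvi_reset_bits()` below). A minimal user-space sketch of that pattern, using C11 atomics in place of the kernel's `atomic_cas_64()`; the function name and types here are illustrative, not part of the commit:

```c
#include <stdatomic.h>
#include <stdint.h>

/*
 * Replace the bits covered by `mask` with `bits` in a PTE-like word,
 * returning the previous value of the masked bits.  This mirrors the
 * shape of the commit's ept_reset_bits()/rvi_reset_bits(), but uses C11
 * atomics so it can be compiled and tested outside the kernel.
 */
static uint64_t
reset_bits(_Atomic uint64_t *entry, uint64_t mask, uint64_t bits)
{
	uint64_t old = atomic_load(entry);
	uint64_t new;

	do {
		new = (old & ~mask) | bits;
		/* On failure, `old` is reloaded with the current value. */
	} while (!atomic_compare_exchange_weak(entry, &old, new));

	return (old & mask);
}
```

The loop retries until the swap succeeds against whatever value hardware last wrote, so no concurrent A/D update is lost.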
diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files
index fd74b0047f..4370e90d9a 100644
--- a/usr/src/uts/i86pc/Makefile.files
+++ b/usr/src/uts/i86pc/Makefile.files
@@ -26,6 +26,7 @@
 # Copyright 2019 Joyent, Inc.
 # Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
 # Copyright 2019 Joyent, Inc.
+# Copyright 2021 Oxide Computer Company
 #
 # This Makefile defines file modules in the directory uts/i86pc
 # and its children. These are the source files which are i86pc
@@ -269,7 +270,7 @@ VMM_OBJS += vmm.o \
 	vmcb.o \
 	svm_support.o \
 	amdv.o \
-	gipt.o \
+	vmm_gpt.o \
 	vmm_sol_vm.o \
 	vmm_sol_glue.o \
 	vmm_sol_ept.o \
diff --git a/usr/src/uts/i86pc/io/vmm/sys/vmm_gpt.h b/usr/src/uts/i86pc/io/vmm/sys/vmm_gpt.h
new file mode 100644
index 0000000000..554f51bbb6
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_gpt.h
@@ -0,0 +1,104 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ * Copyright 2021 Oxide Computer Company
+ */
+
+#ifndef _VMM_GPT_H
+#define	_VMM_GPT_H
+
+#include <sys/types.h>
+
+typedef struct vmm_pt_ops vmm_pt_ops_t;
+struct vmm_pt_ops {
+	void * (*vpo_init)(uint64_t *);
+	void (*vpo_free)(void *);
+	uint64_t (*vpo_wired_cnt)(void *);
+	int (*vpo_is_wired)(void *, uint64_t, uint_t *);
+	int (*vpo_map)(void *, uint64_t, pfn_t, uint_t, uint_t,
+	    uint8_t);
+	uint64_t (*vpo_unmap)(void *, uint64_t, uint64_t);
+};
+
+extern struct vmm_pt_ops ept_ops;
+extern struct vmm_pt_ops rvi_ops;
+
+/*
+ * Constants for the nodes in the GPT radix tree. Note
+ * that, in accordance with hardware page table descriptions,
+ * the root of the tree is referred to as "LEVEL4" while the
+ * leaf level is "LEVEL1".
+ */
+enum vmm_gpt_node_level {
+	LEVEL4 = 0,
+	LEVEL3,
+	LEVEL2,
+	LEVEL1,
+	MAX_GPT_LEVEL,
+};
+
+/*
+ * The vmm_pte_ops structure contains function pointers for format-specific
+ * operations on page table entries. The operations are as follows:
+ *
+ * vpeo_map_table: Creates a PTE that maps an inner node in the page table.
+ * vpeo_map_page: Creates a leaf entry PTE that maps a page of physical memory.
+ * vpeo_pte_pfn: Returns the PFN contained in the given PTE.
+ * vpeo_pte_is_present: Returns true IFF the PTE maps a present page.
+ * vpeo_pte_prot: Returns a bitmask of protection bits for the PTE.
+ *   The bits correspond to the standard mmap(2) bits: PROT_READ, PROT_WRITE,
+ *   PROT_EXEC.
+ * vpeo_reset_dirty: Resets the dirty bit on the given PTE. If the second
+ *   argument is `true`, the bit will be set, otherwise it will be cleared.
+ *   Returns non-zero if the previous value of the bit was set.
+ * vpeo_reset_accessed: Resets the accessed bit on the given PTE. If the
+ *   second argument is `true`, the bit will be set, otherwise it will be
+ *   cleared. Returns non-zero if the previous value of the bit was set.
+ */
+typedef struct vmm_pte_ops vmm_pte_ops_t;
+struct vmm_pte_ops {
+	uint64_t (*vpeo_map_table)(pfn_t);
+	uint64_t (*vpeo_map_page)(pfn_t, uint_t, uint8_t);
+	pfn_t (*vpeo_pte_pfn)(uint64_t);
+	bool (*vpeo_pte_is_present)(uint64_t);
+	uint_t (*vpeo_pte_prot)(uint64_t);
+	uint_t (*vpeo_reset_dirty)(uint64_t *, bool);
+	uint_t (*vpeo_reset_accessed)(uint64_t *, bool);
+};
+
+struct vmm_gpt;
+typedef struct vmm_gpt vmm_gpt_t;
+
+vmm_gpt_t *ept_create(void);
+vmm_gpt_t *rvi_create(void);
+
+vmm_gpt_t *vmm_gpt_alloc(vmm_pte_ops_t *);
+void vmm_gpt_free(vmm_gpt_t *);
+
+void *vmm_gpt_root_kaddr(vmm_gpt_t *);
+pfn_t vmm_gpt_root_pfn(vmm_gpt_t *);
+uint64_t *vmm_gpt_lookup(vmm_gpt_t *, uint64_t);
+void vmm_gpt_walk(vmm_gpt_t *, uint64_t, uint64_t **, enum vmm_gpt_node_level);
+void vmm_gpt_populate_entry(vmm_gpt_t *, uint64_t);
+void vmm_gpt_populate_region(vmm_gpt_t *, uint64_t, uint64_t);
+void vmm_gpt_vacate_region(vmm_gpt_t *, uint64_t, uint64_t);
+bool vmm_gpt_map(vmm_gpt_t *, uint64_t, pfn_t, uint_t, uint8_t);
+bool vmm_gpt_unmap(vmm_gpt_t *, uint64_t);
+size_t vmm_gpt_unmap_region(vmm_gpt_t *, uint64_t, uint64_t);
+
+bool vmm_gpt_is_mapped(vmm_gpt_t *, uint64_t, uint_t *);
+size_t vmm_gpt_mapped_count(vmm_gpt_t *);
+uint_t vmm_gpt_reset_accessed(vmm_gpt_t *, uint64_t *, bool);
+uint_t vmm_gpt_reset_dirty(vmm_gpt_t *, uint64_t *, bool);
+
+#endif /* _VMM_GPT_H */
diff --git a/usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h b/usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h
index 43249a6ac7..6c7f9d423e 100644
--- a/usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h
+++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h
@@ -163,18 +163,6 @@ void *vmspace_find_kva(struct vmspace *, uintptr_t, size_t);
 void vmm_arena_init(void);
 void vmm_arena_fini(void);
 
-struct vmm_pt_ops {
-	void * (*vpo_init)(uint64_t *);
-	void (*vpo_free)(void *);
-	uint64_t (*vpo_wired_cnt)(void *);
-	int (*vpo_is_wired)(void *, uint64_t, uint_t *);
-	int (*vpo_map)(void *, uint64_t, pfn_t, uint_t, uint_t, uint8_t);
-	uint64_t (*vpo_unmap)(void *, uint64_t, uint64_t);
-};
-
-extern struct vmm_pt_ops ept_ops;
-extern struct vmm_pt_ops rvi_ops;
-
 typedef int (*pmap_pinit_t)(struct pmap *pmap);
 
 struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t, pmap_pinit_t);
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_gpt.c b/usr/src/uts/i86pc/io/vmm/vmm_gpt.c
new file mode 100644
index 0000000000..6624e0fa6d
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm_gpt.c
@@ -0,0 +1,558 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ * Copyright 2021 Oxide Computer Company
+ */
+
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/atomic.h>
+#include <sys/kmem.h>
+#include <sys/sysmacros.h>
+#include <sys/sunddi.h>
+#include <sys/panic.h>
+#include <vm/hat.h>
+#include <vm/as.h>
+#include <vm/hat_i86.h>
+
+#include <sys/vmm_gpt.h>
+
+/*
+ * VMM Generic Page Tables
+ *
+ * Bhyve runs on AMD and Intel hosts and both support nested page tables
+ * describing the guest's physical address space.
+ * But the two use different and mutually incompatible page table formats:
+ * Intel uses the EPT, which is based on the Itanium page table format, while
+ * AMD uses the nPT, which is based on the x86_64 page table format.
+ *
+ * The GPT abstracts these format differences, and provides a single interface
+ * for interacting with either kind of table structure.
+ *
+ * At a high-level, the GPT is a tree that mirrors the paging table radix tree.
+ * It is parameterized with operations on PTEs that are specific to the table
+ * type (EPT or nPT) and also keeps track of how many pages the table maps, as
+ * well as a pointer to the root node in the tree.
+ *
+ * A node in the GPT keeps pointers to its parent (NULL for the root), its
+ * left-most child, and its rightward siblings. The node understands its
+ * position in the tree in terms of the level it appears at and the index it
+ * occupies at its parent's level, as well as how many children it has. It
+ * also owns the physical memory page for the hardware page table entries that
+ * map its children. Thus, for a node at any given level in the tree, the
+ * nested PTE for that node's child at index $i$ is the i'th uint64_t in that
+ * node's entry page and the entry page is part of the paging structure
+ * consumed by hardware.
+ *
+ * The GPT interface provides functions for populating and vacating the tree
+ * for regions in the guest physical address space, and for mapping and
+ * unmapping pages in populated regions. Users must populate a region before
+ * mapping pages into it, and must unmap pages before vacating the region.
+ *
+ * The interface also exposes a function for walking the table from the root
+ * to a leaf entry, populating an array of pointers to PTEs. This walk uses
+ * the hardware page structure itself, and is thus fast, though as a result it
+ * potentially aliases entries; caveat emptor. The walk primitive is used for
+ * mapping, unmapping, and lookups.
+ *
+ * Format-specific differences are abstracted by parameterizing the GPT with a
+ * set of PTE operations specific to the platform. The GPT code makes use of
+ * these when mapping or populating entries, resetting accessed and dirty bits
+ * on entries, and similar operations.
+ */
+
+/*
+ * A GPT node.
+ *
+ * Each node contains pointers to its parent, its left-most child, and its
+ * rightward siblings. Interior nodes also maintain a reference count, and
+ * each node contains its level and index in its parent's table. Finally,
+ * each node contains the host PFN of the page that it links into the page
+ * table, as well as a kernel pointer to the table.
+ *
+ * Note, this is carefully sized to fit exactly into a 64-byte cache line.
+ */
+typedef struct vmm_gpt_node vmm_gpt_node_t;
+struct vmm_gpt_node {
+	uint64_t	vgn_host_pfn;
+	uint16_t	vgn_level;
+	uint16_t	vgn_index;
+	uint32_t	vgn_ref_cnt;
+	vmm_gpt_node_t	*vgn_parent;
+	vmm_gpt_node_t	*vgn_children;
+	vmm_gpt_node_t	*vgn_siblings;
+	uint64_t	*vgn_entries;
+	uint64_t	_vgn_pad[2];
+};
+
+/*
+ * A VMM Generic Page Table.
+ *
+ * The generic page table is a format-agnostic, 4-level paging structure
+ * modeling a second-level page table (EPT on Intel, nPT on AMD). It
+ * contains a counter of pages the table maps, a pointer to the root node
+ * in the table, and is parameterized with a set of PTE operations specific
+ * to the table type.
+ */
+struct vmm_gpt {
+	vmm_gpt_node_t	*vgpt_root;
+	vmm_pte_ops_t	*vgpt_pte_ops;
+	uint64_t	vgpt_mapped_page_count;
+};
+
+/*
+ * VMM Guest Page Tables
+ */
+
+/*
+ * Allocates a vmm_gpt_node_t structure with corresponding page of memory to
+ * hold the PTEs it contains.
+ */
+static vmm_gpt_node_t *
+vmm_gpt_node_alloc(void)
+{
+	vmm_gpt_node_t *node;
+	caddr_t page;
+
+	node = kmem_zalloc(sizeof (*node), KM_SLEEP);
+	/*
+	 * Note: despite the man page, allocating PAGESIZE bytes is
+	 * guaranteed to be page-aligned.
+	 */
+	page = kmem_zalloc(PAGESIZE, KM_SLEEP);
+	node->vgn_entries = (uint64_t *)page;
+	node->vgn_host_pfn = hat_getpfnum(kas.a_hat, page);
+
+	return (node);
+}
+
+/*
+ * Allocates and initializes a vmm_gpt_t.
+ */
+vmm_gpt_t *
+vmm_gpt_alloc(vmm_pte_ops_t *pte_ops)
+{
+	vmm_gpt_t *gpt;
+
+	VERIFY(pte_ops != NULL);
+	gpt = kmem_zalloc(sizeof (*gpt), KM_SLEEP);
+	gpt->vgpt_pte_ops = pte_ops;
+	gpt->vgpt_root = vmm_gpt_node_alloc();
+
+	return (gpt);
+}
+
+/*
+ * Retrieves the host kernel address of the GPT root.
+ */
+void *
+vmm_gpt_root_kaddr(vmm_gpt_t *gpt)
+{
+	return (gpt->vgpt_root->vgn_entries);
+}
+
+/*
+ * Retrieves the host PFN of the GPT root.
+ */
+uint64_t
+vmm_gpt_root_pfn(vmm_gpt_t *gpt)
+{
+	return (gpt->vgpt_root->vgn_host_pfn);
+}
+
+/*
+ * Frees the given node, first nulling out all of its links to other nodes in
+ * the tree, adjusting its parent's reference count, and unlinking itself from
+ * its parent's page table.
+ */
+static void
+vmm_gpt_node_free(vmm_gpt_node_t *node)
+{
+	ASSERT(node != NULL);
+	ASSERT3U(node->vgn_ref_cnt, ==, 0);
+	ASSERT(node->vgn_host_pfn != PFN_INVALID);
+	ASSERT(node->vgn_entries != NULL);
+	if (node->vgn_parent != NULL) {
+		uint64_t *parent_entries = node->vgn_parent->vgn_entries;
+		parent_entries[node->vgn_index] = 0;
+		node->vgn_parent->vgn_ref_cnt--;
+	}
+	kmem_free(node->vgn_entries, PAGESIZE);
+	kmem_free(node, sizeof (*node));
+}
+
+/*
+ * Frees the portion of the radix tree rooted at the given node.
+ */
+static void
+vmm_gpt_node_tree_free(vmm_gpt_node_t *node)
+{
+	ASSERT(node != NULL);
+
+	for (vmm_gpt_node_t *child = node->vgn_children, *next = NULL;
+	    child != NULL;
+	    child = next) {
+		next = child->vgn_siblings;
+		vmm_gpt_node_tree_free(child);
+	}
+	vmm_gpt_node_free(node);
+}
+
+/*
+ * Cleans up a vmm_gpt_t by removing any lingering vmm_gpt_node_t entries
+ * it refers to.
+ */
+void
+vmm_gpt_free(vmm_gpt_t *gpt)
+{
+	vmm_gpt_node_tree_free(gpt->vgpt_root);
+	kmem_free(gpt, sizeof (*gpt));
+}
+
+/*
+ * Return the index in the paging structure for the given level.
+ */
+static inline uint16_t
+vmm_gpt_node_index(uint64_t gpa, enum vmm_gpt_node_level level)
+{
+	const int SHIFTS[MAX_GPT_LEVEL] = { 39, 30, 21, 12 };
+	const uint_t MASK = (1U << 9) - 1;
+	ASSERT(level < MAX_GPT_LEVEL);
+	return ((gpa >> SHIFTS[level]) & MASK);
+}
+
+/*
+ * Finds the child for the given GPA in the given parent node.
+ * Returns a pointer to node, or NULL if it is not found.
+ */
+static vmm_gpt_node_t *
+vmm_gpt_node_find_child(vmm_gpt_node_t *parent, uint64_t gpa)
+{
+	if (parent == NULL)
+		return (NULL);
+
+	const uint16_t index = vmm_gpt_node_index(gpa, parent->vgn_level);
+	for (vmm_gpt_node_t *child = parent->vgn_children;
+	    child != NULL && child->vgn_index <= index;
+	    child = child->vgn_siblings) {
+		if (child->vgn_index == index)
+			return (child);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Walks the GPT for the given GPA, accumulating entries to the given depth.
+ * If the walk terminates before the depth is reached, the remaining entries
+ * are written with NULLs.
+ */
+void
+vmm_gpt_walk(vmm_gpt_t *gpt, uint64_t gpa, uint64_t **entries,
+    enum vmm_gpt_node_level depth)
+{
+	uint64_t *current_entries, entry;
+	pfn_t pfn;
+
+	ASSERT(gpt != NULL);
+	current_entries = gpt->vgpt_root->vgn_entries;
+	for (uint_t i = 0; i < depth; i++) {
+		if (current_entries == NULL) {
+			entries[i] = NULL;
+			continue;
+		}
+		entries[i] = &current_entries[vmm_gpt_node_index(gpa, i)];
+		entry = *entries[i];
+		if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) {
+			current_entries = NULL;
+			continue;
+		}
+		pfn = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry);
+		current_entries = (uint64_t *)hat_kpm_pfn2va(pfn);
+	}
+}
+
+/*
+ * Looks up an entry given GPA.
+ */
+uint64_t *
+vmm_gpt_lookup(vmm_gpt_t *gpt, uint64_t gpa)
+{
+	uint64_t *entries[MAX_GPT_LEVEL];
+
+	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
+
+	return (entries[LEVEL1]);
+}
+
+/*
+ * Adds a node for the given GPA to the GPT as a child of the given parent.
+ */
+static void
+vmm_gpt_add_child(vmm_gpt_t *gpt, vmm_gpt_node_t *parent,
+    vmm_gpt_node_t *child, uint64_t gpa)
+{
+	vmm_gpt_node_t **prevp;
+	vmm_gpt_node_t *node;
+	uint64_t *parent_entries, entry;
+
+	ASSERT(gpt != NULL);
+	ASSERT(gpt->vgpt_pte_ops != NULL);
+	ASSERT(parent != NULL);
+	ASSERT(child != NULL);
+
+	const int index = vmm_gpt_node_index(gpa, parent->vgn_level);
+	child->vgn_index = index;
+	child->vgn_level = parent->vgn_level + 1;
+	child->vgn_parent = parent;
+	parent_entries = parent->vgn_entries;
+	entry = gpt->vgpt_pte_ops->vpeo_map_table(child->vgn_host_pfn);
+	parent_entries[index] = entry;
+
+	for (prevp = &parent->vgn_children, node = parent->vgn_children;
+	    node != NULL;
+	    prevp = &node->vgn_siblings, node = node->vgn_siblings) {
+		if (node->vgn_index > child->vgn_index) {
+			break;
+		}
+	}
+	if (node != NULL)
+		ASSERT3U(node->vgn_index, !=, child->vgn_index);
+	child->vgn_siblings = node;
+	*prevp = child;
+	parent->vgn_ref_cnt++;
+}
+
+/*
+ * Populate the GPT with nodes so that entries for the given GPA exist. Note
+ * that this does not actually map the entry, but simply ensures that the
+ * entries exist.
+ */
+void
+vmm_gpt_populate_entry(vmm_gpt_t *gpt, uint64_t gpa)
+{
+	vmm_gpt_node_t *node, *child;
+
+	ASSERT(gpt != NULL);
+	node = gpt->vgpt_root;
+	for (uint_t i = 0; i < LEVEL1; i++) {
+		ASSERT(node != NULL);
+		child = vmm_gpt_node_find_child(node, gpa);
+		if (child == NULL) {
+			child = vmm_gpt_node_alloc();
+			ASSERT(child != NULL);
+			vmm_gpt_add_child(gpt, node, child, gpa);
+		}
+		node = child;
+	}
+}
+
+/*
+ * Ensures that PTEs for the region of address space bounded by
+ * [start, end] exist in the tree.
+ */
+void
+vmm_gpt_populate_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
+{
+	for (uint64_t page = start; page <= end; page += PAGESIZE) {
+		vmm_gpt_populate_entry(gpt, page);
+	}
+}
+
+/*
+ * Inserts an entry for a given GPA into the table. The caller must
+ * ensure that the entry is not currently mapped, though note that this
+ * can race with another thread inserting the same page into the tree.
+ * If we lose the race, we ensure that the page we thought we were
+ * inserting is the page that was inserted.
+ */
+bool
+vmm_gpt_map(vmm_gpt_t *gpt, uint64_t gpa, pfn_t pfn, uint_t prot, uint8_t attr)
+{
+	uint64_t *entries[MAX_GPT_LEVEL], entry, old_entry;
+
+	ASSERT(gpt != NULL);
+	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
+	ASSERT(entries[LEVEL1] != NULL);
+
+	entry = gpt->vgpt_pte_ops->vpeo_map_page(pfn, prot, attr);
+	old_entry = atomic_cas_64(entries[LEVEL1], 0, entry);
+	if (old_entry != 0) {
+		ASSERT3U(gpt->vgpt_pte_ops->vpeo_pte_pfn(entry),
+		    ==,
+		    gpt->vgpt_pte_ops->vpeo_pte_pfn(old_entry));
+		return (false);
+	}
+	gpt->vgpt_mapped_page_count++;
+
+	return (true);
+}
+
+/*
+ * Removes a child node from its parent's list of children, and then frees
+ * the now-orphaned child.
+ */
+static void
+vmm_gpt_node_remove_child(vmm_gpt_node_t *parent, vmm_gpt_node_t *child)
+{
+	ASSERT(parent != NULL);
+
+	ASSERT3P(child->vgn_children, ==, NULL);
+	vmm_gpt_node_t **prevp = &parent->vgn_children;
+	for (vmm_gpt_node_t *node = parent->vgn_children;
+	    node != NULL;
+	    prevp = &node->vgn_siblings, node = node->vgn_siblings) {
+		if (node == child) {
+			*prevp = node->vgn_siblings;
+			vmm_gpt_node_free(node);
+			break;
+		}
+	}
+}
+
+/*
+ * Cleans up unused inner nodes in the GPT. Asserts that the
+ * leaf corresponding to the entry does not map any additional
+ * pages.
+ */
+static void
+vmm_gpt_vacate_entry(vmm_gpt_t *gpt, uint64_t gpa)
+{
+	vmm_gpt_node_t *nodes[MAX_GPT_LEVEL], *node;
+
+	node = gpt->vgpt_root;
+	for (uint_t i = 0; i < MAX_GPT_LEVEL; i++) {
+		nodes[i] = node;
+		node = vmm_gpt_node_find_child(node, gpa);
+	}
+	if (nodes[LEVEL1] != NULL) {
+		uint64_t *ptes = nodes[LEVEL1]->vgn_entries;
+		for (uint_t i = 0; i < (PAGESIZE / sizeof (uint64_t)); i++)
+			ASSERT3U(ptes[i], ==, 0);
+	}
+	for (uint_t i = LEVEL1; i > 0; i--) {
+		if (nodes[i] == NULL)
+			continue;
+		if (nodes[i]->vgn_ref_cnt != 0)
+			break;
+		vmm_gpt_node_remove_child(nodes[i - 1], nodes[i]);
+	}
+}
+
+/*
+ * Cleans up the unused inner nodes in the GPT for a region of guest
+ * physical address space bounded by `start` and `end`. The region
+ * must map no pages.
+ */
+void
+vmm_gpt_vacate_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
+{
+	for (uint64_t page = start; page <= end; page += PAGESIZE) {
+		vmm_gpt_vacate_entry(gpt, page);
+	}
+}
+
+/*
+ * Remove a mapping from the table. Returns false if the page was not
+ * mapped, otherwise returns true.
+ */
+bool
+vmm_gpt_unmap(vmm_gpt_t *gpt, uint64_t gpa)
+{
+	uint64_t *entries[MAX_GPT_LEVEL], entry;
+	bool was_mapped;
+
+	ASSERT(gpt != NULL);
+	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
+	if (entries[LEVEL1] == NULL)
+		return (false);
+
+	entry = *entries[LEVEL1];
+	*entries[LEVEL1] = 0;
+	was_mapped = gpt->vgpt_pte_ops->vpeo_pte_is_present(entry);
+	if (was_mapped)
+		gpt->vgpt_mapped_page_count--;
+
+	return (was_mapped);
+}
+
+/*
+ * Un-maps the region of guest physical address space bounded by
+ * start and end. Returns the number of pages that are unmapped.
+ */
+size_t
+vmm_gpt_unmap_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
+{
+	size_t n = 0;
+
+	for (uint64_t page = start; page <= end; page += PAGESIZE) {
+		if (vmm_gpt_unmap(gpt, page) != 0)
+			n++;
+	}
+
+	return (n);
+}
+
+/*
+ * Returns a value indicating whether or not this GPT maps the given
+ * GPA. If the GPA is mapped, *protp will be filled with the protection
+ * bits of the entry. Otherwise, it will be ignored.
+ */
+bool
+vmm_gpt_is_mapped(vmm_gpt_t *gpt, uint64_t gpa, uint_t *protp)
+{
+	uint64_t *entries[MAX_GPT_LEVEL], entry;
+
+	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
+	if (entries[LEVEL1] == NULL)
+		return (false);
+	entry = *entries[LEVEL1];
+	if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry))
+		return (false);
+	*protp = gpt->vgpt_pte_ops->vpeo_pte_prot(entry);
+
+	return (true);
+}
+
+/*
+ * Returns the number of pages that are mapped in by this GPT.
+ */
+size_t
+vmm_gpt_mapped_count(vmm_gpt_t *gpt)
+{
+	return (gpt->vgpt_mapped_page_count);
+}
+
+/*
+ * Resets the accessed bit on the page table entry pointed to by `entry`.
+ * If `on` is true, the bit will be set, otherwise it will be cleared.
+ * The old value of the bit is returned.
+ */
+uint_t
+vmm_gpt_reset_accessed(vmm_gpt_t *gpt, uint64_t *entry, bool on)
+{
+	ASSERT(entry != NULL);
+	return (gpt->vgpt_pte_ops->vpeo_reset_accessed(entry, on));
+}
+
+/*
+ * Resets the dirty bit on the page table entry pointed to by `entry`.
+ * If `on` is true, the bit will be set, otherwise it will be cleared.
+ * The old value of the bit is returned.
+ */
+uint_t
+vmm_gpt_reset_dirty(vmm_gpt_t *gpt, uint64_t *entry, bool on)
+{
+	ASSERT(entry != NULL);
+	return (gpt->vgpt_pte_ops->vpeo_reset_dirty(entry, on));
+}
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c
index b43a6cac1d..3d357f37d2 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c
@@ -17,31 +17,28 @@
 #include <sys/types.h>
 #include <sys/param.h>
+#include <sys/atomic.h>
 #include <sys/kmem.h>
 #include <sys/machsystm.h>
 #include <sys/mman.h>
 
-#include <sys/gipt.h>
+#include <sys/vmm_gpt.h>
 #include <sys/vmm_vm.h>
 
+typedef struct ept_map ept_map_t;
 struct ept_map {
-	gipt_map_t	em_gipt;
-	uint64_t	em_wired_page_count;
+	vmm_gpt_t	*em_gpt;
+	kmutex_t	em_lock;
 };
-typedef struct ept_map ept_map_t;
-
-#define	EPT_LOCK(m)	(&(m)->em_gipt.giptm_lock)
-
-#define	EPT_MAX_LEVELS	4
-
-CTASSERT(EPT_MAX_LEVELS <= GIPT_MAX_LEVELS);
 
-#define	EPT_R		(0x1 << 0)
-#define	EPT_W		(0x1 << 1)
-#define	EPT_X		(0x1 << 2)
+#define	EPT_R		(1 << 0)
+#define	EPT_W		(1 << 1)
+#define	EPT_X		(1 << 2)
 #define	EPT_RWX		(EPT_R | EPT_W | EPT_X)
-#define	EPT_LGPG	(0x1 << 7)
+#define	EPT_LGPG	(1 << 7)
+#define	EPT_ACCESSED	(1 << 8)
+#define	EPT_DIRTY	(1 << 9)
 
 #define	EPT_PA_MASK	(0x000ffffffffff000ull)
 
@@ -49,223 +46,183 @@ CTASSERT(EPT_R == PROT_READ);
 CTASSERT(EPT_W == PROT_WRITE);
 CTASSERT(EPT_X == PROT_EXEC);
 
+static uint_t
+ept_pte_prot(uint64_t pte)
+{
+	return (pte & EPT_RWX);
+}
 
-#define	EPT_PAT(attr)	(((attr) & 0x7) << 3)
-#define	EPT_PADDR(addr)	((addr) & EPT_PA_MASK)
+static inline uint64_t
+ept_attr_to_pat(uint8_t attr)
+{
+	uint64_t bits = attr & 0x7;
+	return (bits << 3);
+}
 
-#define	EPT_IS_ABSENT(pte)	(((pte) & EPT_RWX) == 0)
-#define	EPT_PTE_PFN(pte)	mmu_btop(EPT_PADDR(pte))
-#define	EPT_PTE_PROT(pte)	((pte) & EPT_RWX)
-#define	EPT_MAPS_PAGE(pte, lvl)	\
-	(EPT_PTE_PROT(pte) != 0 && (((pte) & EPT_LGPG) != 0 || (lvl) == 0))
+static uint64_t
+ept_map_table(uint64_t pfn)
+{
+	const uint64_t paddr = pfn_to_pa(pfn) & EPT_PA_MASK;
+	return (paddr | EPT_RWX);
+}
 
-/*
- * Only assign EPT_LGPG for levels higher than 0.  Although this bit is defined
- * as being ignored at level 0, some versions of VMWare fail to honor this and
- * report such a PTE as an EPT mis-configuration.
- */
-#define	EPT_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr)	\
-	(EPT_PADDR(pfn_to_pa(pfn)) |	\
-	(((lvl) != 0) ? EPT_LGPG : 0) |	\
-	EPT_PAT(attr) | ((prot) & EPT_RWX))
-#define	EPT_PTE_ASSIGN_TABLE(pfn)	(EPT_PADDR(pfn_to_pa(pfn)) | EPT_RWX)
+static uint64_t
+ept_map_page(uint64_t pfn, uint_t prot, uint8_t attr)
+{
+	const uint64_t paddr = pfn_to_pa(pfn) & EPT_PA_MASK;
+	const uint64_t pat = ept_attr_to_pat(attr);
+	const uint64_t rprot = prot & EPT_RWX;
+	return (paddr | pat | rprot);
+}
 
+static uint64_t
+ept_pte_pfn(uint64_t pte)
+{
+	return (mmu_btop(pte & PT_PADDR));
+}
 
-static gipt_pte_type_t
-ept_pte_type(uint64_t pte, uint_t level)
+static bool
+ept_pte_is_present(uint64_t pte)
 {
-	if (EPT_IS_ABSENT(pte)) {
-		return (PTET_EMPTY);
-	} else if (EPT_MAPS_PAGE(pte, level)) {
-		return (PTET_PAGE);
-	} else {
-		return (PTET_LINK);
-	}
+	return ((pte & EPT_RWX) != 0);
 }
 
-static uint64_t
-ept_pte_map(uint64_t pfn)
+static uint_t
+ept_reset_bits(volatile uint64_t *entry, uint64_t mask, uint64_t bits)
 {
-	return (EPT_PTE_ASSIGN_TABLE(pfn));
+	uint64_t pte, newpte, oldpte = 0;
+
+	/*
+	 * We use volatile and atomic ops here because we may be
+	 * racing against hardware modifying these bits.
+	 */
+	VERIFY3P(entry, !=, NULL);
+	oldpte = *entry;
+	do {
+		pte = oldpte;
+		newpte = (pte & ~mask) | bits;
+		oldpte = atomic_cas_64(entry, pte, newpte);
+	} while (oldpte != pte);
+
+	return (oldpte & mask);
+}
+
+static uint_t
+ept_reset_dirty(uint64_t *entry, bool on)
+{
+	return (ept_reset_bits(entry, EPT_DIRTY,
+	    on ? (EPT_DIRTY | EPT_ACCESSED) : 0));
+}
+
+static uint_t
+ept_reset_accessed(uint64_t *entry, bool on)
+{
+	return (ept_reset_bits(entry, EPT_DIRTY | EPT_ACCESSED,
+	    on ? EPT_ACCESSED : 0));
+}
+
+static vmm_pte_ops_t ept_pte_ops = {
+	.vpeo_map_table = ept_map_table,
+	.vpeo_map_page = ept_map_page,
+	.vpeo_pte_pfn = ept_pte_pfn,
+	.vpeo_pte_is_present = ept_pte_is_present,
+	.vpeo_pte_prot = ept_pte_prot,
+	.vpeo_reset_dirty = ept_reset_dirty,
+	.vpeo_reset_accessed = ept_reset_accessed,
+};
+
+vmm_gpt_t *
+ept_create(void)
+{
+	return (vmm_gpt_alloc(&ept_pte_ops));
 }
 
 static void *
-ept_create(uintptr_t *pml4_kaddr)
+ept_ops_create(uintptr_t *root_kaddr)
 {
-	ept_map_t *emap;
-	gipt_map_t *map;
-	gipt_t *root;
-	struct gipt_cbs cbs = {
-		.giptc_pte_type = ept_pte_type,
-		.giptc_pte_map = ept_pte_map,
-	};
-
-	emap = kmem_zalloc(sizeof (*emap), KM_SLEEP);
-	map = &emap->em_gipt;
-	root = gipt_alloc();
-	root->gipt_level = EPT_MAX_LEVELS - 1;
-	gipt_map_init(map, EPT_MAX_LEVELS, GIPT_HASH_SIZE_DEFAULT, &cbs, root);
-
-	*pml4_kaddr = (uintptr_t)root->gipt_kva;
-	return (emap);
+	ept_map_t *map;
+
+	map = kmem_zalloc(sizeof (*map), KM_SLEEP);
+	mutex_init(&map->em_lock, NULL, MUTEX_DEFAULT, NULL);
+	map->em_gpt = ept_create();
+	*root_kaddr = (uintptr_t)vmm_gpt_root_kaddr(map->em_gpt);
+
+	return (map);
 }
 
 static void
-ept_destroy(void *arg)
+ept_ops_destroy(void *arg)
 {
-	ept_map_t *emap = arg;
-
-	if (emap != NULL) {
-		gipt_map_t *map = &emap->em_gipt;
+	ept_map_t *map = arg;
 
-		gipt_map_fini(map);
-		kmem_free(emap, sizeof (*emap));
+	if (map != NULL) {
+		vmm_gpt_free(map->em_gpt);
+		mutex_destroy(&map->em_lock);
+		kmem_free(map, sizeof (*map));
 	}
 }
 
 static uint64_t
-ept_wired_count(void *arg)
+ept_ops_wired_count(void *arg)
 {
-	ept_map_t *emap = arg;
+	ept_map_t *map = arg;
 	uint64_t res;
 
-	mutex_enter(EPT_LOCK(emap));
-	res = emap->em_wired_page_count;
-	mutex_exit(EPT_LOCK(emap));
+	mutex_enter(&map->em_lock);
+	res = vmm_gpt_mapped_count(map->em_gpt);
+	mutex_exit(&map->em_lock);
 
 	return (res);
 }
 
 static int
-ept_is_wired(void *arg, uint64_t va, uint_t *protp)
+ept_ops_is_wired(void *arg, uint64_t gpa, uint_t *protp)
 {
-	ept_map_t *emap = arg;
-	gipt_t *pt;
-	int rv = -1;
-
-	mutex_enter(EPT_LOCK(emap));
-	pt = gipt_map_lookup_deepest(&emap->em_gipt, va);
-	if (pt != NULL) {
-		const uint64_t pte = GIPT_VA2PTE(pt, va);
-
-		if (EPT_MAPS_PAGE(pte, pt->gipt_level)) {
-			*protp = EPT_PTE_PROT(pte);
-			rv = 0;
-		}
-	}
-	mutex_exit(EPT_LOCK(emap));
+	ept_map_t *map = arg;
+	bool mapped;
+
+	mutex_enter(&map->em_lock);
+	mapped = vmm_gpt_is_mapped(map->em_gpt, gpa, protp);
+	mutex_exit(&map->em_lock);
 
-	return (rv);
+	return (mapped ? 0 : -1);
 }
 
 static int
-ept_map(void *arg, uint64_t va, pfn_t pfn, uint_t lvl, uint_t prot,
+ept_ops_map(void *arg, uint64_t gpa, pfn_t pfn, uint_t _lvl, uint_t prot,
     uint8_t attr)
 {
-	ept_map_t *emap = arg;
-	gipt_map_t *map = &emap->em_gipt;
-	gipt_t *pt;
-	uint64_t *ptep, pte;
+	ept_map_t *map = arg;
 
 	ASSERT((prot & EPT_RWX) != 0 && (prot & ~EPT_RWX) == 0);
-	ASSERT3U(lvl, <, EPT_MAX_LEVELS);
-
-	mutex_enter(EPT_LOCK(emap));
-	pt = gipt_map_lookup(map, va, lvl);
-	if (pt == NULL) {
-		/*
-		 * A table at the appropriate VA/level that would house this
-		 * mapping does not currently exist.  Try to walk down to that
-		 * point, creating any necessary parent(s).
-		 */
-		pt = gipt_map_create_parents(map, va, lvl);
-
-		/*
-		 * There was a large page mapping in the way of creating the
-		 * necessary parent table(s).
-		 */
-		if (pt == NULL) {
-			panic("unexpected large page @ %08lx", va);
-		}
-	}
-	ptep = GIPT_VA2PTEP(pt, va);
-
-	pte = *ptep;
-	if (!EPT_IS_ABSENT(pte)) {
-		if (!EPT_MAPS_PAGE(pte, lvl)) {
-			panic("unexpected PT link @ %08lx in %p", va, pt);
-		} else {
-			panic("unexpected page mapped @ %08lx in %p", va, pt);
-		}
-	}
-	pte = EPT_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr);
-	*ptep = pte;
-	pt->gipt_valid_cnt++;
-	emap->em_wired_page_count += gipt_level_count[lvl];
+	mutex_enter(&map->em_lock);
+	vmm_gpt_populate_entry(map->em_gpt, gpa);
+	(void) vmm_gpt_map(map->em_gpt, gpa, pfn, prot, attr);
+	mutex_exit(&map->em_lock);
 
-	mutex_exit(EPT_LOCK(emap));
 	return (0);
 }
 
 static uint64_t
-ept_unmap(void *arg, uint64_t va, uint64_t end_va)
+ept_ops_unmap(void *arg, uint64_t start, uint64_t end)
 {
-	ept_map_t *emap = arg;
-	gipt_map_t *map = &emap->em_gipt;
-	gipt_t *pt;
-	uint64_t cur_va = va;
-	uint64_t unmapped = 0;
-
-	mutex_enter(EPT_LOCK(emap));
-
-	pt = gipt_map_lookup_deepest(map, cur_va);
-	if (pt == NULL) {
-		mutex_exit(EPT_LOCK(emap));
-		return (0);
-	}
-	if (!EPT_MAPS_PAGE(GIPT_VA2PTE(pt, cur_va), pt->gipt_level)) {
-		cur_va = gipt_map_next_page(map, cur_va, end_va, &pt);
-		if (cur_va == 0) {
-			mutex_exit(EPT_LOCK(emap));
-			return (0);
-		}
-	}
-
-	while (cur_va < end_va) {
-		uint64_t *ptep = GIPT_VA2PTEP(pt, cur_va);
-		const uint_t lvl = pt->gipt_level;
-
-		ASSERT(EPT_MAPS_PAGE(*ptep, lvl));
-		*ptep = 0;
-		pt->gipt_valid_cnt--;
-		unmapped += gipt_level_count[pt->gipt_level];
-
-		gipt_t *next_pt = pt;
-		uint64_t next_va;
-		next_va = gipt_map_next_page(map, cur_va, end_va, &next_pt);
-
-		if (pt->gipt_valid_cnt == 0) {
-			gipt_map_clean_parents(map, pt);
-		}
-		if (next_va == 0) {
-			break;
-		}
-		pt = next_pt;
-		cur_va = next_va;
-	}
-	emap->em_wired_page_count -= unmapped;
+	ept_map_t *map = arg;
+	size_t unmapped = 0;
 
-	mutex_exit(EPT_LOCK(emap));
+	mutex_enter(&map->em_lock);
+	unmapped = vmm_gpt_unmap_region(map->em_gpt, start, end);
+	vmm_gpt_vacate_region(map->em_gpt, start, end);
+	mutex_exit(&map->em_lock);
 
-	return (unmapped);
+	return ((uint64_t)unmapped);
 }
 
 struct vmm_pt_ops ept_ops = {
-	.vpo_init	= ept_create,
-	.vpo_free	= ept_destroy,
-	.vpo_wired_cnt	= ept_wired_count,
-	.vpo_is_wired	= ept_is_wired,
-	.vpo_map	= ept_map,
-	.vpo_unmap	= ept_unmap,
+	.vpo_init	= ept_ops_create,
+	.vpo_free	= ept_ops_destroy,
+	.vpo_wired_cnt	= ept_ops_wired_count,
+	.vpo_is_wired	= ept_ops_is_wired,
+	.vpo_map	= ept_ops_map,
+	.vpo_unmap	= ept_ops_unmap,
 };
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c
index f82ea64994..c66a4e7962 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c
@@ -17,284 +17,237 @@
 #include <sys/types.h>
 #include <sys/param.h>
+#include <sys/atomic.h>
 #include <sys/kmem.h>
 #include <sys/machsystm.h>
 #include <sys/mach_mmu.h>
 #include <sys/mman.h>
 #include <sys/x86_archext.h>
+#include <vm/hat_pte.h>
 
-#include <sys/gipt.h>
+#include <sys/vmm_gpt.h>
 #include <sys/vmm_vm.h>
 
+typedef struct rvi_map rvi_map_t;
 struct rvi_map {
-	gipt_map_t	rm_gipt;
-	uint64_t	rm_wired_page_count;
+	vmm_gpt_t	*rm_gpt;
+	kmutex_t	rm_lock;
 };
-typedef struct rvi_map rvi_map_t;
-#define	RVI_LOCK(m)	(&(m)->rm_gipt.giptm_lock)
-
-#define	RVI_MAX_LEVELS	4
-
-CTASSERT(RVI_MAX_LEVELS <= GIPT_MAX_LEVELS);
-
-#define	RVI_PRESENT	PT_VALID
-#define	RVI_WRITABLE	PT_WRITABLE
-#define	RVI_ACCESSED	PT_REF
-#define	RVI_DIRTY	PT_MOD
-#define	RVI_LGPG	PT_PAGESIZE
-#define	RVI_NX		PT_NX
-#define	RVI_USER	PT_USER
-#define	RVI_PWT		PT_WRITETHRU
-#define	RVI_PCD		PT_NOCACHE
-
-#define	RVI_PA_MASK	PT_PADDR
-
-#define	RVI_PAT(attr)	rvi_attr_to_pat(attr)
-#define	RVI_PADDR(addr)	((addr) & RVI_PA_MASK)
-#define	RVI_PROT(prot)	\
-	((((prot) & PROT_WRITE) != 0 ? RVI_WRITABLE : 0) |	\
-	(((prot) & PROT_EXEC) == 0 ? RVI_NX : 0))
-
-#define	RVI_IS_ABSENT(pte)	(((pte) & RVI_PRESENT) == 0)
-#define	RVI_PTE_PFN(pte)	mmu_btop(RVI_PADDR(pte))
-#define	RVI_MAPS_PAGE(pte, lvl)	\
-	(!RVI_IS_ABSENT(pte) && (((pte) & RVI_LGPG) != 0 || (lvl) == 0))
-#define	RVI_PTE_PROT(pte)	\
-	(RVI_IS_ABSENT(pte) ? 0 : (	\
-	PROT_READ |	\
-	(((pte) & RVI_NX) == 0 ? PROT_EXEC : 0) |	\
-	(((pte) & RVI_WRITABLE) != 0 ? PROT_WRITE : 0)))
-
-#define	RVI_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr)	\
-	(RVI_PADDR(pfn_to_pa(pfn)) |	\
-	(((lvl) != 0) ? RVI_LGPG : 0) |	\
-	RVI_USER | RVI_ACCESSED | RVI_PRESENT |	\
-	RVI_PAT(attr) |	\
-	RVI_PROT(prot))
-
-#define	RVI_PTE_ASSIGN_TABLE(pfn)	\
-	(RVI_PADDR(pfn_to_pa(pfn)) |	\
-	RVI_USER | RVI_ACCESSED | RVI_PRESENT |	\
-	RVI_PAT(MTRR_TYPE_WB) |	\
-	RVI_PROT(PROT_READ | PROT_WRITE | PROT_EXEC))
 
+static inline uint64_t
+rvi_prot(uint_t prot)
+{
+	uint64_t bits;
+
+	bits = 0;
+	if ((prot & PROT_WRITE) != 0)
+		bits |= PT_WRITABLE;
+	if ((prot & PROT_EXEC) == 0)
+		bits |= PT_NX;
+
+	return (bits);
+}
+
+static uint_t
+rvi_pte_prot(uint64_t pte)
+{
+	uint_t prot;
+
+	if ((pte & PT_VALID) == 0)
+		return (0);
+
+	prot = PROT_READ;
+	if ((pte & PT_NX) == 0)
+		prot |= PROT_EXEC;
+	if ((pte & PT_WRITABLE) != 0)
+		prot |= PROT_WRITE;
+
+	return (prot);
+}
 
 /* Make sure that PAT indexes line up as expected */
 CTASSERT((PAT_DEFAULT_ATTRIBUTE & 0xf) == MTRR_TYPE_WB);
 CTASSERT(((PAT_DEFAULT_ATTRIBUTE >> 24) & 0xf) == MTRR_TYPE_UC);
 
 static inline uint64_t
-rvi_attr_to_pat(const uint8_t attr)
+rvi_attr_to_pat(uint8_t attr)
 {
-	if (attr == MTRR_TYPE_UC) {
-		/* !PAT + PCD + PWT -> PAT3 -> MTRR_TYPE_UC */
-		return (RVI_PCD|RVI_PWT);
-	} else if (attr == MTRR_TYPE_WB) {
-		/* !PAT + !PCD + !PWT -> PAT0 -> MTRR_TYPE_WB */
+
+	if (attr == MTRR_TYPE_UC)
+		return (PT_NOCACHE | PT_WRITETHRU);
+	if (attr == MTRR_TYPE_WB)
 		return (0);
-	}
 
 	panic("unexpected memattr %x", attr);
-	return (0);
 }
 
-static gipt_pte_type_t
-rvi_pte_type(uint64_t pte, uint_t level)
+static uint64_t
+rvi_map_table(uint64_t pfn)
 {
-	if (RVI_IS_ABSENT(pte)) {
-		return (PTET_EMPTY);
-	} else if (RVI_MAPS_PAGE(pte, level)) {
-		return (PTET_PAGE);
-	} else {
-		return (PTET_LINK);
-	}
+	const uint64_t paddr = pfn_to_pa(pfn);
+	const uint64_t flags = PT_USER | PT_REF | PT_VALID;
+	const uint64_t pat = rvi_attr_to_pat(MTRR_TYPE_WB);
+	const uint64_t rprot = PT_WRITABLE;
+	return (paddr | flags | pat | rprot);
 }
 
 static uint64_t
-rvi_pte_map(uint64_t pfn)
+rvi_map_page(uint64_t pfn, uint_t prot, uint8_t attr)
+{
+	const uint64_t paddr = pfn_to_pa(pfn);
+	const uint64_t flags = PT_USER | PT_REF | PT_VALID;
+	const uint64_t pat = rvi_attr_to_pat(attr);
+	const uint64_t rprot = rvi_prot(prot);
+	return (paddr | flags | pat | rprot);
+}
+
+static pfn_t
+rvi_pte_pfn(uint64_t pte)
+{
+	return (mmu_btop(pte & PT_PADDR));
+}
+
+static bool
+rvi_pte_is_present(uint64_t pte)
+{
+	return ((pte & PT_VALID) == PT_VALID);
+}
+
+static uint_t
+rvi_reset_bits(volatile uint64_t *entry, uint64_t mask, uint64_t bits)
+{
+	uint64_t pte, newpte, oldpte = 0;
+
+	/*
+	 * We use volatile and atomic ops here because we may be
+	 * racing against hardware modifying these bits.
+	 */
+	VERIFY3P(entry, !=, NULL);
+	oldpte = *entry;
+	do {
+		pte = oldpte;
+		newpte = (pte & ~mask) | bits;
+		oldpte = atomic_cas_64(entry, pte, newpte);
+	} while (oldpte != pte);
+
+	return (oldpte & mask);
+}
+
+static uint_t
+rvi_reset_dirty(uint64_t *entry, bool on)
+{
+	return (rvi_reset_bits(entry, PT_MOD, on ? (PT_MOD | PT_REF) : 0));
+}
+
+static uint_t
+rvi_reset_accessed(uint64_t *entry, bool on)
 {
-	return (RVI_PTE_ASSIGN_TABLE(pfn));
+	return (rvi_reset_bits(entry, (PT_MOD | PT_REF), on ? PT_REF : 0));
+}
+
+static vmm_pte_ops_t rvi_pte_ops = {
+	.vpeo_map_table = rvi_map_table,
+	.vpeo_map_page = rvi_map_page,
+	.vpeo_pte_pfn = rvi_pte_pfn,
+	.vpeo_pte_is_present = rvi_pte_is_present,
+	.vpeo_pte_prot = rvi_pte_prot,
+	.vpeo_reset_dirty = rvi_reset_dirty,
+	.vpeo_reset_accessed = rvi_reset_accessed,
+};
+
+vmm_gpt_t *
+rvi_create(void)
+{
+	return (vmm_gpt_alloc(&rvi_pte_ops));
 }
 
 static void *
-rvi_create(uintptr_t *pml4_kaddr)
+rvi_ops_create(uintptr_t *root_kaddr)
 {
-	rvi_map_t *rmap;
-	gipt_map_t *map;
-	gipt_t *root;
-	struct gipt_cbs cbs = {
-		.giptc_pte_type = rvi_pte_type,
-		.giptc_pte_map = rvi_pte_map,
-	};
-
-	rmap = kmem_zalloc(sizeof (*rmap), KM_SLEEP);
-	map = &rmap->rm_gipt;
-	root = gipt_alloc();
-	root->gipt_level = RVI_MAX_LEVELS - 1;
-	gipt_map_init(map, RVI_MAX_LEVELS, GIPT_HASH_SIZE_DEFAULT, &cbs, root);
-
-	*pml4_kaddr = (uintptr_t)root->gipt_kva;
-	return (rmap);
+	rvi_map_t *map;
+
+	map = kmem_zalloc(sizeof (*map), KM_SLEEP);
+	mutex_init(&map->rm_lock, NULL, MUTEX_DEFAULT, NULL);
+	map->rm_gpt = rvi_create();
+	*root_kaddr = (uintptr_t)vmm_gpt_root_kaddr(map->rm_gpt);
+
+	return (map);
 }
 
 static void
-rvi_destroy(void *arg)
+rvi_ops_destroy(void *arg)
 {
-	rvi_map_t *rmap = arg;
-
-	if (rmap != NULL) {
-		gipt_map_t *map = &rmap->rm_gipt;
+	rvi_map_t *map = arg;
 
-		gipt_map_fini(map);
-		kmem_free(rmap, sizeof (*rmap));
+	if (map != NULL) {
+		vmm_gpt_free(map->rm_gpt);
+		mutex_destroy(&map->rm_lock);
+		kmem_free(map, sizeof (*map));
 	}
 }
 
 static uint64_t
-rvi_wired_count(void *arg)
+rvi_ops_wired_count(void *arg)
 {
-	rvi_map_t *rmap = arg;
+	rvi_map_t *map = arg;
 	uint64_t res;
 
-	mutex_enter(RVI_LOCK(rmap));
-	res = rmap->rm_wired_page_count;
-	mutex_exit(RVI_LOCK(rmap));
+	mutex_enter(&map->rm_lock);
+	res = vmm_gpt_mapped_count(map->rm_gpt);
+	mutex_exit(&map->rm_lock);
 
 	return (res);
 }
 
 static int
-rvi_is_wired(void *arg, uint64_t va, uint_t *protp)
+rvi_ops_is_wired(void *arg, uint64_t gpa, uint_t *protp)
 {
-	rvi_map_t *rmap = arg;
-	gipt_t *pt;
-	int rv = -1;
-
-	mutex_enter(RVI_LOCK(rmap));
-	pt = gipt_map_lookup_deepest(&rmap->rm_gipt, va);
-	if (pt != NULL) {
-		const uint64_t pte = GIPT_VA2PTE(pt, va);
-
-		if (RVI_MAPS_PAGE(pte, pt->gipt_level)) {
-			*protp = RVI_PTE_PROT(pte);
-			rv = 0;
-		}
-	}
-	mutex_exit(RVI_LOCK(rmap));
+	rvi_map_t *map = arg;
+	bool mapped;
+
+	mutex_enter(&map->rm_lock);
+	mapped = vmm_gpt_is_mapped(map->rm_gpt, gpa, protp);
+	mutex_exit(&map->rm_lock);
 
-	return (rv);
+	return (mapped ? 0 : -1);
 }
 
 static int
-rvi_map(void *arg, uint64_t va, pfn_t pfn, uint_t lvl, uint_t prot,
+rvi_ops_map(void *arg, uint64_t gpa, pfn_t pfn, uint_t _lvl, uint_t prot,
     uint8_t attr)
 {
-	rvi_map_t *rmap = arg;
-	gipt_map_t *map = &rmap->rm_gipt;
-	gipt_t *pt;
-	uint64_t *ptep, pte;
+	rvi_map_t *map = arg;
 
 	ASSERT((prot & PROT_READ) != 0);
 	ASSERT3U((prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)), ==, 0);
-	ASSERT3U(lvl, <, RVI_MAX_LEVELS);
-
-	mutex_enter(RVI_LOCK(rmap));
-	pt = gipt_map_lookup(map, va, lvl);
-	if (pt == NULL) {
-		/*
-		 * A table at the appropriate VA/level that would house this
-		 * mapping does not currently exist.  Try to walk down to that
-		 * point, creating any necessary parent(s).
-		 */
-		pt = gipt_map_create_parents(map, va, lvl);
-
-		/*
-		 * There was a large page mapping in the way of creating the
-		 * necessary parent table(s).
-		 */
-		if (pt == NULL) {
-			panic("unexpected large page @ %08lx", va);
-		}
-	}
-	ptep = GIPT_VA2PTEP(pt, va);
-
-	pte = *ptep;
-	if (!RVI_IS_ABSENT(pte)) {
-		if (!RVI_MAPS_PAGE(pte, lvl)) {
-			panic("unexpected PT link @ %08lx in %p", va, pt);
-		} else {
-			panic("unexpected page mapped @ %08lx in %p", va, pt);
-		}
-	}
-	pte = RVI_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr);
-	*ptep = pte;
-	pt->gipt_valid_cnt++;
-	rmap->rm_wired_page_count += gipt_level_count[lvl];
+	mutex_enter(&map->rm_lock);
+	vmm_gpt_populate_entry(map->rm_gpt, gpa);
+	(void) vmm_gpt_map(map->rm_gpt, gpa, pfn, prot, attr);
+	mutex_exit(&map->rm_lock);
 
-	mutex_exit(RVI_LOCK(rmap));
 	return (0);
 }
 
 static uint64_t
-rvi_unmap(void *arg, uint64_t va, uint64_t end_va)
+rvi_ops_unmap(void *arg, uint64_t start, uint64_t end)
 {
-	rvi_map_t *rmap = arg;
-	gipt_map_t *map = &rmap->rm_gipt;
-	gipt_t *pt;
-	uint64_t cur_va = va;
-	uint64_t unmapped = 0;
-
-	mutex_enter(RVI_LOCK(rmap));
-
-	pt = gipt_map_lookup_deepest(map, cur_va);
-	if (pt == NULL) {
-		mutex_exit(RVI_LOCK(rmap));
-		return (0);
-	}
-	if (!RVI_MAPS_PAGE(GIPT_VA2PTE(pt, cur_va), pt->gipt_level)) {
-		cur_va = gipt_map_next_page(map, cur_va, end_va, &pt);
-		if (cur_va == 0) {
-			mutex_exit(RVI_LOCK(rmap));
-			return (0);
-		}
-	}
-
-	while (cur_va < end_va) {
-		uint64_t *ptep = GIPT_VA2PTEP(pt, cur_va);
-		const uint_t lvl = pt->gipt_level;
-
-		ASSERT(RVI_MAPS_PAGE(*ptep, lvl));
-		*ptep = 0;
-		pt->gipt_valid_cnt--;
-		unmapped += gipt_level_count[pt->gipt_level];
-
-		gipt_t *next_pt = pt;
-		uint64_t next_va;
-		next_va = gipt_map_next_page(map, cur_va, end_va, &next_pt);
-
-		if (pt->gipt_valid_cnt == 0) {
-			gipt_map_clean_parents(map, pt);
-		}
-		if (next_va == 0) {
-			break;
-		}
-		pt = next_pt;
-		cur_va = next_va;
-	}
-	rmap->rm_wired_page_count -= unmapped;
+	rvi_map_t *map = arg;
+	size_t unmapped = 0;
 
-	mutex_exit(RVI_LOCK(rmap));
+	mutex_enter(&map->rm_lock);
+	unmapped = vmm_gpt_unmap_region(map->rm_gpt, start, end);
+	vmm_gpt_vacate_region(map->rm_gpt, start, end);
+	mutex_exit(&map->rm_lock);
 
-	return (unmapped);
+	return ((uint64_t)unmapped);
 }
 
 struct vmm_pt_ops rvi_ops = {
-	.vpo_init	= rvi_create,
-	.vpo_free	= rvi_destroy,
-	.vpo_wired_cnt	= rvi_wired_count,
-	.vpo_is_wired	= rvi_is_wired,
-	.vpo_map	= rvi_map,
-	.vpo_unmap	= rvi_unmap,
+	.vpo_init	= rvi_ops_create,
+	.vpo_free	= rvi_ops_destroy,
+	.vpo_wired_cnt	= rvi_ops_wired_count,
+	.vpo_is_wired	= rvi_ops_is_wired,
+	.vpo_map	= rvi_ops_map,
+	.vpo_unmap	= rvi_ops_unmap,
 };
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c
index 3a29a3e7b3..720af54200 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c
@@ -35,6 +35,7 @@
 #include <vm/seg_vmm.h>
 
 #include <machine/vm.h>
+#include <sys/vmm_gpt.h>
 #include <sys/vmm_vm.h>
 
 #define	PMAP_TO_VMMAP(pm)	((vm_map_t) \
diff --git a/usr/src/uts/i86pc/os/gipt.c b/usr/src/uts/i86pc/os/gipt.c
deleted file mode 100644
index 7bff5c3897..0000000000
--- a/usr/src/uts/i86pc/os/gipt.c
+++ /dev/null
@@ -1,568 +0,0 @@
-/*
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- */
-
-/*
- * Copyright 2019 Joyent, Inc.
- */
-
-#include <sys/gipt.h>
-#include <sys/malloc.h>
-#include <sys/kmem.h>
-#include <sys/sysmacros.h>
-#include <sys/sunddi.h>
-#include <sys/panic.h>
-#include <vm/hat.h>
-#include <vm/as.h>
-
-/*
- * Generic Indexed Page Table
- *
- * There are several applications, such as hardware virtualization or IOMMU
- * control, which require construction of a page table tree to represent a
- * virtual address space. Many features of the existing htable system would be
- * convenient for this, but its tight coupling to the VM system make it
- * undesirable for independent consumers. The GIPT interface exists to provide
- * page table allocation and indexing on top of which a table hierarchy
- * (EPT, VT-d, etc) can be built by upstack logic.
- *
- * Types:
- *
- * gipt_t - Represents a single page table with a physical backing page and
- * associated metadata.
- * gipt_map_t - The workhorse of this facility, it contains an hash table to
- * index all of the gipt_t entries which make up the page table tree.
- * struct gipt_cbs - Callbacks used by the gipt_map_t:
- * gipt_pte_type_cb_t - Given a PTE, emit the type (empty/page/table)
- * gipt_pte_map_cb_t - Given a PFN, emit a (child) table mapping
- */
-
-/*
- * For now, the level shifts are hard-coded to match with standard 4-level
- * 64-bit paging structures.
- */
-
-#define	GIPT_HASH(map, va, lvl)	\
-	((((va) >> 12) + ((va) >> 28) + (lvl)) & ((map)->giptm_table_cnt - 1))
-
-const uint_t gipt_level_shift[GIPT_MAX_LEVELS+1] = {
-	12,	/* 4K */
-	21,	/* 2M */
-	30,	/* 1G */
-	39,	/* 512G */
-	48	/* MAX */
-};
-const uint64_t gipt_level_mask[GIPT_MAX_LEVELS+1] = {
-	0xfffffffffffff000ull,	/* 4K */
-	0xffffffffffe00000ull,	/* 2M */
-	0xffffffffc0000000ull,	/* 1G */
-	0xffffff8000000000ull,	/* 512G */
-	0xffff000000000000ull	/* MAX */
-};
-const uint64_t gipt_level_size[GIPT_MAX_LEVELS+1] = {
-	0x0000000000001000ull,	/* 4K */
-	0x0000000000200000ull,	/* 2M */
-	0x0000000040000000ull,	/* 1G */
-	0x0000008000000000ull,	/* 512G */
-	0x0001000000000000ull	/* MAX */
-};
-const uint64_t gipt_level_count[GIPT_MAX_LEVELS+1] = {
-	0x0000000000000001ull,	/* 4K */
-	0x0000000000000200ull,	/* 2M */
-	0x0000000000040000ull,	/* 1G */
-	0x0000000008000000ull,	/* 512G */
-	0x0000001000000000ull	/* MAX */
-};
-
-/*
- * Allocate a gipt_t structure with corresponding page of memory to hold the
- * PTEs which it contains.
- */
-gipt_t *
-gipt_alloc(void)
-{
-	gipt_t *pt;
-	void *page;
-
-	pt = kmem_zalloc(sizeof (*pt), KM_SLEEP);
-	page = kmem_zalloc(PAGESIZE, KM_SLEEP);
-	pt->gipt_kva = page;
-	pt->gipt_pfn = hat_getpfnum(kas.a_hat, page);
-
-	return (pt);
-}
-
-/*
- * Free a gipt_t structure along with its page of PTE storage.
- */
-void
-gipt_free(gipt_t *pt)
-{
-	void *page = pt->gipt_kva;
-
-	ASSERT(pt->gipt_pfn != PFN_INVALID);
-	ASSERT(pt->gipt_kva != NULL);
-
-	pt->gipt_pfn = PFN_INVALID;
-	pt->gipt_kva = NULL;
-
-	kmem_free(page, PAGESIZE);
-	kmem_free(pt, sizeof (*pt));
-}
-
-/*
- * Initialize a gipt_map_t with a max level (must be >= 1) and allocating its
- * hash table based on a provided size (must be a power of 2).
- */
-void
-gipt_map_init(gipt_map_t *map, uint_t levels, uint_t hash_table_size,
-    const struct gipt_cbs *cbs, gipt_t *root)
-{
-	VERIFY(map->giptm_root == NULL);
-	VERIFY(map->giptm_hash == NULL);
-	VERIFY3U(levels, >, 0);
-	VERIFY3U(levels, <=, GIPT_MAX_LEVELS);
-	VERIFY(ISP2(hash_table_size));
-	VERIFY(root != NULL);
-
-	mutex_init(&map->giptm_lock, NULL, MUTEX_DEFAULT, NULL);
-	map->giptm_table_cnt = hash_table_size;
-	bcopy(cbs, &map->giptm_cbs, sizeof (*cbs));
-	map->giptm_hash = kmem_alloc(sizeof (list_t) * map->giptm_table_cnt,
-	    KM_SLEEP);
-	for (uint_t i = 0; i < hash_table_size; i++) {
-		list_create(&map->giptm_hash[i], sizeof (gipt_t),
-		    offsetof(gipt_t, gipt_node));
-	}
-	map->giptm_levels = levels;
-
-	/*
-	 * Insert the table root into the hash. It will be held in existence
-	 * with an extra "valid" reference. This will prevent its clean-up
-	 * during gipt_map_clean_parents() calls, even if it has no children.
-	 */
-	mutex_enter(&map->giptm_lock);
-	gipt_map_insert(map, root);
-	map->giptm_root = root;
-	root->gipt_valid_cnt++;
-	mutex_exit(&map->giptm_lock);
-}
-
-/*
- * Clean up a gipt_map_t by removing any lingering gipt_t entries referenced by
- * it, and freeing its hash table.
- */
-void
-gipt_map_fini(gipt_map_t *map)
-{
-	const uint_t cnt = map->giptm_table_cnt;
-	const size_t sz = sizeof (list_t) * cnt;
-
-	mutex_enter(&map->giptm_lock);
-	/* Clean up any lingering tables */
-	for (uint_t i = 0; i < cnt; i++) {
-		list_t *list = &map->giptm_hash[i];
-		gipt_t *pt;
-
-		while ((pt = list_remove_head(list)) != NULL) {
-			gipt_free(pt);
-		}
-		ASSERT(list_is_empty(list));
-	}
-
-	kmem_free(map->giptm_hash, sz);
-	map->giptm_hash = NULL;
-	map->giptm_root = NULL;
-	map->giptm_levels = 0;
-	mutex_exit(&map->giptm_lock);
-	mutex_destroy(&map->giptm_lock);
-}
-
-/*
- * Look in the map for a gipt_t containing a given VA which is located at a
- * specified level.
- */
-gipt_t *
-gipt_map_lookup(gipt_map_t *map, uint64_t va, uint_t lvl)
-{
-	gipt_t *pt;
-
-	ASSERT(MUTEX_HELD(&map->giptm_lock));
-	ASSERT3U(lvl, <=, GIPT_MAX_LEVELS);
-
-	/*
-	 * Lookup gipt_t at the VA aligned to the next level up. For example,
-	 * level 0 corresponds to a page table containing 512 PTEs which cover
-	 * 4k each, spanning a total 2MB. As such, the base VA of that table
-	 * must be aligned to the same 2MB.
-	 */
-	const uint64_t masked_va = va & gipt_level_mask[lvl + 1];
-	const uint_t hash = GIPT_HASH(map, masked_va, lvl);
-
-	/* Only the root is expected to be at the top level. */
-	if (lvl == (map->giptm_levels - 1) && map->giptm_root != NULL) {
-		pt = map->giptm_root;
-
-		ASSERT3U(pt->gipt_level, ==, lvl);
-
-		/*
-		 * It may be so that the VA in question is not covered by the
-		 * range of the table root.
-		 */
-		if (pt->gipt_vaddr != masked_va) {
-			return (NULL);
-		}
-
-		return (pt);
-	}
-
-	list_t *list = &map->giptm_hash[hash];
-	for (pt = list_head(list); pt != NULL; pt = list_next(list, pt)) {
-		if (pt->gipt_vaddr == masked_va && pt->gipt_level == lvl)
-			break;
-	}
-	return (pt);
-}
-
-/*
- * Look in the map for the deepest (lowest level) gipt_t which contains a given
- * VA. This could still fail if the VA is outside the range of the table root.
- */
-gipt_t *
-gipt_map_lookup_deepest(gipt_map_t *map, uint64_t va)
-{
-	gipt_t *pt = NULL;
-	uint_t lvl;
-
-	ASSERT(MUTEX_HELD(&map->giptm_lock));
-
-	for (lvl = 0; lvl < map->giptm_levels; lvl++) {
-		pt = gipt_map_lookup(map, va, lvl);
-		if (pt != NULL) {
-			break;
-		}
-	}
-	return (pt);
-}
-
-/*
- * Given a VA inside a gipt_t, calculate (based on the level of that PT) the VA
- * corresponding to the next entry in the table. It returns 0 if that VA would
- * fall beyond the bounds of the table.
- */
-static __inline__ uint64_t
-gipt_next_va(gipt_t *pt, uint64_t va)
-{
-	const uint_t lvl = pt->gipt_level;
-	const uint64_t masked = va & gipt_level_mask[lvl];
-	const uint64_t max = pt->gipt_vaddr + gipt_level_size[lvl+1];
-	const uint64_t next = masked + gipt_level_size[lvl];
-
-	ASSERT3U(masked, >=, pt->gipt_vaddr);
-	ASSERT3U(masked, <, max);
-
-	/*
-	 * If the "next" VA would be outside this table, including cases where
-	 * it overflowed, indicate an error result.
-	 */
-	if (next >= max || next <= masked) {
-		return (0);
-	}
-	return (next);
-}
-
-/*
- * For a given VA, find the next VA which corresponds to a valid page mapping.
- * The gipt_t containing that VA will be indicated via 'ptp'. (The gipt_t of
- * the starting VA can be passed in via 'ptp' for a minor optimization). If
- * there is no valid mapping higher than 'va' but contained within 'max_va',
- * then this will indicate failure with 0 returned.
- */
-uint64_t
-gipt_map_next_page(gipt_map_t *map, uint64_t va, uint64_t max_va, gipt_t **ptp)
-{
-	gipt_t *pt = *ptp;
-	uint64_t cur_va = va;
-	gipt_pte_type_cb_t pte_type = map->giptm_cbs.giptc_pte_type;
-
-	ASSERT(MUTEX_HELD(&map->giptm_lock));
-	ASSERT3U(max_va, !=, 0);
-	ASSERT3U(ptp, !=, NULL);
-
-	/*
-	 * If a starting table is not provided, search the map for the deepest
-	 * table which contains the VA. If for some reason that VA is beyond
-	 * coverage of the map root, indicate failure.
-	 */
-	if (pt == NULL) {
-		pt = gipt_map_lookup_deepest(map, cur_va);
-		if (pt == NULL) {
-			goto fail;
-		}
-	}
-
-	/*
-	 * From the starting table (at whatever level that may reside), walk
-	 * forward through the PTEs looking for a valid page mapping.
-	 */
-	while (cur_va < max_va) {
-		const uint64_t next_va = gipt_next_va(pt, cur_va);
-		if (next_va == 0) {
-			/*
-			 * The end of this table has been reached. Ascend one
-			 * level to continue the walk if possible. If already
-			 * at the root, the end of the table means failure.
-			 */
-			if (pt->gipt_level >= map->giptm_levels) {
-				goto fail;
-			}
-			pt = gipt_map_lookup(map, cur_va, pt->gipt_level + 1);
-			if (pt == NULL) {
-				goto fail;
-			}
-			continue;
-		} else if (next_va >= max_va) {
-			/*
-			 * Terminate the walk with a failure if the VA
-			 * corresponding to the next PTE is beyond the max.
-			 */
-			goto fail;
-		}
-		cur_va = next_va;
-
-		const uint64_t pte = GIPT_VA2PTE(pt, cur_va);
-		const gipt_pte_type_t ptet = pte_type(pte, pt->gipt_level);
-		if (ptet == PTET_EMPTY) {
-			continue;
-		} else if (ptet == PTET_PAGE) {
-			/* A valid page mapping: success. */
-			*ptp = pt;
-			return (cur_va);
-		} else if (ptet == PTET_LINK) {
-			/*
-			 * A child page table is present at this PTE. Look it
-			 * up from the map.
-			 */
-			ASSERT3U(pt->gipt_level, >, 0);
-			pt = gipt_map_lookup(map, cur_va, pt->gipt_level - 1);
-			ASSERT3P(pt, !=, NULL);
-			break;
-		} else {
-			panic("unexpected PTE type %x @ va %p", ptet,
-			    (void *)cur_va);
-		}
-	}
-
-	/*
-	 * By this point, the above loop has located a table structure to
-	 * descend into in order to find the next page.
-	 */
-	while (cur_va < max_va) {
-		const uint64_t pte = GIPT_VA2PTE(pt, cur_va);
-		const gipt_pte_type_t ptet = pte_type(pte, pt->gipt_level);
-
-		if (ptet == PTET_EMPTY) {
-			const uint64_t next_va = gipt_next_va(pt, cur_va);
-			if (next_va == 0 || next_va >= max_va) {
-				goto fail;
-			}
-			cur_va = next_va;
-			continue;
-		} else if (ptet == PTET_PAGE) {
-			/* A valid page mapping: success. */
-			*ptp = pt;
-			return (cur_va);
-		} else if (ptet == PTET_LINK) {
-			/*
-			 * A child page table is present at this PTE. Look it
-			 * up from the map.
-			 */
-			ASSERT3U(pt->gipt_level, >, 0);
-			pt = gipt_map_lookup(map, cur_va, pt->gipt_level - 1);
-			ASSERT3P(pt, !=, NULL);
-		} else {
-			panic("unexpected PTE type %x @ va %p", ptet,
-			    (void *)cur_va);
-		}
-	}
-
-fail:
-	*ptp = NULL;
-	return (0);
-}
-
-/*
- * Insert a gipt_t into the map based on its VA and level. It is up to the
- * caller to ensure that a duplicate entry does not already exist in the map.
- */
-void
-gipt_map_insert(gipt_map_t *map, gipt_t *pt)
-{
-	const uint_t hash = GIPT_HASH(map, pt->gipt_vaddr, pt->gipt_level);
-
-	ASSERT(MUTEX_HELD(&map->giptm_lock));
-	ASSERT(gipt_map_lookup(map, pt->gipt_vaddr, pt->gipt_level) == NULL);
-	VERIFY3U(pt->gipt_level, <, map->giptm_levels);
-
-	list_insert_head(&map->giptm_hash[hash], pt);
-}
-
-/*
- * Remove a gipt_t from the map.
- */
-void
-gipt_map_remove(gipt_map_t *map, gipt_t *pt)
-{
-	const uint_t hash = GIPT_HASH(map, pt->gipt_vaddr, pt->gipt_level);
-
-	ASSERT(MUTEX_HELD(&map->giptm_lock));
-
-	list_remove(&map->giptm_hash[hash], pt);
-}
-
-/*
- * Given a VA, create any missing gipt_t entries from the specified level all
- * the way up to (but not including) the root. This is done from lowest level
- * to highest, and stops when an existing table covering that VA is found.
- * References to any created gipt_t tables, plus the final "found" gipt_t are
- * stored in 'pts'. The number of gipt_t pointers stored to 'pts' serves as
- * the return value (1 <= val <= root level). It is up to the caller to
- * populate linking PTEs to the newly created empty tables.
- */
-static uint_t
-gipt_map_ensure_chain(gipt_map_t *map, uint64_t va, uint_t lvl, gipt_t **pts)
-{
-	const uint_t root_lvl = map->giptm_root->gipt_level;
-	uint_t clvl = lvl, count = 0;
-	gipt_t *child_pt = NULL;
-
-	ASSERT(MUTEX_HELD(&map->giptm_lock));
-	ASSERT3U(lvl, <, root_lvl);
-	ASSERT3P(map->giptm_root, !=, NULL);
-
-	do {
-		const uint64_t pva = (va & gipt_level_mask[clvl + 1]);
-		gipt_t *pt;
-
-		pt = gipt_map_lookup(map, pva, clvl);
-		if (pt != NULL) {
-			ASSERT3U(pva, ==, pt->gipt_vaddr);
-
-			if (child_pt != NULL) {
-				child_pt->gipt_parent = pt;
-			}
-			pts[count++] = pt;
-			return (count);
-		}
-
-		pt = gipt_alloc();
-		pt->gipt_vaddr = pva;
-		pt->gipt_level = clvl;
-		if (child_pt != NULL) {
-			child_pt->gipt_parent = pt;
-		}
-
-		gipt_map_insert(map, pt);
-		child_pt = pt;
-		pts[count++] = pt;
-		clvl++;
-	} while (clvl <= root_lvl);
-
-	return (count);
-}
-
-/*
- * Ensure that a page table covering a VA at a specified level exists. This
- * will create any necessary tables chaining up to the root as well.
- */
-gipt_t *
-gipt_map_create_parents(gipt_map_t *map, uint64_t va, uint_t lvl)
-{
-	gipt_t *pt, *pts[GIPT_MAX_LEVELS] = { 0 };
-	gipt_pte_type_cb_t pte_type = map->giptm_cbs.giptc_pte_type;
-	gipt_pte_map_cb_t pte_map = map->giptm_cbs.giptc_pte_map;
-	uint64_t *ptep;
-	uint_t i, count;
-
-	ASSERT(MUTEX_HELD(&map->giptm_lock));
-
-	count = gipt_map_ensure_chain(map, va, lvl, pts);
-	if (count == 1) {
-		/* Table already exists in the hierarchy */
-		return (pts[0]);
-	}
-	ASSERT3U(count, >, 1);
-
-	/* Make sure there is not already a large page mapping at the top */
-	pt = pts[count - 1];
-	if (pte_type(GIPT_VA2PTE(pt, va), pt->gipt_level) == PTET_PAGE) {
-		const uint_t end = count - 1;
-
-		/*
-		 * Nuke those gipt_t entries which were optimistically created
-		 * for what was found to be a conflicted mapping.
-		 */
-		for (i = 0; i < end; i++) {
-			gipt_map_remove(map, pts[i]);
-			gipt_free(pts[i]);
-		}
-		return (NULL);
-	}
-
-	/* Initialize the appropriate tables from bottom to top */
-	for (i = 1; i < count; i++) {
-		pt = pts[i];
-		ptep = GIPT_VA2PTEP(pt, va);
-
-		/*
-		 * Since gipt_map_ensure_chain() creates missing tables until
-		 * it find a valid one, and that existing table has been
-		 * checked for the existence of a large page, nothing should
-		 * occupy this PTE.
-		 */
-		ASSERT3U(pte_type(*ptep, pt->gipt_level), ==, PTET_EMPTY);
-
-		*ptep = pte_map(pts[i - 1]->gipt_pfn);
-		pt->gipt_valid_cnt++;
-	}
-
-	return (pts[0]);
-}
-
-/*
- * If a page table is empty, free it from the map, as well as any parent tables
- * that would subsequently become empty as part of the clean-up. As noted in
- * gipt_map_init(), the table root is a special case and will remain in the
- * map, even when empty.
- */
-void
-gipt_map_clean_parents(gipt_map_t *map, gipt_t *pt)
-{
-	ASSERT(MUTEX_HELD(&map->giptm_lock));
-
-	while (pt->gipt_valid_cnt == 0) {
-		gipt_t *parent = pt->gipt_parent;
-		uint64_t *ptep = GIPT_VA2PTEP(parent, pt->gipt_vaddr);
-
-		ASSERT3S(map->giptm_cbs.giptc_pte_type(*ptep,
-		    parent->gipt_level), ==, PTET_LINK);
-
-		/*
-		 * For now, it is assumed that all gipt consumers consider PTE
-		 * zeroing as an adequate action for table unmap.
-		 */
-		*ptep = 0;
-
-		parent->gipt_valid_cnt--;
-		gipt_map_remove(map, pt);
-		gipt_free(pt);
-		pt = parent;
-	}
-}
diff --git a/usr/src/uts/i86pc/sys/gipt.h b/usr/src/uts/i86pc/sys/gipt.h
deleted file mode 100644
index 4d7d523726..0000000000
--- a/usr/src/uts/i86pc/sys/gipt.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- */
-
-/*
- * Copyright 2019 Joyent, Inc.
- */
-
-#ifndef	_GIPT_H_
-#define	_GIPT_H_
-
-#include <sys/types.h>
-#include <sys/mutex.h>
-#include <sys/param.h>
-#include <sys/list.h>
-
-struct gipt {
-	list_node_t	gipt_node;
-	uint64_t	gipt_vaddr;
-	uint64_t	gipt_pfn;
-	uint16_t	gipt_level;
-	uint16_t	gipt_valid_cnt;
-	uint32_t	_gipt_pad;
-	struct gipt	*gipt_parent;
-	uint64_t	*gipt_kva;
-	uint64_t	_gipt_pad2;
-};
-typedef struct gipt gipt_t;
-
-typedef enum {
-	PTET_EMPTY	= 0,
-	PTET_PAGE	= 1,
-	PTET_LINK	= 2,
-} gipt_pte_type_t;
-
-/* Given a PTE and its level, determine the type of that PTE */
-typedef gipt_pte_type_t (*gipt_pte_type_cb_t)(uint64_t, uint_t);
-/* Given the PFN of a child table, emit a PTE that references it */
-typedef uint64_t (*gipt_pte_map_cb_t)(uint64_t);
-
-struct gipt_cbs {
-	gipt_pte_type_cb_t	giptc_pte_type;
-	gipt_pte_map_cb_t	giptc_pte_map;
-};
-
-struct gipt_map {
-	kmutex_t	giptm_lock;
-	gipt_t		*giptm_root;
-	list_t		*giptm_hash;
-	struct gipt_cbs	giptm_cbs;
-	size_t		giptm_table_cnt;
-	uint_t		giptm_levels;
-};
-typedef struct gipt_map gipt_map_t;
-
-#define	GIPT_HASH_SIZE_DEFAULT	0x2000
-#define	GIPT_MAX_LEVELS	4
-
-#define	GIPT_VA2IDX(pt, va)	\
-	(((va) - (pt)->gipt_vaddr) >>	\
-	gipt_level_shift[(pt)->gipt_level])
-
-#define	GIPT_VA2PTE(pt, va)	((pt)->gipt_kva[GIPT_VA2IDX(pt, va)])
-#define	GIPT_VA2PTEP(pt, va)	(&(pt)->gipt_kva[GIPT_VA2IDX(pt, va)])
-
-extern const uint_t gipt_level_shift[GIPT_MAX_LEVELS+1];
-extern const uint64_t gipt_level_mask[GIPT_MAX_LEVELS+1];
-extern const uint64_t gipt_level_size[GIPT_MAX_LEVELS+1];
-extern const uint64_t gipt_level_count[GIPT_MAX_LEVELS+1];
-
-extern gipt_t *gipt_alloc(void);
-extern void gipt_free(gipt_t *);
-extern void gipt_map_init(gipt_map_t *, uint_t, uint_t,
-    const struct gipt_cbs *, gipt_t *);
-extern void gipt_map_fini(gipt_map_t *);
-extern gipt_t *gipt_map_lookup(gipt_map_t *, uint64_t, uint_t);
-extern gipt_t *gipt_map_lookup_deepest(gipt_map_t *, uint64_t);
-extern uint64_t gipt_map_next_page(gipt_map_t *, uint64_t, uint64_t,
-    gipt_t **);
-extern void gipt_map_insert(gipt_map_t *, gipt_t *);
-extern void gipt_map_remove(gipt_map_t *, gipt_t *);
-extern gipt_t *gipt_map_create_parents(gipt_map_t *, uint64_t, uint_t);
-extern void gipt_map_clean_parents(gipt_map_t *, gipt_t *);
-
-#endif /* _GIPT_H_ */
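For orientation, here is a hedged sketch of how a caller might drive the new interface end to end, using only the functions declared in the new vmm_gpt.h and respecting its stated ordering rules (populate before map, unmap before vacate). It assumes kernel context; the `gpa` value, the PROT_* constants (sys/mman.h), and MTRR_TYPE_WB as the memory attribute are illustrative choices, not taken from the commit:

```c
#include <sys/mman.h>
#include <sys/vmm_gpt.h>

/* Illustrative only: map one page of guest-physical space and query it. */
static void
gpt_example(pfn_t pfn)
{
	/* Build an Intel-format table; rvi_create() would yield AMD nPT. */
	vmm_gpt_t *gpt = ept_create();
	const uint64_t gpa = 0x1000;	/* placeholder guest-physical addr */
	uint_t prot;

	/* Inner nodes must exist before a leaf PTE can be written. */
	vmm_gpt_populate_region(gpt, gpa, gpa);
	(void) vmm_gpt_map(gpt, gpa, pfn, PROT_READ | PROT_WRITE,
	    MTRR_TYPE_WB);

	if (vmm_gpt_is_mapped(gpt, gpa, &prot))
		ASSERT((prot & PROT_WRITE) != 0);

	/* Pages must be unmapped before the region is vacated. */
	(void) vmm_gpt_unmap_region(gpt, gpa, gpa);
	vmm_gpt_vacate_region(gpt, gpa, gpa);
	vmm_gpt_free(gpt);
}
```

Note how the format-specific constructors (`ept_create()`/`rvi_create()`) are the only point where EPT and nPT diverge; everything after that goes through the format-agnostic vmm_gpt_* calls, which is the design point of the merge.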