author		Dan McDonald <danmcd@joyent.com>	2021-07-27 01:17:26 -0400
committer	Dan McDonald <danmcd@joyent.com>	2021-07-27 01:17:26 -0400
commit		359c7fef8f9958b51fa2670e2e52e3d3c1eb9bf4
tree		2aedc787e8af163a45c617e337cf0538e8268289
parent		bf777172e611181d7a2838b4e6b59d72913aa3ff
parent		b4ceea05088ba1b5fae1914544a808623516aa80
[illumos-gate merge]

commit b4ceea05088ba1b5fae1914544a808623516aa80
	13932 improve bhyve second level page table support
	13862 EPT/RVI supports resetting A/D bits
Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/uts/i86pc/Makefile.files          |    3
-rw-r--r--  usr/src/uts/i86pc/io/vmm/sys/vmm_gpt.h    |  104
-rw-r--r--  usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h     |   12
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_gpt.c        |  558
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c    |  321
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c    |  369
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c     |    1
-rw-r--r--  usr/src/uts/i86pc/os/gipt.c               |  568
-rw-r--r--  usr/src/uts/i86pc/sys/gipt.h              |   92
9 files changed, 965 insertions, 1063 deletions
diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files
index fd74b0047f..4370e90d9a 100644
--- a/usr/src/uts/i86pc/Makefile.files
+++ b/usr/src/uts/i86pc/Makefile.files
@@ -26,6 +26,7 @@
# Copyright 2019 Joyent, Inc.
# Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
# Copyright 2019 Joyent, Inc.
+# Copyright 2021 Oxide Computer Company
#
# This Makefile defines file modules in the directory uts/i86pc
# and its children. These are the source files which are i86pc
@@ -269,7 +270,7 @@ VMM_OBJS += vmm.o \
vmcb.o \
svm_support.o \
amdv.o \
- gipt.o \
+ vmm_gpt.o \
vmm_sol_vm.o \
vmm_sol_glue.o \
vmm_sol_ept.o \
diff --git a/usr/src/uts/i86pc/io/vmm/sys/vmm_gpt.h b/usr/src/uts/i86pc/io/vmm/sys/vmm_gpt.h
new file mode 100644
index 0000000000..554f51bbb6
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_gpt.h
@@ -0,0 +1,104 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ * Copyright 2021 Oxide Computer Company
+ */
+
+#ifndef _VMM_GPT_H
+#define _VMM_GPT_H
+
+#include <sys/types.h>
+
+typedef struct vmm_pt_ops vmm_pt_ops_t;
+struct vmm_pt_ops {
+ void * (*vpo_init)(uint64_t *);
+ void (*vpo_free)(void *);
+ uint64_t (*vpo_wired_cnt)(void *);
+ int (*vpo_is_wired)(void *, uint64_t, uint_t *);
+ int (*vpo_map)(void *, uint64_t, pfn_t, uint_t, uint_t,
+ uint8_t);
+ uint64_t (*vpo_unmap)(void *, uint64_t, uint64_t);
+};
+
+extern struct vmm_pt_ops ept_ops;
+extern struct vmm_pt_ops rvi_ops;
+
+/*
+ * Constants for the nodes in the GPT radix tree. Note
+ * that, in accordance with hardware page table descriptions,
+ * the root of the tree is referred to as "LEVEL4" while the
+ * leaf level is "LEVEL1".
+ */
+enum vmm_gpt_node_level {
+ LEVEL4 = 0,
+ LEVEL3,
+ LEVEL2,
+ LEVEL1,
+ MAX_GPT_LEVEL,
+};
+
+/*
+ * The vmm_pte_ops structure contains function pointers for format-specific
+ * operations on page table entries. The operations are as follows:
+ *
+ * vpeo_map_table: Creates a PTE that maps an inner node in the page table.
+ * vpeo_map_page: Creates a leaf entry PTE that maps a page of physical memory.
+ * vpeo_pte_pfn: Returns the PFN contained in the given PTE.
+ * vpeo_pte_is_present: Returns true IFF the PTE maps a present page.
+ * vpeo_pte_prot: Returns a bitmask of protection bits for the PTE.
+ * The bits correspond to the standard mmap(2) bits: PROT_READ, PROT_WRITE,
+ * PROT_EXEC.
+ * vpeo_reset_dirty: Resets the dirty bit on the given PTE. If the second
+ * argument is `true`, the bit will be set, otherwise it will be cleared.
+ * Returns non-zero if the previous value of the bit was set.
+ * vpeo_reset_accessed: Resets the accessed bit on the given PTE. If the
+ * second argument is `true`, the bit will be set, otherwise it will be
+ * cleared. Returns non-zero if the previous value of the bit was set.
+ */
+typedef struct vmm_pte_ops vmm_pte_ops_t;
+struct vmm_pte_ops {
+ uint64_t (*vpeo_map_table)(pfn_t);
+ uint64_t (*vpeo_map_page)(pfn_t, uint_t, uint8_t);
+ pfn_t (*vpeo_pte_pfn)(uint64_t);
+ bool (*vpeo_pte_is_present)(uint64_t);
+ uint_t (*vpeo_pte_prot)(uint64_t);
+ uint_t (*vpeo_reset_dirty)(uint64_t *, bool);
+ uint_t (*vpeo_reset_accessed)(uint64_t *, bool);
+};
+
+struct vmm_gpt;
+typedef struct vmm_gpt vmm_gpt_t;
+
+vmm_gpt_t *ept_create(void);
+vmm_gpt_t *rvi_create(void);
+
+vmm_gpt_t *vmm_gpt_alloc(vmm_pte_ops_t *);
+void vmm_gpt_free(vmm_gpt_t *);
+
+void *vmm_gpt_root_kaddr(vmm_gpt_t *);
+pfn_t vmm_gpt_root_pfn(vmm_gpt_t *);
+uint64_t *vmm_gpt_lookup(vmm_gpt_t *, uint64_t);
+void vmm_gpt_walk(vmm_gpt_t *, uint64_t, uint64_t **, enum vmm_gpt_node_level);
+void vmm_gpt_populate_entry(vmm_gpt_t *, uint64_t);
+void vmm_gpt_populate_region(vmm_gpt_t *, uint64_t, uint64_t);
+void vmm_gpt_vacate_region(vmm_gpt_t *, uint64_t, uint64_t);
+bool vmm_gpt_map(vmm_gpt_t *, uint64_t, pfn_t, uint_t, uint8_t);
+bool vmm_gpt_unmap(vmm_gpt_t *, uint64_t);
+size_t vmm_gpt_unmap_region(vmm_gpt_t *, uint64_t, uint64_t);
+
+bool vmm_gpt_is_mapped(vmm_gpt_t *, uint64_t, uint_t *);
+size_t vmm_gpt_mapped_count(vmm_gpt_t *);
+uint_t vmm_gpt_reset_accessed(vmm_gpt_t *, uint64_t *, bool);
+uint_t vmm_gpt_reset_dirty(vmm_gpt_t *, uint64_t *, bool);
+
+#endif /* _VMM_GPT_H */
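For orientation, a minimal sketch of the intended call sequence through the interface above (a hypothetical consumer, not part of this change; the GPA, protection, and attribute values are illustrative, and the region bounds are inclusive per the implementation in vmm_gpt.c):

/*
 * Hypothetical consumer of the vmm_gpt interface declared above.
 */
static void
example_gpt_lifecycle(pfn_t pfn)
{
	vmm_gpt_t *gpt = ept_create();	/* rvi_create() on an AMD host */
	const uint64_t gpa = 0x100000;	/* page-aligned guest-physical address */
	uint_t prot;

	/* Tables covering a region must exist before pages are mapped. */
	vmm_gpt_populate_region(gpt, gpa, gpa);
	(void) vmm_gpt_map(gpt, gpa, pfn, PROT_READ | PROT_WRITE, MTRR_TYPE_WB);

	if (vmm_gpt_is_mapped(gpt, gpa, &prot))
		ASSERT((prot & PROT_WRITE) != 0);

	/* Pages must be unmapped before their region is vacated. */
	(void) vmm_gpt_unmap_region(gpt, gpa, gpa);
	vmm_gpt_vacate_region(gpt, gpa, gpa);
	vmm_gpt_free(gpt);
}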
diff --git a/usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h b/usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h
index 43249a6ac7..6c7f9d423e 100644
--- a/usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h
+++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h
@@ -163,18 +163,6 @@ void *vmspace_find_kva(struct vmspace *, uintptr_t, size_t);
void vmm_arena_init(void);
void vmm_arena_fini(void);
-struct vmm_pt_ops {
- void * (*vpo_init)(uint64_t *);
- void (*vpo_free)(void *);
- uint64_t (*vpo_wired_cnt)(void *);
- int (*vpo_is_wired)(void *, uint64_t, uint_t *);
- int (*vpo_map)(void *, uint64_t, pfn_t, uint_t, uint_t, uint8_t);
- uint64_t (*vpo_unmap)(void *, uint64_t, uint64_t);
-};
-
-extern struct vmm_pt_ops ept_ops;
-extern struct vmm_pt_ops rvi_ops;
-
typedef int (*pmap_pinit_t)(struct pmap *pmap);
struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t, pmap_pinit_t);
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_gpt.c b/usr/src/uts/i86pc/io/vmm/vmm_gpt.c
new file mode 100644
index 0000000000..6624e0fa6d
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm_gpt.c
@@ -0,0 +1,558 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ * Copyright 2021 Oxide Computer Company
+ */
+
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/atomic.h>
+#include <sys/kmem.h>
+#include <sys/sysmacros.h>
+#include <sys/sunddi.h>
+#include <sys/panic.h>
+#include <vm/hat.h>
+#include <vm/as.h>
+#include <vm/hat_i86.h>
+
+#include <sys/vmm_gpt.h>
+
+/*
+ * VMM Generic Page Tables
+ *
+ * Bhyve runs on AMD and Intel hosts and both support nested page tables
+ * describing the guest's physical address space. But the two use different and
+ * mutually incompatible page table formats: Intel uses the EPT, which is based
+ * on the Itanium page table format, while AMD uses the nPT, which is based on
+ * the x86_64 page table format.
+ *
+ * The GPT abstracts these format differences, and provides a single interface
+ * for interacting with either kind of table structure.
+ *
+ * At a high level, the GPT is a tree that mirrors the paging table radix tree.
+ * It is parameterized with operations on PTEs that are specific to the table
+ * type (EPT or nPT) and also keeps track of how many pages the table maps, as
+ * well as a pointer to the root node in the tree.
+ *
+ * A node in the GPT keeps pointers to its parent (NULL for the root), its
+ * left-most child, and its rightward siblings. The node understands its
+ * position in the tree in terms of the level it appears at and the index it
+ * occupies in its parent's table, as well as how many children it has. It also
+ * owns the physical memory page for the hardware page table entries that map
+ * its children. Thus, for a node at any given level in the tree, the nested
+ * PTE for that node's child at index $i$ is the i'th uint64_t in that node's
+ * entry page and the entry page is part of the paging structure consumed by
+ * hardware.
+ *
+ * The GPT interface provides functions for populating and vacating the tree for
+ * regions in the guest physical address space, and for mapping and unmapping
+ * pages in populated regions. Users must populate a region before mapping
+ * pages into it, and must unmap pages before vacating the region.
+ *
+ * The interface also exposes a function for walking the table from the root to
+ * a leaf entry, populating an array of pointers to PTEs. This walk uses the
+ * hardware page structure itself, and is thus fast, though as a result it
+ * potentially aliases entries; caveat emptor. The walk primitive is used for
+ * mapping, unmapping, and lookups.
+ *
+ * Format-specific differences are abstracted by parameterizing the GPT with a
+ * set of PTE operations specific to the platform. The GPT code makes use of
+ * these when mapping or populating entries, resetting accessed and dirty bits
+ * on entries, and similar operations.
+ */
+
+/*
+ * A GPT node.
+ *
+ * Each node contains pointers to its parent, its left-most child, and its
+ * rightward siblings. Interior nodes also maintain a reference count, and
+ * each node contains its level and index in its parent's table. Finally,
+ * each node contains the host PFN of the page that it links into the page
+ * table, as well as a kernel pointer to that table.
+ *
+ * Note, this is carefully sized to fit exactly into a 64-byte cache line.
+ */
+typedef struct vmm_gpt_node vmm_gpt_node_t;
+struct vmm_gpt_node {
+ uint64_t vgn_host_pfn;
+ uint16_t vgn_level;
+ uint16_t vgn_index;
+ uint32_t vgn_ref_cnt;
+ vmm_gpt_node_t *vgn_parent;
+ vmm_gpt_node_t *vgn_children;
+ vmm_gpt_node_t *vgn_siblings;
+ uint64_t *vgn_entries;
+ uint64_t _vgn_pad[2];
+};
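The 64-byte claim can be checked by summing the members (8 + 2 + 2 + 4 + 3 * 8 + 8 + 16 = 64); a compile-time assertion along these lines would express it, though no such check is part of this change:

/* Hypothetical compile-time check of the cache-line-sized node claim. */
CTASSERT(sizeof (struct vmm_gpt_node) == 64);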
+
+/*
+ * A VMM Generic Page Table.
+ *
+ * The generic page table is a format-agnostic, 4-level paging structure
+ * modeling a second-level page table (EPT on Intel, nPT on AMD). It
+ * contains a counter of pages the table maps, a pointer to the root node
+ * in the table, and is parameterized with a set of PTE operations specific
+ * to the table type.
+ */
+struct vmm_gpt {
+ vmm_gpt_node_t *vgpt_root;
+ vmm_pte_ops_t *vgpt_pte_ops;
+ uint64_t vgpt_mapped_page_count;
+};
+
+/*
+ * VMM Guest Page Tables
+ */
+
+/*
+ * Allocates a vmm_gpt_node_t structure along with a page of memory to hold
+ * the PTEs it contains.
+ */
+static vmm_gpt_node_t *
+vmm_gpt_node_alloc(void)
+{
+ vmm_gpt_node_t *node;
+ caddr_t page;
+
+ node = kmem_zalloc(sizeof (*node), KM_SLEEP);
+ /*
+ * Note: despite the man page, allocating PAGESIZE bytes is
+ * guaranteed to be page-aligned.
+ */
+ page = kmem_zalloc(PAGESIZE, KM_SLEEP);
+ node->vgn_entries = (uint64_t *)page;
+ node->vgn_host_pfn = hat_getpfnum(kas.a_hat, page);
+
+ return (node);
+}
+
+/*
+ * Allocates and initializes a vmm_gpt_t.
+ */
+vmm_gpt_t *
+vmm_gpt_alloc(vmm_pte_ops_t *pte_ops)
+{
+ vmm_gpt_t *gpt;
+
+ VERIFY(pte_ops != NULL);
+ gpt = kmem_zalloc(sizeof (*gpt), KM_SLEEP);
+ gpt->vgpt_pte_ops = pte_ops;
+ gpt->vgpt_root = vmm_gpt_node_alloc();
+
+ return (gpt);
+}
+
+/*
+ * Retrieves the host kernel address of the GPT root.
+ */
+void *
+vmm_gpt_root_kaddr(vmm_gpt_t *gpt)
+{
+ return (gpt->vgpt_root->vgn_entries);
+}
+
+/*
+ * Retrieves the host PFN of the GPT root.
+ */
+uint64_t
+vmm_gpt_root_pfn(vmm_gpt_t *gpt)
+{
+ return (gpt->vgpt_root->vgn_host_pfn);
+}
+
+/*
+ * Frees the given node, first nulling out all of its links to other nodes in
+ * the tree, adjusting its parent's reference count, and unlinking it from
+ * its parent's page table.
+ */
+static void
+vmm_gpt_node_free(vmm_gpt_node_t *node)
+{
+ ASSERT(node != NULL);
+ ASSERT3U(node->vgn_ref_cnt, ==, 0);
+ ASSERT(node->vgn_host_pfn != PFN_INVALID);
+ ASSERT(node->vgn_entries != NULL);
+ if (node->vgn_parent != NULL) {
+ uint64_t *parent_entries = node->vgn_parent->vgn_entries;
+ parent_entries[node->vgn_index] = 0;
+ node->vgn_parent->vgn_ref_cnt--;
+ }
+ kmem_free(node->vgn_entries, PAGESIZE);
+ kmem_free(node, sizeof (*node));
+}
+
+/*
+ * Frees the portion of the radix tree rooted at the given node.
+ */
+static void
+vmm_gpt_node_tree_free(vmm_gpt_node_t *node)
+{
+ ASSERT(node != NULL);
+
+ for (vmm_gpt_node_t *child = node->vgn_children, *next = NULL;
+ child != NULL;
+ child = next) {
+ next = child->vgn_siblings;
+ vmm_gpt_node_tree_free(child);
+ }
+ vmm_gpt_node_free(node);
+}
+
+/*
+ * Cleans up a vmm_gpt_t by removing any lingering vmm_gpt_node_t entries
+ * it refers to.
+ */
+void
+vmm_gpt_free(vmm_gpt_t *gpt)
+{
+ vmm_gpt_node_tree_free(gpt->vgpt_root);
+ kmem_free(gpt, sizeof (*gpt));
+}
+
+/*
+ * Returns the index into the page table at the given level for the given GPA.
+ */
+static inline uint16_t
+vmm_gpt_node_index(uint64_t gpa, enum vmm_gpt_node_level level)
+{
+ const int SHIFTS[MAX_GPT_LEVEL] = { 39, 30, 21, 12 };
+ const uint_t MASK = (1U << 9) - 1;
+ ASSERT(level < MAX_GPT_LEVEL);
+ return ((gpa >> SHIFTS[level]) & MASK);
+}
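A short worked example of the index extraction (values chosen purely for illustration): each level consumes nine bits of the GPA, starting at bit 12 for LEVEL1 and bit 39 for LEVEL4.

/*
 * Illustrative decomposition: for
 *	gpa = (1UL << 39) | (2UL << 30) | (3UL << 21) | (4UL << 12)
 * the per-level indices are
 *	vmm_gpt_node_index(gpa, LEVEL4) == 1
 *	vmm_gpt_node_index(gpa, LEVEL3) == 2
 *	vmm_gpt_node_index(gpa, LEVEL2) == 3
 *	vmm_gpt_node_index(gpa, LEVEL1) == 4
 */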
+
+/*
+ * Finds the child for the given GPA in the given parent node.
+ * Returns a pointer to the node, or NULL if it is not found.
+ */
+static vmm_gpt_node_t *
+vmm_gpt_node_find_child(vmm_gpt_node_t *parent, uint64_t gpa)
+{
+ if (parent == NULL)
+ return (NULL);
+
+ const uint16_t index = vmm_gpt_node_index(gpa, parent->vgn_level);
+ for (vmm_gpt_node_t *child = parent->vgn_children;
+ child != NULL && child->vgn_index <= index;
+ child = child->vgn_siblings) {
+ if (child->vgn_index == index)
+ return (child);
+ }
+
+ return (NULL);
+}
+
+/*
+ * Walks the GPT for the given GPA, accumulating entries to the given depth. If
+ * the walk terminates before the depth is reached, the remaining entries are
+ * written with NULLs.
+ */
+void
+vmm_gpt_walk(vmm_gpt_t *gpt, uint64_t gpa, uint64_t **entries,
+ enum vmm_gpt_node_level depth)
+{
+ uint64_t *current_entries, entry;
+ pfn_t pfn;
+
+ ASSERT(gpt != NULL);
+ current_entries = gpt->vgpt_root->vgn_entries;
+ for (uint_t i = 0; i < depth; i++) {
+ if (current_entries == NULL) {
+ entries[i] = NULL;
+ continue;
+ }
+ entries[i] = &current_entries[vmm_gpt_node_index(gpa, i)];
+ entry = *entries[i];
+ if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) {
+ current_entries = NULL;
+ continue;
+ }
+ pfn = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry);
+ current_entries = (uint64_t *)hat_kpm_pfn2va(pfn);
+ }
+}
+
+/*
+ * Looks up the leaf entry for the given GPA.
+ */
+uint64_t *
+vmm_gpt_lookup(vmm_gpt_t *gpt, uint64_t gpa)
+{
+ uint64_t *entries[MAX_GPT_LEVEL];
+
+ vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
+
+ return (entries[LEVEL1]);
+}
+
+/*
+ * Adds a node for the given GPA to the GPT as a child of the given parent.
+ */
+static void
+vmm_gpt_add_child(vmm_gpt_t *gpt, vmm_gpt_node_t *parent, vmm_gpt_node_t *child,
+ uint64_t gpa)
+{
+ vmm_gpt_node_t **prevp;
+ vmm_gpt_node_t *node;
+ uint64_t *parent_entries, entry;
+
+ ASSERT(gpt != NULL);
+ ASSERT(gpt->vgpt_pte_ops != NULL);
+ ASSERT(parent != NULL);
+ ASSERT(child != NULL);
+
+ const int index = vmm_gpt_node_index(gpa, parent->vgn_level);
+ child->vgn_index = index;
+ child->vgn_level = parent->vgn_level + 1;
+ child->vgn_parent = parent;
+ parent_entries = parent->vgn_entries;
+ entry = gpt->vgpt_pte_ops->vpeo_map_table(child->vgn_host_pfn);
+ parent_entries[index] = entry;
+
+ for (prevp = &parent->vgn_children, node = parent->vgn_children;
+ node != NULL;
+ prevp = &node->vgn_siblings, node = node->vgn_siblings) {
+ if (node->vgn_index > child->vgn_index) {
+ break;
+ }
+ }
+ if (node != NULL)
+ ASSERT3U(node->vgn_index, !=, child->vgn_index);
+ child->vgn_siblings = node;
+ *prevp = child;
+ parent->vgn_ref_cnt++;
+}
+
+/*
+ * Populates the GPT with nodes so that entries for the given GPA exist. Note
+ * that this does not actually map the entry, but simply ensures that the
+ * entries exist.
+ */
+void
+vmm_gpt_populate_entry(vmm_gpt_t *gpt, uint64_t gpa)
+{
+ vmm_gpt_node_t *node, *child;
+
+ ASSERT(gpt != NULL);
+ node = gpt->vgpt_root;
+ for (uint_t i = 0; i < LEVEL1; i++) {
+ ASSERT(node != NULL);
+ child = vmm_gpt_node_find_child(node, gpa);
+ if (child == NULL) {
+ child = vmm_gpt_node_alloc();
+ ASSERT(child != NULL);
+ vmm_gpt_add_child(gpt, node, child, gpa);
+ }
+ node = child;
+ }
+}
+
+/*
+ * Ensures that PTEs for the region of address space bounded by
+ * [start, end] exist in the tree.
+ */
+void
+vmm_gpt_populate_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
+{
+ for (uint64_t page = start; page <= end; page += PAGESIZE) {
+ vmm_gpt_populate_entry(gpt, page);
+ }
+}
+
+/*
+ * Inserts an entry for a given GPA into the table. The caller must
+ * ensure that the entry is not currently mapped, though note that this
+ * can race with another thread inserting the same page into the tree.
+ * If we lose the race, we ensure that the page we thought we were
+ * inserting is the page that was inserted.
+ */
+bool
+vmm_gpt_map(vmm_gpt_t *gpt, uint64_t gpa, pfn_t pfn, uint_t prot, uint8_t attr)
+{
+ uint64_t *entries[MAX_GPT_LEVEL], entry, old_entry;
+
+ ASSERT(gpt != NULL);
+ vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
+ ASSERT(entries[LEVEL1] != NULL);
+
+ entry = gpt->vgpt_pte_ops->vpeo_map_page(pfn, prot, attr);
+ old_entry = atomic_cas_64(entries[LEVEL1], 0, entry);
+ if (old_entry != 0) {
+ ASSERT3U(gpt->vgpt_pte_ops->vpeo_pte_pfn(entry),
+ ==,
+ gpt->vgpt_pte_ops->vpeo_pte_pfn(old_entry));
+ return (false);
+ }
+ gpt->vgpt_mapped_page_count++;
+
+ return (true);
+}
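A sketch of how a backend might drive the populate-then-map sequence under its own lock (hypothetical; the EPT and RVI shims later in this change follow the same pattern, and the protection and attribute values here are illustrative):

/*
 * Hypothetical caller: populate the covering tables, then install the
 * mapping. If another thread won the race, vmm_gpt_map() returns false
 * after asserting that the winning entry maps the same PFN.
 */
static void
example_backend_map(vmm_gpt_t *gpt, kmutex_t *lock, uint64_t gpa, pfn_t pfn)
{
	mutex_enter(lock);
	vmm_gpt_populate_entry(gpt, gpa);
	(void) vmm_gpt_map(gpt, gpa, pfn, PROT_READ, MTRR_TYPE_WB);
	mutex_exit(lock);
}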
+
+/*
+ * Removes a child node from its parent's list of children, and then frees
+ * the now-orphaned child.
+ */
+static void
+vmm_gpt_node_remove_child(vmm_gpt_node_t *parent, vmm_gpt_node_t *child)
+{
+ ASSERT(parent != NULL);
+
+ ASSERT3P(child->vgn_children, ==, NULL);
+ vmm_gpt_node_t **prevp = &parent->vgn_children;
+ for (vmm_gpt_node_t *node = parent->vgn_children;
+ node != NULL;
+ prevp = &node->vgn_siblings, node = node->vgn_siblings) {
+ if (node == child) {
+ *prevp = node->vgn_siblings;
+ vmm_gpt_node_free(node);
+ break;
+ }
+ }
+}
+
+/*
+ * Cleans up unused inner nodes in the GPT. Asserts that the leaf table
+ * corresponding to the entry does not map any pages.
+ */
+static void
+vmm_gpt_vacate_entry(vmm_gpt_t *gpt, uint64_t gpa)
+{
+ vmm_gpt_node_t *nodes[MAX_GPT_LEVEL], *node;
+
+ node = gpt->vgpt_root;
+ for (uint_t i = 0; i < MAX_GPT_LEVEL; i++) {
+ nodes[i] = node;
+ node = vmm_gpt_node_find_child(node, gpa);
+ }
+ if (nodes[LEVEL1] != NULL) {
+ uint64_t *ptes = nodes[LEVEL1]->vgn_entries;
+ for (uint_t i = 0; i < (PAGESIZE / sizeof (uint64_t)); i++)
+ ASSERT3U(ptes[i], ==, 0);
+ }
+ for (uint_t i = LEVEL1; i > 0; i--) {
+ if (nodes[i] == NULL)
+ continue;
+ if (nodes[i]->vgn_ref_cnt != 0)
+ break;
+ vmm_gpt_node_remove_child(nodes[i - 1], nodes[i]);
+ }
+}
+
+/*
+ * Cleans up the unused inner nodes in the GPT for a region of guest
+ * physical address space bounded by `start` and `end`. The region
+ * must map no pages.
+ */
+void
+vmm_gpt_vacate_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
+{
+ for (uint64_t page = start; page <= end; page += PAGESIZE) {
+ vmm_gpt_vacate_entry(gpt, page);
+ }
+}
+
+/*
+ * Removes a mapping from the table. Returns false if the page was not
+ * mapped, otherwise returns true.
+ */
+bool
+vmm_gpt_unmap(vmm_gpt_t *gpt, uint64_t gpa)
+{
+ uint64_t *entries[MAX_GPT_LEVEL], entry;
+ bool was_mapped;
+
+ ASSERT(gpt != NULL);
+ vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
+ if (entries[LEVEL1] == NULL)
+ return (false);
+
+ entry = *entries[LEVEL1];
+ *entries[LEVEL1] = 0;
+ was_mapped = gpt->vgpt_pte_ops->vpeo_pte_is_present(entry);
+ if (was_mapped)
+ gpt->vgpt_mapped_page_count--;
+
+ return (was_mapped);
+}
+
+/*
+ * Unmaps the region of guest physical address space bounded by
+ * start and end. Returns the number of pages that are unmapped.
+ */
+size_t
+vmm_gpt_unmap_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
+{
+ size_t n = 0;
+
+ for (uint64_t page = start; page <= end; page += PAGESIZE) {
+ if (vmm_gpt_unmap(gpt, page) != 0)
+ n++;
+ }
+
+ return (n);
+}
+
+/*
+ * Returns a value indicating whether or not this GPT maps the given
+ * GPA. If the GPA is mapped, *protp will be filled with the protection
+ * bits of the entry. Otherwise, it will be ignored.
+ */
+bool
+vmm_gpt_is_mapped(vmm_gpt_t *gpt, uint64_t gpa, uint_t *protp)
+{
+ uint64_t *entries[MAX_GPT_LEVEL], entry;
+
+ vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
+ if (entries[LEVEL1] == NULL)
+ return (false);
+ entry = *entries[LEVEL1];
+ if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry))
+ return (false);
+ *protp = gpt->vgpt_pte_ops->vpeo_pte_prot(entry);
+
+ return (true);
+}
+
+/*
+ * Returns the number of pages that are mapped in by this GPT.
+ */
+size_t
+vmm_gpt_mapped_count(vmm_gpt_t *gpt)
+{
+ return (gpt->vgpt_mapped_page_count);
+}
+
+/*
+ * Resets the accessed bit on the page table entry pointed to by `entry`.
+ * If `on` is true, the bit will be set, otherwise it will be cleared.
+ * The old value of the bit is returned.
+ */
+uint_t
+vmm_gpt_reset_accessed(vmm_gpt_t *gpt, uint64_t *entry, bool on)
+{
+ ASSERT(entry != NULL);
+ return (gpt->vgpt_pte_ops->vpeo_reset_accessed(entry, on));
+}
+
+/*
+ * Resets the dirty bit on the page table entry pointed to by `entry`.
+ * If `on` is true, the bit will be set, otherwise it will be cleared.
+ * The old value of the bit is returned.
+ */
+uint_t
+vmm_gpt_reset_dirty(vmm_gpt_t *gpt, uint64_t *entry, bool on)
+{
+ ASSERT(entry != NULL);
+ return (gpt->vgpt_pte_ops->vpeo_reset_dirty(entry, on));
+}
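As an illustration of the A/D-bit interface added for issue 13862, a hedged sketch of a dirty-page scan built on vmm_gpt_lookup() and vmm_gpt_reset_dirty(); the region bounds and counting policy are assumptions, not part of this change:

/*
 * Hypothetical dirty-page scan: clear the dirty bit across a region and
 * count how many leaf entries had it set.
 */
static size_t
example_scan_dirty(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
{
	size_t ndirty = 0;

	for (uint64_t gpa = start; gpa <= end; gpa += PAGESIZE) {
		uint64_t *ptep = vmm_gpt_lookup(gpt, gpa);

		/* Skip GPAs whose covering tables are not populated. */
		if (ptep == NULL)
			continue;
		if (vmm_gpt_reset_dirty(gpt, ptep, false) != 0)
			ndirty++;
	}

	return (ndirty);
}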
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c
index b43a6cac1d..3d357f37d2 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c
@@ -17,31 +17,28 @@
#include <sys/types.h>
#include <sys/param.h>
+#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/machsystm.h>
#include <sys/mman.h>
-#include <sys/gipt.h>
+#include <sys/vmm_gpt.h>
#include <sys/vmm_vm.h>
+typedef struct ept_map ept_map_t;
struct ept_map {
- gipt_map_t em_gipt;
- uint64_t em_wired_page_count;
+ vmm_gpt_t *em_gpt;
+ kmutex_t em_lock;
};
-typedef struct ept_map ept_map_t;
-
-#define EPT_LOCK(m) (&(m)->em_gipt.giptm_lock)
-#define EPT_MAX_LEVELS 4
-
-CTASSERT(EPT_MAX_LEVELS <= GIPT_MAX_LEVELS);
-
-#define EPT_R (0x1 << 0)
-#define EPT_W (0x1 << 1)
-#define EPT_X (0x1 << 2)
+#define EPT_R (1 << 0)
+#define EPT_W (1 << 1)
+#define EPT_X (1 << 2)
#define EPT_RWX (EPT_R | EPT_W | EPT_X)
-#define EPT_LGPG (0x1 << 7)
+#define EPT_LGPG (1 << 7)
+#define EPT_ACCESSED (1 << 8)
+#define EPT_DIRTY (1 << 9)
#define EPT_PA_MASK (0x000ffffffffff000ull)
@@ -49,223 +46,183 @@ CTASSERT(EPT_R == PROT_READ);
CTASSERT(EPT_W == PROT_WRITE);
CTASSERT(EPT_X == PROT_EXEC);
+static uint_t
+ept_pte_prot(uint64_t pte)
+{
+ return (pte & EPT_RWX);
+}
-#define EPT_PAT(attr) (((attr) & 0x7) << 3)
-#define EPT_PADDR(addr) ((addr) & EPT_PA_MASK)
+static inline uint64_t
+ept_attr_to_pat(uint8_t attr)
+{
+ uint64_t bits = attr & 0x7;
+ return (bits << 3);
+}
-#define EPT_IS_ABSENT(pte) (((pte) & EPT_RWX) == 0)
-#define EPT_PTE_PFN(pte) mmu_btop(EPT_PADDR(pte))
-#define EPT_PTE_PROT(pte) ((pte) & EPT_RWX)
-#define EPT_MAPS_PAGE(pte, lvl) \
- (EPT_PTE_PROT(pte) != 0 && (((pte) & EPT_LGPG) != 0 || (lvl) == 0))
+static uint64_t
+ept_map_table(uint64_t pfn)
+{
+ const uint64_t paddr = pfn_to_pa(pfn) & EPT_PA_MASK;
+ return (paddr | EPT_RWX);
+}
-/*
- * Only assign EPT_LGPG for levels higher than 0. Although this bit is defined
- * as being ignored at level 0, some versions of VMWare fail to honor this and
- * report such a PTE as an EPT mis-configuration.
- */
-#define EPT_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr) \
- (EPT_PADDR(pfn_to_pa(pfn)) | \
- (((lvl) != 0) ? EPT_LGPG : 0) | \
- EPT_PAT(attr) | ((prot) & EPT_RWX))
-#define EPT_PTE_ASSIGN_TABLE(pfn) (EPT_PADDR(pfn_to_pa(pfn)) | EPT_RWX)
+static uint64_t
+ept_map_page(uint64_t pfn, uint_t prot, uint8_t attr)
+{
+ const uint64_t paddr = pfn_to_pa(pfn) & EPT_PA_MASK;
+ const uint64_t pat = ept_attr_to_pat(attr);
+ const uint64_t rprot = prot & EPT_RWX;
+ return (paddr | pat | rprot);
+}
+static uint64_t
+ept_pte_pfn(uint64_t pte)
+{
+ return (mmu_btop(pte & PT_PADDR));
+}
-static gipt_pte_type_t
-ept_pte_type(uint64_t pte, uint_t level)
+static bool
+ept_pte_is_present(uint64_t pte)
{
- if (EPT_IS_ABSENT(pte)) {
- return (PTET_EMPTY);
- } else if (EPT_MAPS_PAGE(pte, level)) {
- return (PTET_PAGE);
- } else {
- return (PTET_LINK);
- }
+ return ((pte & EPT_RWX) != 0);
}
-static uint64_t
-ept_pte_map(uint64_t pfn)
+static uint_t
+ept_reset_bits(volatile uint64_t *entry, uint64_t mask, uint64_t bits)
{
- return (EPT_PTE_ASSIGN_TABLE(pfn));
+ uint64_t pte, newpte, oldpte = 0;
+
+ /*
+ * We use volatile and atomic ops here because we may be
+ * racing against hardware modifying these bits.
+ */
+ VERIFY3P(entry, !=, NULL);
+ oldpte = *entry;
+ do {
+ pte = oldpte;
+ newpte = (pte & ~mask) | bits;
+ oldpte = atomic_cas_64(entry, pte, newpte);
+ } while (oldpte != pte);
+
+ return (oldpte & mask);
+}
+
+static uint_t
+ept_reset_dirty(uint64_t *entry, bool on)
+{
+ return (ept_reset_bits(entry, EPT_DIRTY,
+ on ? (EPT_DIRTY | EPT_ACCESSED) : 0));
+}
+
+static uint_t
+ept_reset_accessed(uint64_t *entry, bool on)
+{
+ return (ept_reset_bits(entry, EPT_DIRTY | EPT_ACCESSED,
+ on ? EPT_ACCESSED : 0));
+}
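Because the helpers return the prior state of the bit, a caller can test-and-clear in a single atomic step; note that turning the dirty bit on also sets the accessed bit, since a dirty page is necessarily an accessed one. A minimal illustrative caller (hypothetical, not part of this change):

/* Hypothetical caller: atomically test and clear the dirty bit of a PTE. */
static bool
example_ept_test_and_clear_dirty(uint64_t *ptep)
{
	return (ept_reset_dirty(ptep, false) != 0);
}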
+
+static vmm_pte_ops_t ept_pte_ops = {
+ .vpeo_map_table = ept_map_table,
+ .vpeo_map_page = ept_map_page,
+ .vpeo_pte_pfn = ept_pte_pfn,
+ .vpeo_pte_is_present = ept_pte_is_present,
+ .vpeo_pte_prot = ept_pte_prot,
+ .vpeo_reset_dirty = ept_reset_dirty,
+ .vpeo_reset_accessed = ept_reset_accessed,
+};
+
+vmm_gpt_t *
+ept_create(void)
+{
+ return (vmm_gpt_alloc(&ept_pte_ops));
}
static void *
-ept_create(uintptr_t *pml4_kaddr)
+ept_ops_create(uintptr_t *root_kaddr)
{
- ept_map_t *emap;
- gipt_map_t *map;
- gipt_t *root;
- struct gipt_cbs cbs = {
- .giptc_pte_type = ept_pte_type,
- .giptc_pte_map = ept_pte_map,
- };
-
- emap = kmem_zalloc(sizeof (*emap), KM_SLEEP);
- map = &emap->em_gipt;
- root = gipt_alloc();
- root->gipt_level = EPT_MAX_LEVELS - 1;
- gipt_map_init(map, EPT_MAX_LEVELS, GIPT_HASH_SIZE_DEFAULT, &cbs, root);
-
- *pml4_kaddr = (uintptr_t)root->gipt_kva;
- return (emap);
+ ept_map_t *map;
+
+ map = kmem_zalloc(sizeof (*map), KM_SLEEP);
+ mutex_init(&map->em_lock, NULL, MUTEX_DEFAULT, NULL);
+ map->em_gpt = ept_create();
+ *root_kaddr = (uintptr_t)vmm_gpt_root_kaddr(map->em_gpt);
+
+ return (map);
}
static void
-ept_destroy(void *arg)
+ept_ops_destroy(void *arg)
{
- ept_map_t *emap = arg;
-
- if (emap != NULL) {
- gipt_map_t *map = &emap->em_gipt;
+ ept_map_t *map = arg;
- gipt_map_fini(map);
- kmem_free(emap, sizeof (*emap));
+ if (map != NULL) {
+ vmm_gpt_free(map->em_gpt);
+ mutex_destroy(&map->em_lock);
+ kmem_free(map, sizeof (*map));
}
}
static uint64_t
-ept_wired_count(void *arg)
+ept_ops_wired_count(void *arg)
{
- ept_map_t *emap = arg;
+ ept_map_t *map = arg;
uint64_t res;
- mutex_enter(EPT_LOCK(emap));
- res = emap->em_wired_page_count;
- mutex_exit(EPT_LOCK(emap));
+ mutex_enter(&map->em_lock);
+ res = vmm_gpt_mapped_count(map->em_gpt);
+ mutex_exit(&map->em_lock);
return (res);
}
static int
-ept_is_wired(void *arg, uint64_t va, uint_t *protp)
+ept_ops_is_wired(void *arg, uint64_t gpa, uint_t *protp)
{
- ept_map_t *emap = arg;
- gipt_t *pt;
- int rv = -1;
-
- mutex_enter(EPT_LOCK(emap));
- pt = gipt_map_lookup_deepest(&emap->em_gipt, va);
- if (pt != NULL) {
- const uint64_t pte = GIPT_VA2PTE(pt, va);
-
- if (EPT_MAPS_PAGE(pte, pt->gipt_level)) {
- *protp = EPT_PTE_PROT(pte);
- rv = 0;
- }
- }
- mutex_exit(EPT_LOCK(emap));
+ ept_map_t *map = arg;
+ bool mapped;
+
+ mutex_enter(&map->em_lock);
+ mapped = vmm_gpt_is_mapped(map->em_gpt, gpa, protp);
+ mutex_exit(&map->em_lock);
- return (rv);
+ return (mapped ? 0 : -1);
}
static int
-ept_map(void *arg, uint64_t va, pfn_t pfn, uint_t lvl, uint_t prot,
+ept_ops_map(void *arg, uint64_t gpa, pfn_t pfn, uint_t _lvl, uint_t prot,
uint8_t attr)
{
- ept_map_t *emap = arg;
- gipt_map_t *map = &emap->em_gipt;
- gipt_t *pt;
- uint64_t *ptep, pte;
+ ept_map_t *map = arg;
ASSERT((prot & EPT_RWX) != 0 && (prot & ~EPT_RWX) == 0);
- ASSERT3U(lvl, <, EPT_MAX_LEVELS);
-
- mutex_enter(EPT_LOCK(emap));
- pt = gipt_map_lookup(map, va, lvl);
- if (pt == NULL) {
- /*
- * A table at the appropriate VA/level that would house this
- * mapping does not currently exist. Try to walk down to that
- * point, creating any necessary parent(s).
- */
- pt = gipt_map_create_parents(map, va, lvl);
-
- /*
- * There was a large page mapping in the way of creating the
- * necessary parent table(s).
- */
- if (pt == NULL) {
- panic("unexpected large page @ %08lx", va);
- }
- }
- ptep = GIPT_VA2PTEP(pt, va);
-
- pte = *ptep;
- if (!EPT_IS_ABSENT(pte)) {
- if (!EPT_MAPS_PAGE(pte, lvl)) {
- panic("unexpected PT link @ %08lx in %p", va, pt);
- } else {
- panic("unexpected page mapped @ %08lx in %p", va, pt);
- }
- }
- pte = EPT_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr);
- *ptep = pte;
- pt->gipt_valid_cnt++;
- emap->em_wired_page_count += gipt_level_count[lvl];
+ mutex_enter(&map->em_lock);
+ vmm_gpt_populate_entry(map->em_gpt, gpa);
+ (void) vmm_gpt_map(map->em_gpt, gpa, pfn, prot, attr);
+ mutex_exit(&map->em_lock);
- mutex_exit(EPT_LOCK(emap));
return (0);
}
static uint64_t
-ept_unmap(void *arg, uint64_t va, uint64_t end_va)
+ept_ops_unmap(void *arg, uint64_t start, uint64_t end)
{
- ept_map_t *emap = arg;
- gipt_map_t *map = &emap->em_gipt;
- gipt_t *pt;
- uint64_t cur_va = va;
- uint64_t unmapped = 0;
-
- mutex_enter(EPT_LOCK(emap));
-
- pt = gipt_map_lookup_deepest(map, cur_va);
- if (pt == NULL) {
- mutex_exit(EPT_LOCK(emap));
- return (0);
- }
- if (!EPT_MAPS_PAGE(GIPT_VA2PTE(pt, cur_va), pt->gipt_level)) {
- cur_va = gipt_map_next_page(map, cur_va, end_va, &pt);
- if (cur_va == 0) {
- mutex_exit(EPT_LOCK(emap));
- return (0);
- }
- }
-
- while (cur_va < end_va) {
- uint64_t *ptep = GIPT_VA2PTEP(pt, cur_va);
- const uint_t lvl = pt->gipt_level;
-
- ASSERT(EPT_MAPS_PAGE(*ptep, lvl));
- *ptep = 0;
- pt->gipt_valid_cnt--;
- unmapped += gipt_level_count[pt->gipt_level];
-
- gipt_t *next_pt = pt;
- uint64_t next_va;
- next_va = gipt_map_next_page(map, cur_va, end_va, &next_pt);
-
- if (pt->gipt_valid_cnt == 0) {
- gipt_map_clean_parents(map, pt);
- }
- if (next_va == 0) {
- break;
- }
- pt = next_pt;
- cur_va = next_va;
- }
- emap->em_wired_page_count -= unmapped;
+ ept_map_t *map = arg;
+ size_t unmapped = 0;
- mutex_exit(EPT_LOCK(emap));
+ mutex_enter(&map->em_lock);
+ unmapped = vmm_gpt_unmap_region(map->em_gpt, start, end);
+ vmm_gpt_vacate_region(map->em_gpt, start, end);
+ mutex_exit(&map->em_lock);
- return (unmapped);
+ return ((uint64_t)unmapped);
}
struct vmm_pt_ops ept_ops = {
- .vpo_init = ept_create,
- .vpo_free = ept_destroy,
- .vpo_wired_cnt = ept_wired_count,
- .vpo_is_wired = ept_is_wired,
- .vpo_map = ept_map,
- .vpo_unmap = ept_unmap,
+ .vpo_init = ept_ops_create,
+ .vpo_free = ept_ops_destroy,
+ .vpo_wired_cnt = ept_ops_wired_count,
+ .vpo_is_wired = ept_ops_is_wired,
+ .vpo_map = ept_ops_map,
+ .vpo_unmap = ept_ops_unmap,
};
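For context, a hedged sketch of how a vmm_pt_ops backend is exercised by its consumer (the vmspace code is not part of this hunk; the names and values below are illustrative only):

/* Hypothetical driver of the vmm_pt_ops vtable. */
static void
example_pt_ops_usage(struct vmm_pt_ops *ops, uint64_t gpa, pfn_t pfn)
{
	uint64_t root_kaddr;
	void *impl = ops->vpo_init(&root_kaddr);
	uint_t prot;

	(void) ops->vpo_map(impl, gpa, pfn, 0, PROT_READ | PROT_WRITE,
	    MTRR_TYPE_WB);
	if (ops->vpo_is_wired(impl, gpa, &prot) == 0)
		ASSERT3U(ops->vpo_wired_cnt(impl), >=, 1);

	/* Region bounds are treated as inclusive by the new implementation. */
	(void) ops->vpo_unmap(impl, gpa, gpa);
	ops->vpo_free(impl);
}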
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c
index f82ea64994..c66a4e7962 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c
@@ -17,284 +17,237 @@
#include <sys/types.h>
#include <sys/param.h>
+#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/machsystm.h>
#include <sys/mach_mmu.h>
#include <sys/mman.h>
#include <sys/x86_archext.h>
+#include <vm/hat_pte.h>
-#include <sys/gipt.h>
+#include <sys/vmm_gpt.h>
#include <sys/vmm_vm.h>
+typedef struct rvi_map rvi_map_t;
struct rvi_map {
- gipt_map_t rm_gipt;
- uint64_t rm_wired_page_count;
+ vmm_gpt_t *rm_gpt;
+ kmutex_t rm_lock;
};
-typedef struct rvi_map rvi_map_t;
-#define RVI_LOCK(m) (&(m)->rm_gipt.giptm_lock)
-
-#define RVI_MAX_LEVELS 4
-
-CTASSERT(RVI_MAX_LEVELS <= GIPT_MAX_LEVELS);
-
-#define RVI_PRESENT PT_VALID
-#define RVI_WRITABLE PT_WRITABLE
-#define RVI_ACCESSED PT_REF
-#define RVI_DIRTY PT_MOD
-#define RVI_LGPG PT_PAGESIZE
-#define RVI_NX PT_NX
-#define RVI_USER PT_USER
-#define RVI_PWT PT_WRITETHRU
-#define RVI_PCD PT_NOCACHE
-
-#define RVI_PA_MASK PT_PADDR
-
-#define RVI_PAT(attr) rvi_attr_to_pat(attr)
-#define RVI_PADDR(addr) ((addr) & RVI_PA_MASK)
-#define RVI_PROT(prot) \
- ((((prot) & PROT_WRITE) != 0 ? RVI_WRITABLE : 0) | \
- (((prot) & PROT_EXEC) == 0 ? RVI_NX : 0))
-
-#define RVI_IS_ABSENT(pte) (((pte) & RVI_PRESENT) == 0)
-#define RVI_PTE_PFN(pte) mmu_btop(RVI_PADDR(pte))
-#define RVI_MAPS_PAGE(pte, lvl) \
- (!RVI_IS_ABSENT(pte) && (((pte) & RVI_LGPG) != 0 || (lvl) == 0))
-#define RVI_PTE_PROT(pte) \
- (RVI_IS_ABSENT(pte) ? 0 : ( \
- PROT_READ | \
- (((pte) & RVI_NX) == 0 ? PROT_EXEC : 0) | \
- (((pte) & RVI_WRITABLE) != 0 ? PROT_WRITE : 0)))
-
-#define RVI_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr) \
- (RVI_PADDR(pfn_to_pa(pfn)) | \
- (((lvl) != 0) ? RVI_LGPG : 0) | \
- RVI_USER | RVI_ACCESSED | RVI_PRESENT | \
- RVI_PAT(attr) | \
- RVI_PROT(prot))
-
-#define RVI_PTE_ASSIGN_TABLE(pfn) \
- (RVI_PADDR(pfn_to_pa(pfn)) | \
- RVI_USER | RVI_ACCESSED | RVI_PRESENT | \
- RVI_PAT(MTRR_TYPE_WB) | \
- RVI_PROT(PROT_READ | PROT_WRITE | PROT_EXEC))
+static inline uint64_t
+rvi_prot(uint_t prot)
+{
+ uint64_t bits;
+
+ bits = 0;
+ if ((prot & PROT_WRITE) != 0)
+ bits |= PT_WRITABLE;
+ if ((prot & PROT_EXEC) == 0)
+ bits |= PT_NX;
+
+ return (bits);
+}
+static uint_t
+rvi_pte_prot(uint64_t pte)
+{
+ uint_t prot;
+
+ if ((pte & PT_VALID) == 0)
+ return (0);
+
+ prot = PROT_READ;
+ if ((pte & PT_NX) == 0)
+ prot |= PROT_EXEC;
+ if ((pte & PT_WRITABLE) != 0)
+ prot |= PROT_WRITE;
+
+ return (prot);
+}
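A worked round trip through the two translations above (values chosen for illustration):

/*
 * Illustrative round trip: for a writable, non-executable mapping,
 *	rvi_prot(PROT_READ | PROT_WRITE) == (PT_WRITABLE | PT_NX)
 * and for a present PTE (PT_VALID set) carrying those bits,
 *	rvi_pte_prot(pte) == (PROT_READ | PROT_WRITE)
 */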
/* Make sure that PAT indexes line up as expected */
CTASSERT((PAT_DEFAULT_ATTRIBUTE & 0xf) == MTRR_TYPE_WB);
CTASSERT(((PAT_DEFAULT_ATTRIBUTE >> 24) & 0xf) == MTRR_TYPE_UC);
static inline uint64_t
-rvi_attr_to_pat(const uint8_t attr)
+rvi_attr_to_pat(uint8_t attr)
{
- if (attr == MTRR_TYPE_UC) {
- /* !PAT + PCD + PWT -> PAT3 -> MTRR_TYPE_UC */
- return (RVI_PCD|RVI_PWT);
- } else if (attr == MTRR_TYPE_WB) {
- /* !PAT + !PCD + !PWT -> PAT0 -> MTRR_TYPE_WB */
+
+ if (attr == MTRR_TYPE_UC)
+ return (PT_NOCACHE | PT_WRITETHRU);
+ if (attr == MTRR_TYPE_WB)
return (0);
- }
panic("unexpected memattr %x", attr);
- return (0);
}
-static gipt_pte_type_t
-rvi_pte_type(uint64_t pte, uint_t level)
+static uint64_t
+rvi_map_table(uint64_t pfn)
{
- if (RVI_IS_ABSENT(pte)) {
- return (PTET_EMPTY);
- } else if (RVI_MAPS_PAGE(pte, level)) {
- return (PTET_PAGE);
- } else {
- return (PTET_LINK);
- }
+ const uint64_t paddr = pfn_to_pa(pfn);
+ const uint64_t flags = PT_USER | PT_REF | PT_VALID;
+ const uint64_t pat = rvi_attr_to_pat(MTRR_TYPE_WB);
+ const uint64_t rprot = PT_WRITABLE;
+ return (paddr | flags | pat | rprot);
}
static uint64_t
-rvi_pte_map(uint64_t pfn)
+rvi_map_page(uint64_t pfn, uint_t prot, uint8_t attr)
+{
+ const uint64_t paddr = pfn_to_pa(pfn);
+ const uint64_t flags = PT_USER | PT_REF | PT_VALID;
+ const uint64_t pat = rvi_attr_to_pat(attr);
+ const uint64_t rprot = rvi_prot(prot);
+ return (paddr | flags | pat | rprot);
+}
+
+static pfn_t
+rvi_pte_pfn(uint64_t pte)
+{
+ return (mmu_btop(pte & PT_PADDR));
+}
+
+static bool
+rvi_pte_is_present(uint64_t pte)
+{
+ return ((pte & PT_VALID) == PT_VALID);
+}
+
+static uint_t
+rvi_reset_bits(volatile uint64_t *entry, uint64_t mask, uint64_t bits)
+{
+ uint64_t pte, newpte, oldpte = 0;
+
+ /*
+ * We use volatile and atomic ops here because we may be
+ * racing against hardware modifying these bits.
+ */
+ VERIFY3P(entry, !=, NULL);
+ oldpte = *entry;
+ do {
+ pte = oldpte;
+ newpte = (pte & ~mask) | bits;
+ oldpte = atomic_cas_64(entry, pte, newpte);
+ } while (oldpte != pte);
+
+ return (oldpte & mask);
+}
+
+static uint_t
+rvi_reset_dirty(uint64_t *entry, bool on)
+{
+ return (rvi_reset_bits(entry, PT_MOD, on ? (PT_MOD | PT_REF) : 0));
+}
+
+static uint_t
+rvi_reset_accessed(uint64_t *entry, bool on)
{
- return (RVI_PTE_ASSIGN_TABLE(pfn));
+ return (rvi_reset_bits(entry, (PT_MOD | PT_REF), on ? PT_REF : 0));
+}
+
+static vmm_pte_ops_t rvi_pte_ops = {
+ .vpeo_map_table = rvi_map_table,
+ .vpeo_map_page = rvi_map_page,
+ .vpeo_pte_pfn = rvi_pte_pfn,
+ .vpeo_pte_is_present = rvi_pte_is_present,
+ .vpeo_pte_prot = rvi_pte_prot,
+ .vpeo_reset_dirty = rvi_reset_dirty,
+ .vpeo_reset_accessed = rvi_reset_accessed,
+};
+
+vmm_gpt_t *
+rvi_create(void)
+{
+ return (vmm_gpt_alloc(&rvi_pte_ops));
}
static void *
-rvi_create(uintptr_t *pml4_kaddr)
+rvi_ops_create(uintptr_t *root_kaddr)
{
- rvi_map_t *rmap;
- gipt_map_t *map;
- gipt_t *root;
- struct gipt_cbs cbs = {
- .giptc_pte_type = rvi_pte_type,
- .giptc_pte_map = rvi_pte_map,
- };
-
- rmap = kmem_zalloc(sizeof (*rmap), KM_SLEEP);
- map = &rmap->rm_gipt;
- root = gipt_alloc();
- root->gipt_level = RVI_MAX_LEVELS - 1;
- gipt_map_init(map, RVI_MAX_LEVELS, GIPT_HASH_SIZE_DEFAULT, &cbs, root);
-
- *pml4_kaddr = (uintptr_t)root->gipt_kva;
- return (rmap);
+ rvi_map_t *map;
+
+ map = kmem_zalloc(sizeof (*map), KM_SLEEP);
+ mutex_init(&map->rm_lock, NULL, MUTEX_DEFAULT, NULL);
+ map->rm_gpt = rvi_create();
+ *root_kaddr = (uintptr_t)vmm_gpt_root_kaddr(map->rm_gpt);
+
+ return (map);
}
static void
-rvi_destroy(void *arg)
+rvi_ops_destroy(void *arg)
{
- rvi_map_t *rmap = arg;
-
- if (rmap != NULL) {
- gipt_map_t *map = &rmap->rm_gipt;
+ rvi_map_t *map = arg;
- gipt_map_fini(map);
- kmem_free(rmap, sizeof (*rmap));
+ if (map != NULL) {
+ vmm_gpt_free(map->rm_gpt);
+ mutex_destroy(&map->rm_lock);
+ kmem_free(map, sizeof (*map));
}
}
static uint64_t
-rvi_wired_count(void *arg)
+rvi_ops_wired_count(void *arg)
{
- rvi_map_t *rmap = arg;
+ rvi_map_t *map = arg;
uint64_t res;
- mutex_enter(RVI_LOCK(rmap));
- res = rmap->rm_wired_page_count;
- mutex_exit(RVI_LOCK(rmap));
+ mutex_enter(&map->rm_lock);
+ res = vmm_gpt_mapped_count(map->rm_gpt);
+ mutex_exit(&map->rm_lock);
return (res);
}
static int
-rvi_is_wired(void *arg, uint64_t va, uint_t *protp)
+rvi_ops_is_wired(void *arg, uint64_t gpa, uint_t *protp)
{
- rvi_map_t *rmap = arg;
- gipt_t *pt;
- int rv = -1;
-
- mutex_enter(RVI_LOCK(rmap));
- pt = gipt_map_lookup_deepest(&rmap->rm_gipt, va);
- if (pt != NULL) {
- const uint64_t pte = GIPT_VA2PTE(pt, va);
-
- if (RVI_MAPS_PAGE(pte, pt->gipt_level)) {
- *protp = RVI_PTE_PROT(pte);
- rv = 0;
- }
- }
- mutex_exit(RVI_LOCK(rmap));
+ rvi_map_t *map = arg;
+ bool mapped;
+
+ mutex_enter(&map->rm_lock);
+ mapped = vmm_gpt_is_mapped(map->rm_gpt, gpa, protp);
+ mutex_exit(&map->rm_lock);
- return (rv);
+ return (mapped ? 0 : -1);
}
static int
-rvi_map(void *arg, uint64_t va, pfn_t pfn, uint_t lvl, uint_t prot,
+rvi_ops_map(void *arg, uint64_t gpa, pfn_t pfn, uint_t _lvl, uint_t prot,
uint8_t attr)
{
- rvi_map_t *rmap = arg;
- gipt_map_t *map = &rmap->rm_gipt;
- gipt_t *pt;
- uint64_t *ptep, pte;
+ rvi_map_t *map = arg;
ASSERT((prot & PROT_READ) != 0);
ASSERT3U((prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)), ==, 0);
- ASSERT3U(lvl, <, RVI_MAX_LEVELS);
-
- mutex_enter(RVI_LOCK(rmap));
- pt = gipt_map_lookup(map, va, lvl);
- if (pt == NULL) {
- /*
- * A table at the appropriate VA/level that would house this
- * mapping does not currently exist. Try to walk down to that
- * point, creating any necessary parent(s).
- */
- pt = gipt_map_create_parents(map, va, lvl);
-
- /*
- * There was a large page mapping in the way of creating the
- * necessary parent table(s).
- */
- if (pt == NULL) {
- panic("unexpected large page @ %08lx", va);
- }
- }
- ptep = GIPT_VA2PTEP(pt, va);
-
- pte = *ptep;
- if (!RVI_IS_ABSENT(pte)) {
- if (!RVI_MAPS_PAGE(pte, lvl)) {
- panic("unexpected PT link @ %08lx in %p", va, pt);
- } else {
- panic("unexpected page mapped @ %08lx in %p", va, pt);
- }
- }
- pte = RVI_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr);
- *ptep = pte;
- pt->gipt_valid_cnt++;
- rmap->rm_wired_page_count += gipt_level_count[lvl];
+ mutex_enter(&map->rm_lock);
+ vmm_gpt_populate_entry(map->rm_gpt, gpa);
+ (void) vmm_gpt_map(map->rm_gpt, gpa, pfn, prot, attr);
+ mutex_exit(&map->rm_lock);
- mutex_exit(RVI_LOCK(rmap));
return (0);
}
static uint64_t
-rvi_unmap(void *arg, uint64_t va, uint64_t end_va)
+rvi_ops_unmap(void *arg, uint64_t start, uint64_t end)
{
- rvi_map_t *rmap = arg;
- gipt_map_t *map = &rmap->rm_gipt;
- gipt_t *pt;
- uint64_t cur_va = va;
- uint64_t unmapped = 0;
-
- mutex_enter(RVI_LOCK(rmap));
-
- pt = gipt_map_lookup_deepest(map, cur_va);
- if (pt == NULL) {
- mutex_exit(RVI_LOCK(rmap));
- return (0);
- }
- if (!RVI_MAPS_PAGE(GIPT_VA2PTE(pt, cur_va), pt->gipt_level)) {
- cur_va = gipt_map_next_page(map, cur_va, end_va, &pt);
- if (cur_va == 0) {
- mutex_exit(RVI_LOCK(rmap));
- return (0);
- }
- }
-
- while (cur_va < end_va) {
- uint64_t *ptep = GIPT_VA2PTEP(pt, cur_va);
- const uint_t lvl = pt->gipt_level;
-
- ASSERT(RVI_MAPS_PAGE(*ptep, lvl));
- *ptep = 0;
- pt->gipt_valid_cnt--;
- unmapped += gipt_level_count[pt->gipt_level];
-
- gipt_t *next_pt = pt;
- uint64_t next_va;
- next_va = gipt_map_next_page(map, cur_va, end_va, &next_pt);
-
- if (pt->gipt_valid_cnt == 0) {
- gipt_map_clean_parents(map, pt);
- }
- if (next_va == 0) {
- break;
- }
- pt = next_pt;
- cur_va = next_va;
- }
- rmap->rm_wired_page_count -= unmapped;
+ rvi_map_t *map = arg;
+ size_t unmapped = 0;
- mutex_exit(RVI_LOCK(rmap));
+ mutex_enter(&map->rm_lock);
+ unmapped = vmm_gpt_unmap_region(map->rm_gpt, start, end);
+ vmm_gpt_vacate_region(map->rm_gpt, start, end);
+ mutex_exit(&map->rm_lock);
- return (unmapped);
+ return ((uint64_t)unmapped);
}
struct vmm_pt_ops rvi_ops = {
- .vpo_init = rvi_create,
- .vpo_free = rvi_destroy,
- .vpo_wired_cnt = rvi_wired_count,
- .vpo_is_wired = rvi_is_wired,
- .vpo_map = rvi_map,
- .vpo_unmap = rvi_unmap,
+ .vpo_init = rvi_ops_create,
+ .vpo_free = rvi_ops_destroy,
+ .vpo_wired_cnt = rvi_ops_wired_count,
+ .vpo_is_wired = rvi_ops_is_wired,
+ .vpo_map = rvi_ops_map,
+ .vpo_unmap = rvi_ops_unmap,
};
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c
index 3a29a3e7b3..720af54200 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c
@@ -35,6 +35,7 @@
#include <vm/seg_vmm.h>
#include <machine/vm.h>
+#include <sys/vmm_gpt.h>
#include <sys/vmm_vm.h>
#define PMAP_TO_VMMAP(pm) ((vm_map_t) \
diff --git a/usr/src/uts/i86pc/os/gipt.c b/usr/src/uts/i86pc/os/gipt.c
deleted file mode 100644
index 7bff5c3897..0000000000
--- a/usr/src/uts/i86pc/os/gipt.c
+++ /dev/null
@@ -1,568 +0,0 @@
-/*
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- */
-
-/*
- * Copyright 2019 Joyent, Inc.
- */
-
-#include <sys/gipt.h>
-#include <sys/malloc.h>
-#include <sys/kmem.h>
-#include <sys/sysmacros.h>
-#include <sys/sunddi.h>
-#include <sys/panic.h>
-#include <vm/hat.h>
-#include <vm/as.h>
-
-/*
- * Generic Indexed Page Table
- *
- * There are several applications, such as hardware virtualization or IOMMU
- * control, which require construction of a page table tree to represent a
- * virtual address space. Many features of the existing htable system would be
- * convenient for this, but its tight coupling to the VM system make it
- * undesirable for independent consumers. The GIPT interface exists to provide
- * page table allocation and indexing on top of which a table hierarchy
- * (EPT, VT-d, etc) can be built by upstack logic.
- *
- * Types:
- *
- * gipt_t - Represents a single page table with a physical backing page and
- * associated metadata.
- * gipt_map_t - The workhorse of this facility, it contains an hash table to
- * index all of the gipt_t entries which make up the page table tree.
- * struct gipt_cbs - Callbacks used by the gipt_map_t:
- * gipt_pte_type_cb_t - Given a PTE, emit the type (empty/page/table)
- * gipt_pte_map_cb_t - Given a PFN, emit a (child) table mapping
- */
-
-/*
- * For now, the level shifts are hard-coded to match with standard 4-level
- * 64-bit paging structures.
- */
-
-#define GIPT_HASH(map, va, lvl) \
- ((((va) >> 12) + ((va) >> 28) + (lvl)) & ((map)->giptm_table_cnt - 1))
-
-const uint_t gipt_level_shift[GIPT_MAX_LEVELS+1] = {
- 12, /* 4K */
- 21, /* 2M */
- 30, /* 1G */
- 39, /* 512G */
- 48 /* MAX */
-};
-const uint64_t gipt_level_mask[GIPT_MAX_LEVELS+1] = {
- 0xfffffffffffff000ull, /* 4K */
- 0xffffffffffe00000ull, /* 2M */
- 0xffffffffc0000000ull, /* 1G */
- 0xffffff8000000000ull, /* 512G */
- 0xffff000000000000ull /* MAX */
-};
-const uint64_t gipt_level_size[GIPT_MAX_LEVELS+1] = {
- 0x0000000000001000ull, /* 4K */
- 0x0000000000200000ull, /* 2M */
- 0x0000000040000000ull, /* 1G */
- 0x0000008000000000ull, /* 512G */
- 0x0001000000000000ull /* MAX */
-};
-const uint64_t gipt_level_count[GIPT_MAX_LEVELS+1] = {
- 0x0000000000000001ull, /* 4K */
- 0x0000000000000200ull, /* 2M */
- 0x0000000000040000ull, /* 1G */
- 0x0000000008000000ull, /* 512G */
- 0x0000001000000000ull /* MAX */
-};
-
-/*
- * Allocate a gipt_t structure with corresponding page of memory to hold the
- * PTEs which it contains.
- */
-gipt_t *
-gipt_alloc(void)
-{
- gipt_t *pt;
- void *page;
-
- pt = kmem_zalloc(sizeof (*pt), KM_SLEEP);
- page = kmem_zalloc(PAGESIZE, KM_SLEEP);
- pt->gipt_kva = page;
- pt->gipt_pfn = hat_getpfnum(kas.a_hat, page);
-
- return (pt);
-}
-
-/*
- * Free a gipt_t structure along with its page of PTE storage.
- */
-void
-gipt_free(gipt_t *pt)
-{
- void *page = pt->gipt_kva;
-
- ASSERT(pt->gipt_pfn != PFN_INVALID);
- ASSERT(pt->gipt_kva != NULL);
-
- pt->gipt_pfn = PFN_INVALID;
- pt->gipt_kva = NULL;
-
- kmem_free(page, PAGESIZE);
- kmem_free(pt, sizeof (*pt));
-}
-
-/*
- * Initialize a gipt_map_t with a max level (must be >= 1) and allocating its
- * hash table based on a provided size (must be a power of 2).
- */
-void
-gipt_map_init(gipt_map_t *map, uint_t levels, uint_t hash_table_size,
- const struct gipt_cbs *cbs, gipt_t *root)
-{
- VERIFY(map->giptm_root == NULL);
- VERIFY(map->giptm_hash == NULL);
- VERIFY3U(levels, >, 0);
- VERIFY3U(levels, <=, GIPT_MAX_LEVELS);
- VERIFY(ISP2(hash_table_size));
- VERIFY(root != NULL);
-
- mutex_init(&map->giptm_lock, NULL, MUTEX_DEFAULT, NULL);
- map->giptm_table_cnt = hash_table_size;
- bcopy(cbs, &map->giptm_cbs, sizeof (*cbs));
- map->giptm_hash = kmem_alloc(sizeof (list_t) * map->giptm_table_cnt,
- KM_SLEEP);
- for (uint_t i = 0; i < hash_table_size; i++) {
- list_create(&map->giptm_hash[i], sizeof (gipt_t),
- offsetof(gipt_t, gipt_node));
- }
- map->giptm_levels = levels;
-
- /*
- * Insert the table root into the hash. It will be held in existence
- * with an extra "valid" reference. This will prevent its clean-up
- * during gipt_map_clean_parents() calls, even if it has no children.
- */
- mutex_enter(&map->giptm_lock);
- gipt_map_insert(map, root);
- map->giptm_root = root;
- root->gipt_valid_cnt++;
- mutex_exit(&map->giptm_lock);
-}
-
-/*
- * Clean up a gipt_map_t by removing any lingering gipt_t entries referenced by
- * it, and freeing its hash table.
- */
-void
-gipt_map_fini(gipt_map_t *map)
-{
- const uint_t cnt = map->giptm_table_cnt;
- const size_t sz = sizeof (list_t) * cnt;
-
- mutex_enter(&map->giptm_lock);
- /* Clean up any lingering tables */
- for (uint_t i = 0; i < cnt; i++) {
- list_t *list = &map->giptm_hash[i];
- gipt_t *pt;
-
- while ((pt = list_remove_head(list)) != NULL) {
- gipt_free(pt);
- }
- ASSERT(list_is_empty(list));
- }
-
- kmem_free(map->giptm_hash, sz);
- map->giptm_hash = NULL;
- map->giptm_root = NULL;
- map->giptm_levels = 0;
- mutex_exit(&map->giptm_lock);
- mutex_destroy(&map->giptm_lock);
-}
-
-/*
- * Look in the map for a gipt_t containing a given VA which is located at a
- * specified level.
- */
-gipt_t *
-gipt_map_lookup(gipt_map_t *map, uint64_t va, uint_t lvl)
-{
- gipt_t *pt;
-
- ASSERT(MUTEX_HELD(&map->giptm_lock));
- ASSERT3U(lvl, <=, GIPT_MAX_LEVELS);
-
- /*
- * Lookup gipt_t at the VA aligned to the next level up. For example,
- * level 0 corresponds to a page table containing 512 PTEs which cover
- * 4k each, spanning a total 2MB. As such, the base VA of that table
- * must be aligned to the same 2MB.
- */
- const uint64_t masked_va = va & gipt_level_mask[lvl + 1];
- const uint_t hash = GIPT_HASH(map, masked_va, lvl);
-
- /* Only the root is expected to be at the top level. */
- if (lvl == (map->giptm_levels - 1) && map->giptm_root != NULL) {
- pt = map->giptm_root;
-
- ASSERT3U(pt->gipt_level, ==, lvl);
-
- /*
- * It may be so that the VA in question is not covered by the
- * range of the table root.
- */
- if (pt->gipt_vaddr != masked_va) {
- return (NULL);
- }
-
- return (pt);
- }
-
- list_t *list = &map->giptm_hash[hash];
- for (pt = list_head(list); pt != NULL; pt = list_next(list, pt)) {
- if (pt->gipt_vaddr == masked_va && pt->gipt_level == lvl)
- break;
- }
- return (pt);
-}
-
-/*
- * Look in the map for the deepest (lowest level) gipt_t which contains a given
- * VA. This could still fail if the VA is outside the range of the table root.
- */
-gipt_t *
-gipt_map_lookup_deepest(gipt_map_t *map, uint64_t va)
-{
- gipt_t *pt = NULL;
- uint_t lvl;
-
- ASSERT(MUTEX_HELD(&map->giptm_lock));
-
- for (lvl = 0; lvl < map->giptm_levels; lvl++) {
- pt = gipt_map_lookup(map, va, lvl);
- if (pt != NULL) {
- break;
- }
- }
- return (pt);
-}
-
-/*
- * Given a VA inside a gipt_t, calculate (based on the level of that PT) the VA
- * corresponding to the next entry in the table. It returns 0 if that VA would
- * fall beyond the bounds of the table.
- */
-static __inline__ uint64_t
-gipt_next_va(gipt_t *pt, uint64_t va)
-{
- const uint_t lvl = pt->gipt_level;
- const uint64_t masked = va & gipt_level_mask[lvl];
- const uint64_t max = pt->gipt_vaddr + gipt_level_size[lvl+1];
- const uint64_t next = masked + gipt_level_size[lvl];
-
- ASSERT3U(masked, >=, pt->gipt_vaddr);
- ASSERT3U(masked, <, max);
-
- /*
- * If the "next" VA would be outside this table, including cases where
- * it overflowed, indicate an error result.
- */
- if (next >= max || next <= masked) {
- return (0);
- }
- return (next);
-}
-
-/*
- * For a given VA, find the next VA which corresponds to a valid page mapping.
- * The gipt_t containing that VA will be indicated via 'ptp'. (The gipt_t of
- * the starting VA can be passed in via 'ptp' for a minor optimization). If
- * there is no valid mapping higher than 'va' but contained within 'max_va',
- * then this will indicate failure with 0 returned.
- */
-uint64_t
-gipt_map_next_page(gipt_map_t *map, uint64_t va, uint64_t max_va, gipt_t **ptp)
-{
- gipt_t *pt = *ptp;
- uint64_t cur_va = va;
- gipt_pte_type_cb_t pte_type = map->giptm_cbs.giptc_pte_type;
-
- ASSERT(MUTEX_HELD(&map->giptm_lock));
- ASSERT3U(max_va, !=, 0);
- ASSERT3U(ptp, !=, NULL);
-
- /*
- * If a starting table is not provided, search the map for the deepest
- * table which contains the VA. If for some reason that VA is beyond
- * coverage of the map root, indicate failure.
- */
- if (pt == NULL) {
- pt = gipt_map_lookup_deepest(map, cur_va);
- if (pt == NULL) {
- goto fail;
- }
- }
-
- /*
- * From the starting table (at whatever level that may reside), walk
- * forward through the PTEs looking for a valid page mapping.
- */
- while (cur_va < max_va) {
- const uint64_t next_va = gipt_next_va(pt, cur_va);
- if (next_va == 0) {
- /*
- * The end of this table has been reached. Ascend one
- * level to continue the walk if possible. If already
- * at the root, the end of the table means failure.
- */
- if (pt->gipt_level >= map->giptm_levels) {
- goto fail;
- }
- pt = gipt_map_lookup(map, cur_va, pt->gipt_level + 1);
- if (pt == NULL) {
- goto fail;
- }
- continue;
- } else if (next_va >= max_va) {
- /*
- * Terminate the walk with a failure if the VA
- * corresponding to the next PTE is beyond the max.
- */
- goto fail;
- }
- cur_va = next_va;
-
- const uint64_t pte = GIPT_VA2PTE(pt, cur_va);
- const gipt_pte_type_t ptet = pte_type(pte, pt->gipt_level);
- if (ptet == PTET_EMPTY) {
- continue;
- } else if (ptet == PTET_PAGE) {
- /* A valid page mapping: success. */
- *ptp = pt;
- return (cur_va);
- } else if (ptet == PTET_LINK) {
- /*
- * A child page table is present at this PTE. Look it
- * up from the map.
- */
- ASSERT3U(pt->gipt_level, >, 0);
- pt = gipt_map_lookup(map, cur_va, pt->gipt_level - 1);
- ASSERT3P(pt, !=, NULL);
- break;
- } else {
- panic("unexpected PTE type %x @ va %p", ptet,
- (void *)cur_va);
- }
- }
-
- /*
- * By this point, the above loop has located a table structure to
- * descend into in order to find the next page.
- */
- while (cur_va < max_va) {
- const uint64_t pte = GIPT_VA2PTE(pt, cur_va);
- const gipt_pte_type_t ptet = pte_type(pte, pt->gipt_level);
-
- if (ptet == PTET_EMPTY) {
- const uint64_t next_va = gipt_next_va(pt, cur_va);
- if (next_va == 0 || next_va >= max_va) {
- goto fail;
- }
- cur_va = next_va;
- continue;
- } else if (ptet == PTET_PAGE) {
- /* A valid page mapping: success. */
- *ptp = pt;
- return (cur_va);
- } else if (ptet == PTET_LINK) {
- /*
- * A child page table is present at this PTE. Look it
- * up from the map.
- */
- ASSERT3U(pt->gipt_level, >, 0);
- pt = gipt_map_lookup(map, cur_va, pt->gipt_level - 1);
- ASSERT3P(pt, !=, NULL);
- } else {
- panic("unexpected PTE type %x @ va %p", ptet,
- (void *)cur_va);
- }
- }
-
-fail:
- *ptp = NULL;
- return (0);
-}
-
-/*
- * Insert a gipt_t into the map based on its VA and level. It is up to the
- * caller to ensure that a duplicate entry does not already exist in the map.
- */
-void
-gipt_map_insert(gipt_map_t *map, gipt_t *pt)
-{
- const uint_t hash = GIPT_HASH(map, pt->gipt_vaddr, pt->gipt_level);
-
- ASSERT(MUTEX_HELD(&map->giptm_lock));
- ASSERT(gipt_map_lookup(map, pt->gipt_vaddr, pt->gipt_level) == NULL);
- VERIFY3U(pt->gipt_level, <, map->giptm_levels);
-
- list_insert_head(&map->giptm_hash[hash], pt);
-}
-
-/*
- * Remove a gipt_t from the map.
- */
-void
-gipt_map_remove(gipt_map_t *map, gipt_t *pt)
-{
- const uint_t hash = GIPT_HASH(map, pt->gipt_vaddr, pt->gipt_level);
-
- ASSERT(MUTEX_HELD(&map->giptm_lock));
-
- list_remove(&map->giptm_hash[hash], pt);
-}
-
-/*
- * Given a VA, create any missing gipt_t entries from the specified level all
- * the way up to (but not including) the root. This is done from lowest level
- * to highest, and stops when an existing table covering that VA is found.
- * References to any created gipt_t tables, plus the final "found" gipt_t are
- * stored in 'pts'. The number of gipt_t pointers stored to 'pts' serves as
- * the return value (1 <= val <= root level). It is up to the caller to
- * populate linking PTEs to the newly created empty tables.
- */
-static uint_t
-gipt_map_ensure_chain(gipt_map_t *map, uint64_t va, uint_t lvl, gipt_t **pts)
-{
- const uint_t root_lvl = map->giptm_root->gipt_level;
- uint_t clvl = lvl, count = 0;
- gipt_t *child_pt = NULL;
-
- ASSERT(MUTEX_HELD(&map->giptm_lock));
- ASSERT3U(lvl, <, root_lvl);
- ASSERT3P(map->giptm_root, !=, NULL);
-
- do {
- const uint64_t pva = (va & gipt_level_mask[clvl + 1]);
- gipt_t *pt;
-
- pt = gipt_map_lookup(map, pva, clvl);
- if (pt != NULL) {
- ASSERT3U(pva, ==, pt->gipt_vaddr);
-
- if (child_pt != NULL) {
- child_pt->gipt_parent = pt;
- }
- pts[count++] = pt;
- return (count);
- }
-
- pt = gipt_alloc();
- pt->gipt_vaddr = pva;
- pt->gipt_level = clvl;
- if (child_pt != NULL) {
- child_pt->gipt_parent = pt;
- }
-
- gipt_map_insert(map, pt);
- child_pt = pt;
- pts[count++] = pt;
- clvl++;
- } while (clvl <= root_lvl);
-
- return (count);
-}
-
-/*
- * Ensure that a page table covering a VA at a specified level exists. This
- * will create any necessary tables chaining up to the root as well.
- */
-gipt_t *
-gipt_map_create_parents(gipt_map_t *map, uint64_t va, uint_t lvl)
-{
- gipt_t *pt, *pts[GIPT_MAX_LEVELS] = { 0 };
- gipt_pte_type_cb_t pte_type = map->giptm_cbs.giptc_pte_type;
- gipt_pte_map_cb_t pte_map = map->giptm_cbs.giptc_pte_map;
- uint64_t *ptep;
- uint_t i, count;
-
- ASSERT(MUTEX_HELD(&map->giptm_lock));
-
- count = gipt_map_ensure_chain(map, va, lvl, pts);
- if (count == 1) {
- /* Table already exists in the hierarchy */
- return (pts[0]);
- }
- ASSERT3U(count, >, 1);
-
- /* Make sure there is not already a large page mapping at the top */
- pt = pts[count - 1];
- if (pte_type(GIPT_VA2PTE(pt, va), pt->gipt_level) == PTET_PAGE) {
- const uint_t end = count - 1;
-
- /*
- * Nuke those gipt_t entries which were optimistically created
- * for what was found to be a conflicted mapping.
- */
- for (i = 0; i < end; i++) {
- gipt_map_remove(map, pts[i]);
- gipt_free(pts[i]);
- }
- return (NULL);
- }
-
- /* Initialize the appropriate tables from bottom to top */
- for (i = 1; i < count; i++) {
- pt = pts[i];
- ptep = GIPT_VA2PTEP(pt, va);
-
- /*
- * Since gipt_map_ensure_chain() creates missing tables until
- * it find a valid one, and that existing table has been
- * checked for the existence of a large page, nothing should
- * occupy this PTE.
- */
- ASSERT3U(pte_type(*ptep, pt->gipt_level), ==, PTET_EMPTY);
-
- *ptep = pte_map(pts[i - 1]->gipt_pfn);
- pt->gipt_valid_cnt++;
- }
-
- return (pts[0]);
-}
-
-/*
- * If a page table is empty, free it from the map, as well as any parent tables
- * that would subsequently become empty as part of the clean-up. As noted in
- * gipt_map_init(), the table root is a special case and will remain in the
- * map, even when empty.
- */
-void
-gipt_map_clean_parents(gipt_map_t *map, gipt_t *pt)
-{
- ASSERT(MUTEX_HELD(&map->giptm_lock));
-
- while (pt->gipt_valid_cnt == 0) {
- gipt_t *parent = pt->gipt_parent;
- uint64_t *ptep = GIPT_VA2PTEP(parent, pt->gipt_vaddr);
-
- ASSERT3S(map->giptm_cbs.giptc_pte_type(*ptep,
- parent->gipt_level), ==, PTET_LINK);
-
- /*
- * For now, it is assumed that all gipt consumers consider PTE
- * zeroing as an adequate action for table unmap.
- */
- *ptep = 0;
-
- parent->gipt_valid_cnt--;
- gipt_map_remove(map, pt);
- gipt_free(pt);
- pt = parent;
- }
-}
diff --git a/usr/src/uts/i86pc/sys/gipt.h b/usr/src/uts/i86pc/sys/gipt.h
deleted file mode 100644
index 4d7d523726..0000000000
--- a/usr/src/uts/i86pc/sys/gipt.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- */
-
-/*
- * Copyright 2019 Joyent, Inc.
- */
-
-#ifndef _GIPT_H_
-#define _GIPT_H_
-
-#include <sys/types.h>
-#include <sys/mutex.h>
-#include <sys/param.h>
-#include <sys/list.h>
-
-struct gipt {
- list_node_t gipt_node;
- uint64_t gipt_vaddr;
- uint64_t gipt_pfn;
- uint16_t gipt_level;
- uint16_t gipt_valid_cnt;
- uint32_t _gipt_pad;
- struct gipt *gipt_parent;
- uint64_t *gipt_kva;
- uint64_t _gipt_pad2;
-};
-typedef struct gipt gipt_t;
-
-typedef enum {
- PTET_EMPTY = 0,
- PTET_PAGE = 1,
- PTET_LINK = 2,
-} gipt_pte_type_t;
-
-/* Given a PTE and its level, determine the type of that PTE */
-typedef gipt_pte_type_t (*gipt_pte_type_cb_t)(uint64_t, uint_t);
-/* Given the PFN of a child table, emit a PTE that references it */
-typedef uint64_t (*gipt_pte_map_cb_t)(uint64_t);
-
-struct gipt_cbs {
- gipt_pte_type_cb_t giptc_pte_type;
- gipt_pte_map_cb_t giptc_pte_map;
-};
-
-struct gipt_map {
- kmutex_t giptm_lock;
- gipt_t *giptm_root;
- list_t *giptm_hash;
- struct gipt_cbs giptm_cbs;
- size_t giptm_table_cnt;
- uint_t giptm_levels;
-};
-typedef struct gipt_map gipt_map_t;
-
-#define GIPT_HASH_SIZE_DEFAULT 0x2000
-#define GIPT_MAX_LEVELS 4
-
-#define GIPT_VA2IDX(pt, va) \
- (((va) - (pt)->gipt_vaddr) >> \
- gipt_level_shift[(pt)->gipt_level])
-
-#define GIPT_VA2PTE(pt, va) ((pt)->gipt_kva[GIPT_VA2IDX(pt, va)])
-#define GIPT_VA2PTEP(pt, va) (&(pt)->gipt_kva[GIPT_VA2IDX(pt, va)])
-
-extern const uint_t gipt_level_shift[GIPT_MAX_LEVELS+1];
-extern const uint64_t gipt_level_mask[GIPT_MAX_LEVELS+1];
-extern const uint64_t gipt_level_size[GIPT_MAX_LEVELS+1];
-extern const uint64_t gipt_level_count[GIPT_MAX_LEVELS+1];
-
-extern gipt_t *gipt_alloc(void);
-extern void gipt_free(gipt_t *);
-extern void gipt_map_init(gipt_map_t *, uint_t, uint_t,
- const struct gipt_cbs *, gipt_t *);
-extern void gipt_map_fini(gipt_map_t *);
-extern gipt_t *gipt_map_lookup(gipt_map_t *, uint64_t, uint_t);
-extern gipt_t *gipt_map_lookup_deepest(gipt_map_t *, uint64_t);
-extern uint64_t gipt_map_next_page(gipt_map_t *, uint64_t, uint64_t,
- gipt_t **);
-extern void gipt_map_insert(gipt_map_t *, gipt_t *);
-extern void gipt_map_remove(gipt_map_t *, gipt_t *);
-extern gipt_t *gipt_map_create_parents(gipt_map_t *, uint64_t, uint_t);
-extern void gipt_map_clean_parents(gipt_map_t *, gipt_t *);
-
-#endif /* _GIPT_H_ */