summaryrefslogtreecommitdiff
path: root/usr/src
diff options
context:
space:
mode:
authorHans Rosenfeld <hans.rosenfeld@joyent.com>2018-12-20 10:53:02 +0000
committerHans Rosenfeld <hans.rosenfeld@joyent.com>2018-12-20 11:10:07 +0000
commit7acaa8e37be75212e7823e35fc1b9a4cc61f601b (patch)
treeee281dc72c2131ac873a60743419d748e995b9e4 /usr/src
parent2319d4707150167d5bc9a32748d310d0c5e7a022 (diff)
downloadillumos-joyent-7acaa8e37be75212e7823e35fc1b9a4cc61f601b.tar.gz
OS-7408 want native libvmm
Reviewed by: John Levon <john.levon@joyent.com> Reviewed by: Patrick Mooney <patrick.mooney@joyent.com> Approved by: John Levon <john.levon@joyent.com>
Diffstat (limited to 'usr/src')
-rw-r--r--usr/src/lib/Makefile4
-rw-r--r--usr/src/lib/libvmm/Makefile57
-rw-r--r--usr/src/lib/libvmm/libvmm.c841
-rw-r--r--usr/src/lib/libvmm/libvmm.h122
-rw-r--r--usr/src/lib/libvmm/mapfile-vers60
5 files changed, 1083 insertions, 1 deletions
diff --git a/usr/src/lib/Makefile b/usr/src/lib/Makefile
index dfff137c67..e04e4d82cd 100644
--- a/usr/src/lib/Makefile
+++ b/usr/src/lib/Makefile
@@ -281,6 +281,7 @@ SUBDIRS += \
i386_SUBDIRS= \
libfdisk \
libsaveargs \
+ libvmm \
libvmmapi
sparc_SUBDIRS= \
@@ -512,6 +513,7 @@ HDRSUBDIRS= \
i386_HDRSUBDIRS= \
libfdisk \
libsaveargs \
+ libvmm \
libvmmapi
sparc_HDRSUBDIRS= \
@@ -617,7 +619,7 @@ libdiskmgt: libdevid libdevinfo libadm libefi libkstat libsysevent
$(INTEL_BLD)libdiskmgt: libfdisk
libdladm: libdevinfo libinetutil libscf librcm libexacct libkstat \
libpool varpd
-libdll: libast
+libdll: libast
libdlpi: libinetutil libdladm
libds: libsysevent
libdtrace: libproc libgen libctf libmapmalloc
diff --git a/usr/src/lib/libvmm/Makefile b/usr/src/lib/libvmm/Makefile
new file mode 100644
index 0000000000..2a0de6c5dc
--- /dev/null
+++ b/usr/src/lib/libvmm/Makefile
@@ -0,0 +1,57 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2018 Joyent, Inc.
+#
+
LIBRARY = libvmm.a
VERS = .1

# list.o is built from the shared kernel/userland list implementation
# in $(SRC)/common/list (see the pics/%.o rule below).
OBJECTS = libvmm.o list.o

include ../Makefile.lib
include ../Makefile.lib.64
include ../Makefile.rootfs

SRCDIR = .
COMDIR = $(SRC)/common/list

# 64-bit dynamic library only; no static archive is delivered.
LIBS = $(DYNLIB)

# The FreeBSD compat and contrib headers need to be first in the search
# path, hence we can't just append them to CPPFLAGS. So we assign CPPFLAGS
# directly and pull in CPPFLAGS.master at the appropriate place.
CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \
	-I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 \
	$(CPPFLAGS.master) -I$(SRC)/uts/i86pc

LDLIBS += -lc -lvmmapi

HDRS = libvmm.h
HDRDIR = .
CHECKHDRS = $(HDRS:%.h=%.check)

.KEEP_STATE:

all: $(LIBS)

install_h: $(ROOTHDRS)

check: $(CHECKHDRS)

# Pattern rule so that list.c is picked up from the common directory.
pics/%.o: $(COMDIR)/%.c
	$(COMPILE.c) -o $@ $<
	$(POST_PROCESS_O)

install: install_h all $(ROOTLIBS64)

include ../Makefile.targ
diff --git a/usr/src/lib/libvmm/libvmm.c b/usr/src/lib/libvmm/libvmm.c
new file mode 100644
index 0000000000..373dfe2844
--- /dev/null
+++ b/usr/src/lib/libvmm/libvmm.c
@@ -0,0 +1,841 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+/*
+ * Library for native code to access bhyve VMs, without the need to use
+ * FreeBSD compat headers
+ */
+
+#include <sys/param.h>
+#include <sys/list.h>
+#include <sys/stddef.h>
+#include <sys/mman.h>
+#include <sys/kdi_regs.h>
+#include <sys/sysmacros.h>
+#include <sys/controlregs.h>
+#include <sys/note.h>
+#include <sys/debug.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <assert.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include <libvmm.h>
+
typedef struct vmm_memseg vmm_memseg_t;

/* vms_flags: segment is device memory (has a name, e.g. framebuffer) */
#define	VMM_MEMSEG_DEVMEM	0x1

/*
 * One guest memory segment, as discovered via vm_mmap_getnext() and
 * vm_get_memseg().  Kept on the vmm_t's vmm_memlist.
 */
struct vmm_memseg {
	list_node_t vms_list;	/* linkage for vmm_t->vmm_memlist */
	int vms_segid;		/* kernel segment id */
	int vms_prot;		/* protection as reported by the kernel */
	int vms_flags;		/* VMM_MEMSEG_DEVMEM or 0 */
	uintptr_t vms_gpa;	/* guest-physical start address */
	off_t vms_segoff;	/* offset into the backing segment */
	size_t vms_seglen;	/* total length of the backing segment */
	size_t vms_maplen;	/* length of this guest mapping */
	char vms_name[64];	/* segment name; empty for ordinary RAM */
};

/*
 * Library handle for one bhyve VM (opaque to consumers, see libvmm.h).
 */
struct vmm {
	struct vmctx *vmm_ctx;	/* vmmapi control handle */
	list_t vmm_memlist;	/* list of vmm_memseg_t, sorted by GPA */
	char *vmm_mem;		/* local mapping of guest memory, or MAP_FAILED */
	size_t vmm_memsize;	/* sum of mapped segment lengths */
	size_t vmm_ncpu;	/* number of consecutive active vCPUs */
};
+
+
+/*
+ * This code relies on two assumptions:
+ * - CPUs are never removed from the "active set", not even when suspended.
+ * A CPU being active just means that it has been used by the guest OS.
+ * - The CPU numbering is consecutive.
+ */
+static void
+vmm_update_ncpu(vmm_t *vmm)
+{
+ cpuset_t cpuset;
+
+ assert(vm_active_cpus(vmm->vmm_ctx, &cpuset) == 0);
+
+ for (vmm->vmm_ncpu = 0;
+ CPU_ISSET(vmm->vmm_ncpu, &cpuset) == 1;
+ vmm->vmm_ncpu++)
+ ;
+}
+
+vmm_t *
+vmm_open_vm(const char *name)
+{
+ vmm_t *vmm = NULL;
+
+ vmm = malloc(sizeof (vmm_t));
+ if (vmm == NULL)
+ return (NULL);
+
+ bzero(vmm, sizeof (vmm_t));
+ vmm->vmm_mem = MAP_FAILED;
+
+ list_create(&vmm->vmm_memlist, sizeof (vmm_memseg_t),
+ offsetof(vmm_memseg_t, vms_list));
+
+ vmm->vmm_ctx = vm_open(name);
+ if (vmm->vmm_ctx == NULL) {
+ free(vmm);
+ return (NULL);
+ }
+
+ vmm_update_ncpu(vmm);
+
+ return (vmm);
+}
+
/*
 * Tear down a handle created by vmm_open_vm(): unmap any mapped guest
 * memory, destroy the (now empty) segment list, close the VM control
 * handle if one was opened, and free the handle itself.
 */
void
vmm_close_vm(vmm_t *vmm)
{
	vmm_unmap(vmm);

	list_destroy(&vmm->vmm_memlist);

	if (vmm->vmm_ctx != NULL)
		vm_close(vmm->vmm_ctx);

	free(vmm);
}
+
/*
 * Look up the guest memory mapping at or above guest-physical address
 * gpa and return a freshly allocated vmm_memseg_t describing it.
 * Returns NULL when no further mapping exists, or on ioctl/allocation
 * failure (errno left as set by the failing call).  The caller owns and
 * must free the returned structure.
 */
static vmm_memseg_t *
vmm_get_memseg(vmm_t *vmm, uintptr_t gpa)
{
	vmm_memseg_t ms, *ret;
	int error, flags;

	bzero(&ms, sizeof (vmm_memseg_t));
	ms.vms_gpa = gpa;
	/* vm_mmap_getnext() advances vms_gpa to the next mapping >= gpa. */
	error = vm_mmap_getnext(vmm->vmm_ctx, &ms.vms_gpa, &ms.vms_segid,
	    &ms.vms_segoff, &ms.vms_maplen, &ms.vms_prot, &flags);
	if (error)
		return (NULL);

	error = vm_get_memseg(vmm->vmm_ctx, ms.vms_segid, &ms.vms_seglen,
	    ms.vms_name, sizeof (ms.vms_name));
	if (error)
		return (NULL);

	/*
	 * Regular memory segments don't have a name, but devmem segments do.
	 * We can use that information to set the DEVMEM flag if necessary.
	 */
	ms.vms_flags = ms.vms_name[0] != '\0' ? VMM_MEMSEG_DEVMEM : 0;

	ret = malloc(sizeof (vmm_memseg_t));
	if (ret == NULL)
		return (NULL);

	*ret = ms;

	return (ret);
}
+
/*
 * Map the whole of guest memory into the local address space, read-only
 * or read-write as requested.  Builds the segment list, reserves one
 * contiguous PROT_NONE region spanning all guest-physical addresses,
 * then overlays each segment with a MAP_FIXED mapping of the VM device.
 * Returns 0 on success, -1 on failure (any partial state is undone via
 * vmm_unmap()).  Calling this while already mapped fails with EINVAL.
 */
int
vmm_map(vmm_t *vmm, boolean_t writable)
{
	uintptr_t last_gpa = 0;
	vmm_memseg_t *ms;
	int prot_write = writable ? PROT_WRITE : 0;

	if (vmm->vmm_mem != MAP_FAILED) {
		errno = EINVAL;
		return (-1);
	}

	assert(list_is_empty(&vmm->vmm_memlist));

	/*
	 * Walk all mappings in GPA order; after the loop last_gpa is the
	 * end of the highest mapping, i.e. the size of the reservation.
	 */
	for (;;) {
		ms = vmm_get_memseg(vmm, last_gpa);

		if (ms == NULL)
			break;

		last_gpa = ms->vms_gpa + ms->vms_maplen;
		list_insert_tail(&vmm->vmm_memlist, ms);
	}

	/* Reserve address space only; actual access goes via the overlays. */
	vmm->vmm_mem = mmap(NULL, last_gpa, PROT_NONE,
	    MAP_PRIVATE | MAP_ANON | MAP_NORESERVE, -1, 0);

	if (vmm->vmm_mem == MAP_FAILED)
		goto fail;

	for (ms = list_head(&vmm->vmm_memlist);
	    ms != NULL;
	    ms = list_next(&vmm->vmm_memlist, ms)) {
		/*
		 * RAM segments are mapped at their GPA in the device;
		 * devmem segments live at a device-specific offset.
		 */
		off_t mapoff = ms->vms_gpa;

		if ((ms->vms_flags & VMM_MEMSEG_DEVMEM) &&
		    vm_get_devmem_offset(vmm->vmm_ctx, ms->vms_segid, &mapoff)
		    != 0)
			goto fail;

		vmm->vmm_memsize += ms->vms_maplen;

		if (mmap(vmm->vmm_mem + ms->vms_gpa, ms->vms_maplen,
		    PROT_READ | prot_write, MAP_SHARED | MAP_FIXED,
		    vm_get_device_fd(vmm->vmm_ctx), mapoff) == MAP_FAILED)
			goto fail;
	}

	return (0);

fail:
	vmm_unmap(vmm);

	return (-1);
}
+
/*
 * Undo vmm_map(): unmap every segment overlay, free the segment list
 * entries, drop the backing reservation and reset the handle state.
 * Safe to call when nothing is mapped (used as the error-path cleanup
 * in vmm_map() and from vmm_close_vm()).
 */
void
vmm_unmap(vmm_t *vmm)
{
	while (!list_is_empty(&vmm->vmm_memlist)) {
		vmm_memseg_t *ms = list_remove_head(&vmm->vmm_memlist);

		if (vmm->vmm_mem != MAP_FAILED)
			munmap(vmm->vmm_mem + ms->vms_gpa, ms->vms_maplen);
		free(ms);
	}

	/* Drop the PROT_NONE reservation spanning guest memory. */
	if (vmm->vmm_mem != MAP_FAILED)
		munmap(vmm->vmm_mem, vmm->vmm_memsize);

	vmm->vmm_mem = MAP_FAILED;
	vmm->vmm_memsize = 0;
}
+
+ssize_t
+vmm_pread(vmm_t *vmm, void *buf, size_t len, uintptr_t addr)
+{
+ ssize_t count = 0;
+ vmm_memseg_t *ms;
+ ssize_t res = len;
+
+ for (ms = list_head(&vmm->vmm_memlist);
+ ms != NULL && len != 0;
+ ms = list_next(&vmm->vmm_memlist, ms)) {
+
+ if (addr >= ms->vms_gpa &&
+ addr < ms->vms_gpa + ms->vms_maplen) {
+ res = (addr + len) - (ms->vms_gpa + ms->vms_maplen);
+
+ if (res < 0)
+ res = 0;
+
+ bcopy(vmm->vmm_mem + addr, buf, len - res);
+ count += len - res;
+ addr += len - res;
+ len = res;
+ }
+ }
+
+ if (res)
+ errno = EFAULT;
+ else
+ errno = 0;
+
+ return (count);
+}
+
+ssize_t
+vmm_pwrite(vmm_t *vmm, const void *buf, size_t len, uintptr_t addr)
+{
+ ssize_t count = 0;
+ vmm_memseg_t *ms;
+ ssize_t res = len;
+
+ for (ms = list_head(&vmm->vmm_memlist);
+ ms != NULL;
+ ms = list_next(&vmm->vmm_memlist, ms)) {
+ if (addr >= ms->vms_gpa &&
+ addr < ms->vms_gpa + ms->vms_maplen) {
+ res = (addr + len) - (ms->vms_gpa + ms->vms_maplen);
+
+ if (res < 0)
+ res = 0;
+
+ bcopy(buf, vmm->vmm_mem + addr, len - res);
+ count += len - res;
+ addr += len - res;
+ len = res;
+ }
+ }
+
+ if (res)
+ errno = EFAULT;
+ else
+ errno = 0;
+
+ return (count);
+}
+
/*
 * Return the cached vCPU count (maintained by vmm_update_ncpu()).
 */
size_t
vmm_ncpu(vmm_t *vmm)
{
	return (vmm->vmm_ncpu);
}
+
/*
 * Return the total guest memory size as computed by vmm_map();
 * 0 when guest memory is not currently mapped.
 */
size_t
vmm_memsize(vmm_t *vmm)
{
	return (vmm->vmm_memsize);
}
+
+void
+vmm_cont(vmm_t *vmm)
+{
+ assert(vm_resume_cpu(vmm->vmm_ctx, -1) == 0);
+}
+
+int
+vmm_step(vmm_t *vmm, int vcpu)
+{
+ cpuset_t cpuset;
+ int ret;
+
+ if (vcpu >= vmm->vmm_ncpu) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ ret = vm_set_capability(vmm->vmm_ctx, vcpu, VM_CAP_MTRAP_EXIT, 1);
+ if (ret != 0)
+ return (-1);
+
+ assert(vm_resume_cpu(vmm->vmm_ctx, vcpu) == 0);
+
+ do {
+ (void) vm_debug_cpus(vmm->vmm_ctx, &cpuset);
+ } while (!CPU_ISSET(vcpu, &cpuset));
+
+ (void) vm_set_capability(vmm->vmm_ctx, vcpu, VM_CAP_MTRAP_EXIT, 0);
+
+ return (ret);
+}
+
+void
+vmm_stop(vmm_t *vmm)
+{
+ assert(vm_suspend_cpu(vmm->vmm_ctx, -1) == 0);
+
+ vmm_update_ncpu(vmm);
+}
+
/*
 * Mapping of KDI-defined registers to vmmapi-defined registers.
 * Registers not known to vmmapi use VM_REG_LAST, which is invalid and
 * causes an error in vm_{get,set}_register_set().
 *
 * This array must be kept in sync with the definitions in kdi_regs.h.
 * The CTASSERT below enforces the size; the ordering itself cannot be
 * checked at compile time and must be verified by inspection.
 */
static int vmm_kdi_regmap[] = {
	VM_REG_LAST,		/* KDIREG_SAVFP */
	VM_REG_LAST,		/* KDIREG_SAVPC */
	VM_REG_GUEST_RDI,	/* KDIREG_RDI */
	VM_REG_GUEST_RSI,	/* KDIREG_RSI */
	VM_REG_GUEST_RDX,	/* KDIREG_RDX */
	VM_REG_GUEST_RCX,	/* KDIREG_RCX */
	VM_REG_GUEST_R8,	/* KDIREG_R8 */
	VM_REG_GUEST_R9,	/* KDIREG_R9 */
	VM_REG_GUEST_RAX,	/* KDIREG_RAX */
	VM_REG_GUEST_RBX,	/* KDIREG_RBX */
	VM_REG_GUEST_RBP,	/* KDIREG_RBP */
	VM_REG_GUEST_R10,	/* KDIREG_R10 */
	VM_REG_GUEST_R11,	/* KDIREG_R11 */
	VM_REG_GUEST_R12,	/* KDIREG_R12 */
	VM_REG_GUEST_R13,	/* KDIREG_R13 */
	VM_REG_GUEST_R14,	/* KDIREG_R14 */
	VM_REG_GUEST_R15,	/* KDIREG_R15 */
	VM_REG_LAST,		/* KDIREG_FSBASE */
	VM_REG_LAST,		/* KDIREG_GSBASE */
	VM_REG_LAST,		/* KDIREG_KGSBASE */
	VM_REG_GUEST_CR2,	/* KDIREG_CR2 */
	VM_REG_GUEST_CR3,	/* KDIREG_CR3 */
	VM_REG_GUEST_DS,	/* KDIREG_DS */
	VM_REG_GUEST_ES,	/* KDIREG_ES */
	VM_REG_GUEST_FS,	/* KDIREG_FS */
	VM_REG_GUEST_GS,	/* KDIREG_GS */
	VM_REG_LAST,		/* KDIREG_TRAPNO */
	VM_REG_LAST,		/* KDIREG_ERR */
	VM_REG_GUEST_RIP,	/* KDIREG_RIP */
	VM_REG_GUEST_CS,	/* KDIREG_CS */
	VM_REG_GUEST_RFLAGS,	/* KDIREG_RFLAGS */
	VM_REG_GUEST_RSP,	/* KDIREG_RSP */
	VM_REG_GUEST_SS		/* KDIREG_SS */
};
CTASSERT(ARRAY_SIZE(vmm_kdi_regmap) == KDIREG_NGREG);
+
+/*
+ * Mapping of libvmm-defined registers to vmmapi-defined registers.
+ *
+ * This array must be kept in sync with the definitions in libvmm.h
+ */
+static int vmm_sys_regmap[] = {
+ VM_REG_GUEST_CR0, /* VMM_REG_CR0 */
+ VM_REG_GUEST_CR2, /* VMM_REG_CR2 */
+ VM_REG_GUEST_CR3, /* VMM_REG_CR3 */
+ VM_REG_GUEST_CR4, /* VMM_REG_CR4 */
+ VM_REG_GUEST_DR0, /* VMM_REG_DR0 */
+ VM_REG_GUEST_DR1, /* VMM_REG_DR1 */
+ VM_REG_GUEST_DR2, /* VMM_REG_DR2 */
+ VM_REG_GUEST_DR3, /* VMM_REG_DR3 */
+ VM_REG_GUEST_DR6, /* VMM_REG_DR6 */
+ VM_REG_GUEST_DR7, /* VMM_REG_DR7 */
+ VM_REG_GUEST_EFER, /* VMM_REG_EFER */
+ VM_REG_GUEST_PDPTE0, /* VMM_REG_PDPTE0 */
+ VM_REG_GUEST_PDPTE1, /* VMM_REG_PDPTE1 */
+ VM_REG_GUEST_PDPTE2, /* VMM_REG_PDPTE2 */
+ VM_REG_GUEST_PDPTE3, /* VMM_REG_PDPTE3 */
+ VM_REG_GUEST_INTR_SHADOW, /* VMM_REG_INTR_SHADOW */
+};
+
+/*
+ * Mapping of libvmm-defined descriptors to vmmapi-defined descriptors.
+ *
+ * This array must be kept in sync with the definitions in libvmm.h
+ */
+static int vmm_descmap[] = {
+ VM_REG_GUEST_GDTR,
+ VM_REG_GUEST_LDTR,
+ VM_REG_GUEST_IDTR,
+ VM_REG_GUEST_TR,
+ VM_REG_GUEST_CS,
+ VM_REG_GUEST_DS,
+ VM_REG_GUEST_ES,
+ VM_REG_GUEST_FS,
+ VM_REG_GUEST_GS,
+ VM_REG_GUEST_SS
+};
+
+static int
+vmm_mapreg(int reg)
+{
+ errno = 0;
+
+ if (reg < 0)
+ goto fail;
+
+ if (reg < KDIREG_NGREG)
+ return (vmm_kdi_regmap[reg]);
+
+ if (reg >= VMM_REG_OFFSET &&
+ reg < VMM_REG_OFFSET + ARRAY_SIZE(vmm_sys_regmap))
+ return (vmm_sys_regmap[reg - VMM_REG_OFFSET]);
+
+fail:
+ errno = EINVAL;
+ return (VM_REG_LAST);
+}
+
+static int
+vmm_mapdesc(int desc)
+{
+ errno = 0;
+
+ if (desc >= VMM_DESC_OFFSET &&
+ desc < VMM_DESC_OFFSET + ARRAY_SIZE(vmm_descmap))
+ return (vmm_descmap[desc - VMM_DESC_OFFSET]);
+
+ errno = EINVAL;
+ return (VM_REG_LAST);
+}
+
/*
 * Read one guest register (KDI or VMM_REG_* numbering) of vcpu into
 * *val.  Returns 0 on success, -1 on failure (errno EINVAL for an
 * unmappable register number, otherwise as set by vm_get_register()).
 */
int
vmm_getreg(vmm_t *vmm, int vcpu, int reg, uint64_t *val)
{
	reg = vmm_mapreg(reg);

	if (reg == VM_REG_LAST)
		return (-1);

	return (vm_get_register(vmm->vmm_ctx, vcpu, reg, val));
}
+
/*
 * Write one guest register (KDI or VMM_REG_* numbering) of vcpu.
 * Returns 0 on success, -1 on failure (errno EINVAL for an unmappable
 * register number, otherwise as set by vm_set_register()).
 */
int
vmm_setreg(vmm_t *vmm, int vcpu, int reg, uint64_t val)
{
	reg = vmm_mapreg(reg);

	if (reg == VM_REG_LAST)
		return (-1);

	return (vm_set_register(vmm->vmm_ctx, vcpu, reg, val));
}
+
+int
+vmm_get_regset(vmm_t *vmm, int vcpu, size_t nregs, const int *regnums,
+ uint64_t *regvals)
+{
+ int *vm_regnums;
+ int i;
+ int ret = -1;
+
+ vm_regnums = malloc(sizeof (int) * nregs);
+ if (vm_regnums == NULL)
+ return (ret);
+
+ for (i = 0; i != nregs; i++) {
+ vm_regnums[i] = vmm_mapreg(regnums[i]);
+ if (vm_regnums[i] == VM_REG_LAST)
+ goto fail;
+ }
+
+ ret = vm_get_register_set(vmm->vmm_ctx, vcpu, nregs, vm_regnums,
+ regvals);
+
+fail:
+ free(vm_regnums);
+ return (ret);
+}
+
+int
+vmm_set_regset(vmm_t *vmm, int vcpu, size_t nregs, const int *regnums,
+ uint64_t *regvals)
+{
+ int *vm_regnums;
+ int i;
+ int ret = -1;
+
+ vm_regnums = malloc(sizeof (int) * nregs);
+ if (vm_regnums == NULL)
+ return (ret);
+
+ for (i = 0; i != nregs; i++) {
+ vm_regnums[i] = vmm_mapreg(regnums[i]);
+ if (vm_regnums[i] == VM_REG_LAST)
+ goto fail;
+ }
+
+ ret = vm_set_register_set(vmm->vmm_ctx, vcpu, nregs, vm_regnums,
+ regvals);
+
+fail:
+ free(vm_regnums);
+ return (ret);
+}
+
/*
 * Fetch the segment/descriptor-table register identified by a
 * VMM_DESC_* number into *vd.  Returns 0 on success, -1 on failure.
 */
int
vmm_get_desc(vmm_t *vmm, int vcpu, int desc, vmm_desc_t *vd)
{
	desc = vmm_mapdesc(desc);
	if (desc == VM_REG_LAST)
		return (-1);

	return (vm_get_desc(vmm->vmm_ctx, vcpu, desc, &vd->vd_base, &vd->vd_lim,
	    &vd->vd_acc));
}
+
/*
 * Set the segment/descriptor-table register identified by a VMM_DESC_*
 * number from *vd.  Returns 0 on success, -1 on failure.
 */
int
vmm_set_desc(vmm_t *vmm, int vcpu, int desc, vmm_desc_t *vd)
{
	desc = vmm_mapdesc(desc);
	if (desc == VM_REG_LAST)
		return (-1);

	return (vm_set_desc(vmm->vmm_ctx, vcpu, desc, vd->vd_base, vd->vd_lim,
	    vd->vd_acc));
}
+
/*
 * Structure to hold MMU state during address translation.
 * The contents of vmm_mmu_regnum[] must be kept in sync with this:
 * vmm_get_regset() fills the struct by casting it to uint64_t[], so
 * field order must match the register order below.
 */
typedef struct vmm_mmu {
	uint64_t vm_cr0;
	uint64_t vm_cr3;
	uint64_t vm_cr4;
	uint64_t vm_efer;
} vmm_mmu_t;

static const int vmm_mmu_regnum[] = {
	VMM_REG_CR0,
	VMM_REG_CR3,
	VMM_REG_CR4,
	VMM_REG_EFER
};

/* x86 page table entry bits (see the paging chapters of the Intel SDM) */
#define	X86_PTE_P		0x001ULL	/* present */
#define	X86_PTE_PS		0x080ULL	/* large/huge page */

#define	X86_PTE_PHYSMASK	0x000ffffffffff000ULL
#define	X86_PAGE_SHIFT		12
#define	X86_PAGE_SIZE		(1ULL << X86_PAGE_SHIFT)

/* segment access-rights bits as returned in vmm_desc_t.vd_acc */
#define	X86_SEG_CODE_DATA	(1ULL << 4)
#define	X86_SEG_PRESENT		(1ULL << 7)
#define	X86_SEG_LONG		(1ULL << 13)
#define	X86_SEG_BIG		(1ULL << 14)
#define	X86_SEG_GRANULARITY	(1ULL << 15)
#define	X86_SEG_UNUSABLE	(1ULL << 16)

#define	X86_SEG_USABLE		(X86_SEG_PRESENT | X86_SEG_CODE_DATA)
#define	X86_SEG_USABLE_MASK	(X86_SEG_UNUSABLE | X86_SEG_USABLE)
+
/*
 * vmm_pte2paddr:
 *
 * Recursively calculate the physical address from a virtual address,
 * starting at the given PTE level using the given PTE.
 *
 * level counts down to 0 (the final PTE); level 4 is the initial call
 * with CR3 as the "PTE", which is why the presence check is skipped
 * there.  ia32 selects 32-bit PTEs with 10 index bits per level
 * (non-PAE 2-level paging) vs 64-bit PTEs with 9 index bits.
 * Returns 0 on success with *paddr set, -1 with errno EFAULT on a
 * non-present entry or failed guest-physical read.
 */
static int
vmm_pte2paddr(vmm_t *vmm, uint64_t pte, boolean_t ia32, int level,
    uint64_t vaddr, uint64_t *paddr)
{
	int pte_size = ia32 ? sizeof (uint32_t) : sizeof (uint64_t);
	int off_bits = ia32 ? 10 : 9;
	boolean_t hugepage = B_FALSE;
	uint64_t offset;
	uint64_t off_mask, off_shift;

	/* CR3 (level 4) has no present bit; all real PTEs must be present. */
	if (level < 4 && (pte & X86_PTE_P) == 0) {
		errno = EFAULT;
		return (-1);
	}

	off_shift = X86_PAGE_SHIFT + off_bits * level;
	off_mask = (1ULL << off_shift) - 1;

	offset = vaddr & off_mask;

	/* PS at level 1 or 2 means a large page: keep the full offset. */
	if ((level == 1 || level == 2) && (pte & X86_PTE_PS) != 0) {
		hugepage = B_TRUE;
	} else {
		if (level > 0) {
			/* extract this level's index, scaled by entry size */
			offset >>= off_shift - off_bits;
			offset <<= X86_PAGE_SHIFT - off_bits;
		}
		off_mask = 0xfff;
	}

	*paddr = (pte & X86_PTE_PHYSMASK & ~off_mask) + offset;

	if (level == 0 || hugepage)
		return (0);

	/* *paddr points at the next-level PTE; read it and recurse. */
	pte = 0;
	if (vmm_pread(vmm, &pte, pte_size, *paddr) != pte_size)
		return (-1);
	return (vmm_pte2paddr(vmm, pte, ia32, level - 1, vaddr, paddr));
}
+
+static vmm_mode_t
+vmm_vcpu_mmu_mode(vmm_t *vmm, int vcpu, vmm_mmu_t *mmu)
+{
+ if ((mmu->vm_cr0 & CR0_PE) == 0)
+ return (VMM_MODE_REAL);
+ else if ((mmu->vm_cr4 & CR4_PAE) == 0)
+ return (VMM_MODE_PROT);
+ else if ((mmu->vm_efer & AMD_EFER_LME) == 0)
+ return (VMM_MODE_PAE);
+ else
+ return (VMM_MODE_LONG);
+}
+
/*
 * Determine the paging mode of a vCPU by fetching CR0/CR3/CR4/EFER.
 * Returns VMM_MODE_UNKNOWN if the registers cannot be read.
 */
vmm_mode_t
vmm_vcpu_mode(vmm_t *vmm, int vcpu)
{
	vmm_mmu_t mmu = { 0 };

	/* The cast relies on vmm_mmu_t's field order matching vmm_mmu_regnum. */
	if (vmm_get_regset(vmm, vcpu, ARRAY_SIZE(vmm_mmu_regnum),
	    vmm_mmu_regnum, (uint64_t *)&mmu) != 0)
		return (VMM_MODE_UNKNOWN);

	return (vmm_vcpu_mmu_mode(vmm, vcpu, &mmu));
}
+
/*
 * Determine the instruction set (16/32/64-bit) a vCPU is currently
 * executing, from the L and D/B bits of its CS access rights.
 * The L+B combination is reserved on x86 and yields VMM_ISA_UNKNOWN,
 * as does a failure to read CS.
 */
vmm_isa_t
vmm_vcpu_isa(vmm_t *vmm, int vcpu)
{
	vmm_desc_t cs;

	if (vmm_get_desc(vmm, vcpu, VMM_DESC_CS, &cs) != 0)
		return (VMM_ISA_UNKNOWN);

	switch (cs.vd_acc & (X86_SEG_BIG | X86_SEG_LONG)) {
	case 0x0:		/* 16b code segment */
		return (VMM_ISA_16);
	case X86_SEG_LONG:	/* 64b code segment */
		return (VMM_ISA_64);
	case X86_SEG_BIG:	/* 32b code segment */
		return (VMM_ISA_32);
	}

	return (VMM_ISA_UNKNOWN);
}
+
/*
 * vmm_vtol:
 *
 * Translate a virtual address to a linear address on a certain vCPU,
 * using the specified segment register or descriptor according to the mode.
 * Returns 0 with *laddr set on success, -1 with errno EFAULT on a
 * segmentation violation.
 *
 * NOTE(review): the granularity scaling below computes limit * 4096;
 * the architectural expanded limit is (limit << 12) | 0xFFF, so the
 * last page below the limit may be rejected — unless vd_lim is already
 * the expanded byte limit as stored in the VMCS/VMCB, in which case no
 * scaling should be applied at all.  Confirm vd_lim's semantics against
 * vmmapi's vm_get_desc().  Expand-down data segments are also not
 * handled specially.
 */
int
vmm_vtol(vmm_t *vmm, int vcpu, int seg, uint64_t vaddr, uint64_t *laddr)
{
	vmm_desc_t desc;
	uint64_t limit;

	if (vmm_get_desc(vmm, vcpu, seg, &desc) != 0)
		return (-1);

	switch (vmm_vcpu_mode(vmm, vcpu)) {
	case VMM_MODE_REAL:
		/* Real mode has no FS/GS. */
		if (seg == VMM_DESC_FS || seg == VMM_DESC_GS)
			goto fault;
		/* FALLTHRU */
	case VMM_MODE_PROT:
	case VMM_MODE_PAE:
		if ((desc.vd_acc & X86_SEG_USABLE_MASK) != X86_SEG_USABLE)
			/* unusable, system segment, or not present */
			goto fault;

		limit = desc.vd_lim;
		if (desc.vd_acc & X86_SEG_GRANULARITY)
			limit *= 4096;

		if (vaddr > limit)
			goto fault;
		/* FALLTHRU */
	case VMM_MODE_LONG:
		/* Long mode ignores limits; base is 0 except FS/GS. */
		*laddr = desc.vd_base + vaddr;
		return (0);

	default:
	fault:
		errno = EFAULT;
		return (-1);
	}

}
+
/*
 * vmm_vtop:
 *
 * Translate a virtual address to a guest physical address on a certain vCPU,
 * according to the mode the vCPU is in: first apply segmentation via
 * vmm_vtol(), then walk the page tables (if paging is enabled) via
 * vmm_pte2paddr().  Returns 0 with *paddr set on success, -1 on failure.
 */
int
vmm_vtop(vmm_t *vmm, int vcpu, int seg, uint64_t vaddr, uint64_t *paddr)
{
	vmm_mmu_t mmu = { 0 };
	int ret = 0;

	if (vmm_vtol(vmm, vcpu, seg, vaddr, &vaddr) != 0)
		return (-1);

	/* Field order of vmm_mmu_t must match vmm_mmu_regnum for this cast. */
	if (vmm_get_regset(vmm, vcpu, ARRAY_SIZE(vmm_mmu_regnum),
	    vmm_mmu_regnum, (uint64_t *)&mmu) != 0)
		return (-1);

	if ((mmu.vm_cr0 & CR0_PG) == 0) {
		/* no paging, physical equals virtual */
		*paddr = vaddr;
		return (0);
	}

	switch (vmm_vcpu_mmu_mode(vmm, vcpu, &mmu)) {
	case VMM_MODE_PROT:
		/* protected mode, no PAE: 2-level paging, 32bit PTEs */
		ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_TRUE, 2, vaddr, paddr);
		break;
	case VMM_MODE_PAE:
		/* protected mode with PAE: 3-level paging, 64bit PTEs */
		ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_FALSE, 3, vaddr, paddr);
		break;
	case VMM_MODE_LONG:
		/* long mode: 4-level paging, 64bit PTEs */
		ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_FALSE, 4, vaddr, paddr);
		break;
	default:
		ret = -1;
	}

	return (ret);
}
+
+ssize_t
+vmm_vread(vmm_t *vmm, int vcpu, int seg, void *buf, size_t len, uintptr_t addr)
+{
+ ssize_t res = 0;
+ uint64_t paddr;
+ size_t plen;
+ uint64_t boundary;
+
+ while (len != 0) {
+ if (vmm_vtop(vmm, vcpu, seg, addr, &paddr) != 0) {
+ errno = EFAULT;
+ return (0);
+ }
+
+ boundary = (addr + X86_PAGE_SIZE) & ~(X86_PAGE_SIZE - 1);
+ if (addr + len > boundary)
+ plen = boundary - addr;
+ else
+ plen = len;
+
+ if (vmm_pread(vmm, buf, plen, paddr) != plen)
+ return (0);
+ len -= plen;
+ addr += plen;
+ buf += plen;
+ res += plen;
+ }
+
+ return (res);
+}
+
+ssize_t
+vmm_vwrite(vmm_t *vmm, int vcpu, int seg, const void *buf, size_t len,
+ uintptr_t addr)
+{
+ ssize_t res = 0;
+ uint64_t paddr;
+ size_t plen;
+ uint64_t boundary;
+
+ while (len != 0) {
+ if (vmm_vtop(vmm, vcpu, seg, addr, &paddr) != 0) {
+ errno = EFAULT;
+ return (0);
+ }
+
+ boundary = (addr + X86_PAGE_SIZE) & ~(X86_PAGE_SIZE - 1);
+ if (addr + len > boundary)
+ plen = boundary - addr;
+ else
+ plen = len;
+
+ if (vmm_pwrite(vmm, buf, plen, paddr) != plen)
+ return (0);
+ len -= plen;
+ addr += plen;
+ buf += plen;
+ res += plen;
+ }
+
+ return (res);
+}
diff --git a/usr/src/lib/libvmm/libvmm.h b/usr/src/lib/libvmm/libvmm.h
new file mode 100644
index 0000000000..49a721b486
--- /dev/null
+++ b/usr/src/lib/libvmm/libvmm.h
@@ -0,0 +1,122 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#ifndef _LIBVMM_H
+#define _LIBVMM_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
/* Opaque per-VM library handle (defined in libvmm.c). */
typedef struct vmm vmm_t;

/* Decoded segment/descriptor-table register. */
typedef struct vmm_desc {
	uint64_t vd_base;	/* base address */
	uint32_t vd_lim;	/* segment limit */
	uint32_t vd_acc;	/* access rights / attribute bits */
} vmm_desc_t;
+
+
+/*
+ * This enum must be kept in sync with vmm_sys_regmap[] in libvmm.c.
+ */
+#define VMM_REG_OFFSET 0x100
+enum vmm_regs {
+ VMM_REG_CR0 = VMM_REG_OFFSET,
+ VMM_REG_CR2,
+ VMM_REG_CR3,
+ VMM_REG_CR4,
+ VMM_REG_DR0,
+ VMM_REG_DR1,
+ VMM_REG_DR2,
+ VMM_REG_DR3,
+ VMM_REG_DR6,
+ VMM_REG_DR7,
+ VMM_REG_EFER,
+ VMM_REG_PDPTE0,
+ VMM_REG_PDPTE1,
+ VMM_REG_PDPTE2,
+ VMM_REG_PDPTE3,
+ VMM_REG_INTR_SHADOW
+};
+
+/*
+ * This enum must be kept in sync with vmm_descmap[] in libvmm.c.
+ */
+#define VMM_DESC_OFFSET 0x200
+enum vmm_descs {
+ VMM_DESC_GDTR = VMM_DESC_OFFSET,
+ VMM_DESC_LDTR,
+ VMM_DESC_IDTR,
+ VMM_DESC_TR,
+ VMM_DESC_CS,
+ VMM_DESC_DS,
+ VMM_DESC_ES,
+ VMM_DESC_FS,
+ VMM_DESC_GS,
+ VMM_DESC_SS
+};
+
+typedef enum {
+ VMM_MODE_UNKNOWN = 0,
+ VMM_MODE_REAL,
+ VMM_MODE_PROT,
+ VMM_MODE_PAE,
+ VMM_MODE_LONG
+} vmm_mode_t;
+
+typedef enum {
+ VMM_ISA_UNKNOWN = 0,
+ VMM_ISA_16,
+ VMM_ISA_32,
+ VMM_ISA_64
+} vmm_isa_t;
+
+vmm_t *vmm_open_vm(const char *);
+void vmm_close_vm(vmm_t *);
+
+int vmm_map(vmm_t *, boolean_t);
+void vmm_unmap(vmm_t *);
+
+ssize_t vmm_pread(vmm_t *, void *, size_t, uintptr_t);
+ssize_t vmm_pwrite(vmm_t *, const void *, size_t, uintptr_t);
+ssize_t vmm_vread(vmm_t *, int, int, void *, size_t, uintptr_t);
+ssize_t vmm_vwrite(vmm_t *, int, int, const void *, size_t, uintptr_t);
+
+size_t vmm_ncpu(vmm_t *);
+size_t vmm_memsize(vmm_t *);
+
+void vmm_cont(vmm_t *);
+int vmm_step(vmm_t *, int);
+void vmm_stop(vmm_t *);
+
+int vmm_getreg(vmm_t *, int, int, uint64_t *);
+int vmm_setreg(vmm_t *, int, int, uint64_t);
+int vmm_get_regset(vmm_t *, int, size_t, const int *, uint64_t *);
+int vmm_set_regset(vmm_t *, int, size_t, const int *, uint64_t *);
+
+int vmm_get_desc(vmm_t *, int, int, vmm_desc_t *);
+int vmm_set_desc(vmm_t *, int, int, vmm_desc_t *);
+
+vmm_mode_t vmm_vcpu_mode(vmm_t *, int);
+vmm_isa_t vmm_vcpu_isa(vmm_t *, int);
+int vmm_vtol(vmm_t *, int, int, uint64_t, uint64_t *);
+int vmm_vtop(vmm_t *, int, int, uint64_t, uint64_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LIBVMM_H */
diff --git a/usr/src/lib/libvmm/mapfile-vers b/usr/src/lib/libvmm/mapfile-vers
new file mode 100644
index 0000000000..19a15802ac
--- /dev/null
+++ b/usr/src/lib/libvmm/mapfile-vers
@@ -0,0 +1,60 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2018 Joyent, Inc.
+#
+
+#
+# MAPFILE HEADER START
+#
+# WARNING: STOP NOW. DO NOT MODIFY THIS FILE.
+# Object versioning must comply with the rules detailed in
+#
+# usr/src/lib/README.mapfiles
+#
+# You should not be making modifications here until you've read the most current
+# copy of that file. If you need help, contact a gatekeeper for guidance.
+#
+# MAPFILE HEADER END
+#
+
+$mapfile_version 2
+
+SYMBOL_VERSION ILLUMOSprivate {
+ global:
+ vmm_open_vm;
+ vmm_close_vm;
+ vmm_map;
+ vmm_unmap;
+ vmm_pread;
+ vmm_pwrite;
+ vmm_ncpu;
+ vmm_memsize;
+ vmm_cont;
+ vmm_step;
+ vmm_stop;
+ vmm_setreg;
+ vmm_getreg;
+ vmm_set_regset;
+ vmm_get_regset;
+ vmm_set_desc;
+ vmm_get_desc;
+ vmm_vcpu_isa;
+ vmm_vcpu_mode;
+ vmm_vtol;
+ vmm_vtop;
+ vmm_vread;
+ vmm_vwrite;
+
+ local:
+ *;
+};