author     Patrick Mooney <pmooney@pfmooney.com>    2021-05-28 21:07:11 +0000
committer  Patrick Mooney <pmooney@oxide.computer>  2021-07-27 19:26:22 +0000
commit     b57f5d3e6a2df8d435e606797cf3934811848343 (patch)
tree       31d0b366057848a88837b15524905a703c3bdf9c
parent     ed1e93792d7c9ea04a0cb44cffe34c24c135b002 (diff)
13833 want bhyve memory reservoir
13822 bhyve memory should exert memory pressure
13834 want extensible page_resv
13821 vmmctl ioctls should have more structure
Reviewed by: Andy Fiddaman <andy@omnios.org>
Reviewed by: Jason King <jason.brian.king@gmail.com>
Reviewed by: Dan Cross <cross@oxidecomputer.com>
Reviewed by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
Reviewed by: Mike Zeller <mike.zeller@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>
-rw-r--r--  usr/src/cmd/Makefile                                1
-rw-r--r--  usr/src/cmd/bhyve/bhyverun.c                       11
-rw-r--r--  usr/src/cmd/bhyvectl/bhyvectl.c                     7
-rw-r--r--  usr/src/cmd/rsrvrctl/Makefile                      48
-rw-r--r--  usr/src/cmd/rsrvrctl/rsrvrctl.c                   164
-rw-r--r--  usr/src/lib/libvmmapi/common/vmmapi.c             106
-rw-r--r--  usr/src/lib/libvmmapi/common/vmmapi.h               6
-rw-r--r--  usr/src/pkg/manifests/system-bhyve.mf               1
-rw-r--r--  usr/src/uts/common/vm/page.h                       12
-rw-r--r--  usr/src/uts/common/vm/vm_page.c                    72
-rw-r--r--  usr/src/uts/i86pc/Makefile.files                    3
-rw-r--r--  usr/src/uts/i86pc/io/vmm/seg_vmm.c (renamed from usr/src/uts/i86pc/vm/seg_vmm.c)  91
-rw-r--r--  usr/src/uts/i86pc/io/vmm/sys/seg_vmm.h             30
-rw-r--r--  usr/src/uts/i86pc/io/vmm/sys/vmm_impl.h             2
-rw-r--r--  usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h           2
-rw-r--r--  usr/src/uts/i86pc/io/vmm/sys/vmm_reservoir.h       40
-rw-r--r--  usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h               7
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm.c                     22
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_reservoir.c          820
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c            103
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c              150
-rw-r--r--  usr/src/uts/i86pc/sys/vmm.h                        26
-rw-r--r--  usr/src/uts/i86pc/sys/vmm_dev.h                    27
-rw-r--r--  usr/src/uts/i86pc/vm/seg_vmm.h                     50
24 files changed, 1441 insertions(+), 360 deletions(-)
diff --git a/usr/src/cmd/Makefile b/usr/src/cmd/Makefile
index 22afdf737a..8159ad677b 100644
--- a/usr/src/cmd/Makefile
+++ b/usr/src/cmd/Makefile
@@ -497,6 +497,7 @@ i386_SUBDIRS= \
nvmeadm \
pptadm \
rdmsr \
+ rsrvrctl \
rtc \
ucodeadm \
xhci \
diff --git a/usr/src/cmd/bhyve/bhyverun.c b/usr/src/cmd/bhyve/bhyverun.c
index c777529858..b127c7cadc 100644
--- a/usr/src/cmd/bhyve/bhyverun.c
+++ b/usr/src/cmd/bhyve/bhyverun.c
@@ -39,7 +39,7 @@
*
* Copyright 2015 Pluribus Networks Inc.
* Copyright 2018 Joyent, Inc.
- * Copyright 2020 Oxide Computer Company
+ * Copyright 2021 Oxide Computer Company
*/
#include <sys/cdefs.h>
@@ -1253,8 +1253,15 @@ do_open(const char *vmname)
if (lpc_bootrom())
romboot = true;
-
+#ifndef __FreeBSD__
+ uint64_t create_flags = 0;
+ if (get_config_bool_default("memory.use_reservoir", false)) {
+ create_flags |= VCF_RESERVOIR_MEM;
+ }
+ error = vm_create(vmname, create_flags);
+#else
error = vm_create(vmname);
+#endif /* __FreeBSD__ */
if (error) {
if (errno == EEXIST) {
if (romboot) {
diff --git a/usr/src/cmd/bhyvectl/bhyvectl.c b/usr/src/cmd/bhyvectl/bhyvectl.c
index 313a1a37f4..4fc6ddc251 100644
--- a/usr/src/cmd/bhyvectl/bhyvectl.c
+++ b/usr/src/cmd/bhyvectl/bhyvectl.c
@@ -39,7 +39,7 @@
*
* Copyright 2015 Pluribus Networks Inc.
* Copyright 2019 Joyent, Inc.
- * Copyright 2020 Oxide Computer Company
+ * Copyright 2021 Oxide Computer Company
*/
#include <sys/cdefs.h>
@@ -1953,8 +1953,13 @@ main(int argc, char *argv[])
error = 0;
+#ifndef __FreeBSD__
+ if (!error && create)
+ error = vm_create(vmname, 0);
+#else
if (!error && create)
error = vm_create(vmname);
+#endif /* __FreeBSD__ */
if (!error) {
ctx = vm_open(vmname);
diff --git a/usr/src/cmd/rsrvrctl/Makefile b/usr/src/cmd/rsrvrctl/Makefile
new file mode 100644
index 0000000000..f51df92730
--- /dev/null
+++ b/usr/src/cmd/rsrvrctl/Makefile
@@ -0,0 +1,48 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2021 Oxide Computer Company
+#
+
+PROG = rsrvrctl
+
+include ../Makefile.cmd
+include ../Makefile.cmd.64
+include ../Makefile.ctf
+
+SRCS = rsrvrctl.c
+OBJS = $(SRCS:.c=.o)
+
+CLEANFILES = $(PROG)
+CLOBBERFILES += $(ROOTUSRSBINPROG)
+
+.KEEP_STATE:
+
+CFLAGS += $(CCVERBOSE)
+CPPFLAGS = -I$(COMPAT)/bhyve -I$(CONTRIB)/bhyve \
+ -I$(COMPAT)/bhyve/amd64 -I$(CONTRIB)/bhyve/amd64 \
+ $(CPPFLAGS.master) \
+ -I$(SRC)/uts/i86pc/io/vmm \
+ -I$(SRC)/uts/i86pc
+
+all: $(PROG)
+
+$(PROG): $(OBJS)
+ $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS)
+ $(POST_PROCESS)
+
+install: all $(ROOTLIBPROG)
+
+clean:
+ $(RM) $(OBJS) $(CLEANFILES)
+
+include ../Makefile.targ
diff --git a/usr/src/cmd/rsrvrctl/rsrvrctl.c b/usr/src/cmd/rsrvrctl/rsrvrctl.c
new file mode 100644
index 0000000000..e189520a1c
--- /dev/null
+++ b/usr/src/cmd/rsrvrctl/rsrvrctl.c
@@ -0,0 +1,164 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2021 Oxide Computer Company
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/types.h>
+
+#include <sys/vmm_dev.h>
+
+static void
+usage(const char *pname)
+{
+ fprintf(stderr,
+ "Usage: %s [-a add] [-r remove] [-q]\n"
+ "\t-a <SZ> add SZ MiB to the reservoir\n"
+ "\t-r <SZ> remove SZ MiB from the reservoir\n"
+ "\t-q query reservoir state\n", pname);
+}
+
+static bool
+parse_size(const char *arg, size_t *resp)
+{
+ size_t res;
+
+ errno = 0;
+ res = strtoul(arg, NULL, 0);
+ if (errno != 0) {
+ return (false);
+ }
+
+ *resp = (res * 1024 * 1024);
+ return (true);
+}
+
+static void
+do_add(int fd, size_t sz)
+{
+ int res;
+
+ res = ioctl(fd, VMM_RESV_ADD, sz);
+ if (res != 0) {
+ perror("Could not add to reservoir");
+ exit(EXIT_FAILURE);
+ }
+}
+
+static void
+do_remove(int fd, size_t sz)
+{
+ int res;
+
+ res = ioctl(fd, VMM_RESV_REMOVE, sz);
+ if (res != 0) {
+ perror("Could not remove from reservoir");
+ exit(EXIT_FAILURE);
+ }
+}
+
+static void
+do_query(int fd)
+{
+ struct vmm_resv_query data;
+ int res;
+
+ res = ioctl(fd, VMM_RESV_QUERY, &data);
+ if (res != 0) {
+ perror("Could not query reservoir info");
+ return;
+ }
+
+ printf("Free KiB:\t%llu\n"
+ "Allocated KiB:\t%llu\n"
+ "Transient Allocated KiB:\t%llu\n"
+ "Size limit KiB:\t%llu\n",
+ data.vrq_free_sz / 1024,
+ data.vrq_alloc_sz / 1024,
+ data.vrq_alloc_transient_sz / 1024,
+ data.vrq_limit / 1024);
+}
+
+int
+main(int argc, char *argv[])
+{
+ char c;
+ const char *opt_a = NULL, *opt_r = NULL;
+ bool opt_q = false;
+ int fd;
+
+ const char *pname = argv[0];
+
+ while ((c = getopt(argc, argv, "a:r:qh")) != -1) {
+ switch (c) {
+ case 'a':
+ opt_a = optarg;
+ break;
+ case 'r':
+ opt_r = optarg;
+ break;
+ case 'q':
+ opt_q = true;
+ break;
+ case 'h':
+ usage(pname);
+ return (EXIT_SUCCESS);
+ default:
+ usage(pname);
+ return (EXIT_FAILURE);
+ }
+ }
+ if (optind < argc ||
+ (opt_a == NULL && opt_r == NULL && !opt_q) ||
+ (opt_a != NULL && opt_r != NULL)) {
+ usage(pname);
+ return (EXIT_FAILURE);
+ }
+
+ fd = open(VMM_CTL_DEV, O_EXCL | O_RDWR);
+ if (fd < 0) {
+ perror("Could not open vmmctl");
+ usage(pname);
+ return (EXIT_FAILURE);
+ }
+
+ if (opt_a != NULL) {
+ size_t sz;
+
+ if (!parse_size(opt_a, &sz)) {
+ perror("Invalid size");
+ usage(pname);
+ return (EXIT_FAILURE);
+ }
+
+ do_add(fd, sz);
+ }
+ if (opt_r != NULL) {
+ size_t sz;
+
+ if (!parse_size(opt_r, &sz)) {
+ perror("Invalid size");
+ usage(pname);
+ return (EXIT_FAILURE);
+ }
+ do_remove(fd, sz);
+ }
+ if (opt_q) {
+ do_query(fd);
+ }
+
+ (void) close(fd);
+ return (0);
+}
diff --git a/usr/src/lib/libvmmapi/common/vmmapi.c b/usr/src/lib/libvmmapi/common/vmmapi.c
index ba3fb7f8dd..ec27949a43 100644
--- a/usr/src/lib/libvmmapi/common/vmmapi.c
+++ b/usr/src/lib/libvmmapi/common/vmmapi.c
@@ -39,7 +39,7 @@
*
* Copyright 2015 Pluribus Networks Inc.
* Copyright 2019 Joyent, Inc.
- * Copyright 2020 Oxide Computer Company
+ * Copyright 2021 Oxide Computer Company
*/
#include <sys/cdefs.h>
@@ -109,12 +109,31 @@ struct vmctx {
#ifdef __FreeBSD__
#define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x)))
#define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x)))
-#else
-#define CREATE(x) vm_do_ctl(VMM_CREATE_VM, (x))
-#define DESTROY(x) vm_do_ctl(VMM_DESTROY_VM, (x))
+int
+vm_create(const char *name)
+{
+ /* Try to load vmm(4) module before creating a guest. */
+ if (modfind("vmm") < 0)
+ kldload("vmm");
+ return (CREATE((char *)name));
+}
+
+void
+vm_destroy(struct vmctx *vm)
+{
+ assert(vm != NULL);
+
+ if (vm->fd >= 0)
+ close(vm->fd);
+ DESTROY(vm->name);
+
+ free(vm);
+}
+
+#else
static int
-vm_do_ctl(int cmd, const char *name)
+vm_do_ctl(int cmd, void *req)
{
int ctl_fd;
@@ -123,7 +142,7 @@ vm_do_ctl(int cmd, const char *name)
return (-1);
}
- if (ioctl(ctl_fd, cmd, name) == -1) {
+ if (ioctl(ctl_fd, cmd, req) == -1) {
int err = errno;
/* Do not lose ioctl errno through the close(2) */
@@ -135,6 +154,46 @@ vm_do_ctl(int cmd, const char *name)
return (0);
}
+
+int
+vm_create(const char *name, uint64_t flags)
+{
+ struct vm_create_req req;
+
+ (void) strncpy(req.name, name, VM_MAX_NAMELEN);
+ req.flags = flags;
+
+ return (vm_do_ctl(VMM_CREATE_VM, &req));
+}
+
+void
+vm_close(struct vmctx *vm)
+{
+ assert(vm != NULL);
+ assert(vm->fd >= 0);
+
+ (void) close(vm->fd);
+
+ free(vm);
+}
+
+void
+vm_destroy(struct vmctx *vm)
+{
+ struct vm_destroy_req req;
+
+ assert(vm != NULL);
+
+ if (vm->fd >= 0) {
+ (void) close(vm->fd);
+ vm->fd = -1;
+ }
+
+ (void) strncpy(req.name, vm->name, VM_MAX_NAMELEN);
+ (void) vm_do_ctl(VMM_DESTROY_VM, &req);
+
+ free(vm);
+}
#endif
static int
@@ -155,17 +214,6 @@ vm_device_open(const char *name)
return (fd);
}
-int
-vm_create(const char *name)
-{
-#ifdef __FreeBSD__
- /* Try to load vmm(4) module before creating a guest. */
- if (modfind("vmm") < 0)
- kldload("vmm");
-#endif
- return (CREATE((char *)name));
-}
-
struct vmctx *
vm_open(const char *name)
{
@@ -189,30 +237,6 @@ err:
return (NULL);
}
-#ifndef __FreeBSD__
-void
-vm_close(struct vmctx *vm)
-{
- assert(vm != NULL);
- assert(vm->fd >= 0);
-
- (void) close(vm->fd);
-
- free(vm);
-}
-#endif
-
-void
-vm_destroy(struct vmctx *vm)
-{
- assert(vm != NULL);
-
- if (vm->fd >= 0)
- close(vm->fd);
- DESTROY(vm->name);
-
- free(vm);
-}
int
vm_parse_memsize(const char *optarg, size_t *ret_memsize)
diff --git a/usr/src/lib/libvmmapi/common/vmmapi.h b/usr/src/lib/libvmmapi/common/vmmapi.h
index 79c7dc02ee..e239b70a56 100644
--- a/usr/src/lib/libvmmapi/common/vmmapi.h
+++ b/usr/src/lib/libvmmapi/common/vmmapi.h
@@ -39,7 +39,7 @@
*
* Copyright 2015 Pluribus Networks Inc.
* Copyright 2019 Joyent, Inc.
- * Copyright 2020 Oxide Computer Company
+ * Copyright 2021 Oxide Computer Company
*/
#ifndef _VMMAPI_H_
@@ -134,7 +134,11 @@ int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid,
int vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len);
+#ifndef __FreeBSD__
+int vm_create(const char *name, uint64_t flags);
+#else
int vm_create(const char *name);
+#endif /* __FreeBSD__ */
int vm_get_device_fd(struct vmctx *ctx);
struct vmctx *vm_open(const char *name);
#ifndef __FreeBSD__
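Taken together, the non-FreeBSD libvmmapi changes above mean a consumer now passes a flags word to vm_create(), with VCF_RESERVOIR_MEM requesting reservoir-backed guest memory. A minimal caller sketch, not part of this change: the instance name is made up, error handling is trimmed, and the header paths are assumed from how bhyve itself consumes these interfaces.

#include <err.h>
#include <stdlib.h>

#include <machine/vmm.h>	/* assumed home of VCF_RESERVOIR_MEM */
#include <vmmapi.h>

int
main(void)
{
	const char *name = "demo-vm";	/* hypothetical instance name */
	struct vmctx *ctx;

	/* Create the instance, asking that guest memory come from the reservoir. */
	if (vm_create(name, VCF_RESERVOIR_MEM) != 0)
		err(EXIT_FAILURE, "vm_create");

	if ((ctx = vm_open(name)) == NULL)
		err(EXIT_FAILURE, "vm_open");

	/* ... size guest memory, configure vCPUs, run the guest ... */

	/* vm_destroy() closes the device fd and tears down the kernel state. */
	vm_destroy(ctx);
	return (0);
}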
diff --git a/usr/src/pkg/manifests/system-bhyve.mf b/usr/src/pkg/manifests/system-bhyve.mf
index 0495d9f649..3f67fa743e 100644
--- a/usr/src/pkg/manifests/system-bhyve.mf
+++ b/usr/src/pkg/manifests/system-bhyve.mf
@@ -48,6 +48,7 @@ file path=usr/kernel/drv/$(ARCH64)/vmm
file path=usr/kernel/drv/ppt.conf
file path=usr/kernel/drv/viona.conf
file path=usr/kernel/drv/vmm.conf
+file path=usr/lib/rsrvrctl mode=0555
file path=usr/sbin/bhyve mode=0555
file path=usr/sbin/bhyvectl mode=0555
file path=usr/sbin/pptadm mode=0555
diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h
index 8747b96acc..5b98acd24f 100644
--- a/usr/src/uts/common/vm/page.h
+++ b/usr/src/uts/common/vm/page.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2021 Oxide Computer Company
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -340,13 +341,13 @@ struct as;
*
* So, as a quick summary:
*
- * pse_mutex[]'s protect the p_selock and p_cv fields.
+ * pse_mutex[]'s protect the p_selock and p_cv fields.
*
- * p_selock protects the p_free, p_age, p_vnode, p_offset and p_hash,
+ * p_selock protects the p_free, p_age, p_vnode, p_offset and p_hash,
*
- * ph_mutex[]'s protect the page_hash[] array and its chains.
+ * ph_mutex[]'s protect the page_hash[] array and its chains.
*
- * vph_mutex[]'s protect the v_pages field and the vp page chains.
+ * vph_mutex[]'s protect the v_pages field and the vp page chains.
*
* First lock the page, then the hash chain, then the vnode chain. When
* this is not possible `trylocks' must be used. Sleeping while holding
@@ -762,6 +763,7 @@ void page_lock_delete(page_t *);
int page_deleted(page_t *);
int page_pp_lock(page_t *, int, int);
void page_pp_unlock(page_t *, int, int);
+int page_xresv(pgcnt_t, uint_t, int (*)(void));
int page_resv(pgcnt_t, uint_t);
void page_unresv(pgcnt_t);
void page_pp_useclaim(page_t *, page_t *, uint_t);
@@ -1078,7 +1080,7 @@ typedef struct kpm_hlk {
* The state about how a kpm page is mapped and whether it is ready to go
* is indicated by the following 1 byte kpm_spage structure. This byte is
* split into two 4-bit parts - kp_mapped and kp_mapped_go.
- * - kp_mapped == 1 the page is mapped cacheable
+ * - kp_mapped == 1 the page is mapped cacheable
* - kp_mapped == 2 the page is mapped non-cacheable
* - kp_mapped_go == 1 the mapping is ready to be dropped in
* - kp_mapped_go == 0 the mapping is not ready to be dropped in.
diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c
index bcc6d05d47..89751b7b2c 100644
--- a/usr/src/uts/common/vm/vm_page.c
+++ b/usr/src/uts/common/vm/vm_page.c
@@ -23,6 +23,7 @@
* Copyright (c) 2015, Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
* Copyright (c) 2015, 2016 by Delphix. All rights reserved.
* Copyright 2018 Joyent, Inc.
+ * Copyright 2021 Oxide Computer Company
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -3919,29 +3920,68 @@ page_pp_unlock(
}
/*
- * This routine reserves availrmem for npages;
- * flags: KM_NOSLEEP or KM_SLEEP
- * returns 1 on success or 0 on failure
+ * This routine reserves availrmem for npages.
+ * It returns 1 on success or 0 on failure.
+ *
+ * flags: KM_NOSLEEP or KM_SLEEP
+ * cb_wait: called to induce delay when KM_SLEEP reservation requires kmem
+ * reaping to potentially succeed. If the callback returns 0, the
+ * reservation attempts will cease to repeat and page_xresv() may
+ * report a failure. If cb_wait is NULL, the traditional delay(hz/2)
+ * behavior will be used while waiting for a reap.
*/
int
-page_resv(pgcnt_t npages, uint_t flags)
+page_xresv(pgcnt_t npages, uint_t flags, int (*cb_wait)(void))
{
mutex_enter(&freemem_lock);
- while (availrmem < tune.t_minarmem + npages) {
- if (flags & KM_NOSLEEP) {
- mutex_exit(&freemem_lock);
- return (0);
- }
+ if (availrmem >= tune.t_minarmem + npages) {
+ availrmem -= npages;
mutex_exit(&freemem_lock);
- page_needfree(npages);
- kmem_reap();
- delay(hz >> 2);
- page_needfree(-(spgcnt_t)npages);
- mutex_enter(&freemem_lock);
+ return (1);
+ } else if ((flags & KM_NOSLEEP) != 0) {
+ mutex_exit(&freemem_lock);
+ return (0);
}
- availrmem -= npages;
mutex_exit(&freemem_lock);
- return (1);
+
+ /*
+ * We signal memory pressure to the system by elevating 'needfree'.
+ * Processes such as kmem reaping, pageout, and ZFS ARC shrinking can
+ * then respond to said pressure by freeing pages.
+ */
+ page_needfree(npages);
+ int nobail = 1;
+ do {
+ kmem_reap();
+ if (cb_wait == NULL) {
+ delay(hz >> 2);
+ } else {
+ nobail = cb_wait();
+ }
+
+ mutex_enter(&freemem_lock);
+ if (availrmem >= tune.t_minarmem + npages) {
+ availrmem -= npages;
+ mutex_exit(&freemem_lock);
+ page_needfree(-(spgcnt_t)npages);
+ return (1);
+ }
+ mutex_exit(&freemem_lock);
+ } while (nobail != 0);
+ page_needfree(-(spgcnt_t)npages);
+
+ return (0);
+}
+
+/*
+ * This routine reserves availrmem for npages;
+ * flags: KM_NOSLEEP or KM_SLEEP
+ * returns 1 on success or 0 on failure
+ */
+int
+page_resv(pgcnt_t npages, uint_t flags)
+{
+ return (page_xresv(npages, flags, NULL));
}
/*
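The new page_xresv() keeps page_resv()'s contract but lets the caller supply its own wait policy: when the callback returns 0, a KM_SLEEP-style reservation gives up instead of retrying forever. A hedged kernel-context sketch of such a caller follows; the helper names are illustrative, and the vmm reservoir code later in this change uses the same pattern.

/* Kernel-context sketch; the usual <sys/...> and <vm/page.h> headers are elided. */

/* Return 0 to abandon the reservation attempt, nonzero to keep retrying. */
static int
example_resv_wait(void)
{
	/* Sleep a quarter second, bailing out if a signal is pending. */
	if (delay_sig(hz >> 2) != 0)
		return (0);
	return (1);
}

static int
example_reserve(pgcnt_t npages)
{
	/*
	 * Loops (with kmem reaping) until availrmem covers the request or
	 * example_resv_wait() asks to give up.
	 */
	if (page_xresv(npages, KM_SLEEP, example_resv_wait) == 0)
		return (EINTR);

	/* ... use the reservation; balance it later with page_unresv(npages) ... */
	return (0);
}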
diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files
index 558d19ad3f..0a3fad877c 100644
--- a/usr/src/uts/i86pc/Makefile.files
+++ b/usr/src/uts/i86pc/Makefile.files
@@ -117,7 +117,6 @@ CORE_OBJS += \
ppage.o \
pwrnow.o \
smt.o \
- seg_vmm.o \
speedstep.o \
ssp.o \
startup.o \
@@ -270,6 +269,8 @@ VMM_OBJS += vmm.o \
svm_support.o \
amdv.o \
vmm_gpt.o \
+ seg_vmm.o \
+ vmm_reservoir.o \
vmm_sol_vm.o \
vmm_sol_glue.o \
vmm_sol_ept.o \
diff --git a/usr/src/uts/i86pc/vm/seg_vmm.c b/usr/src/uts/i86pc/io/vmm/seg_vmm.c
index beb5e81d53..23a8da3bc5 100644
--- a/usr/src/uts/i86pc/vm/seg_vmm.c
+++ b/usr/src/uts/i86pc/io/vmm/seg_vmm.c
@@ -11,6 +11,7 @@
/*
* Copyright 2018 Joyent, Inc.
+ * Copyright 2021 Oxide Computer Company
*/
/*
@@ -40,7 +41,16 @@
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
-#include <vm/seg_vmm.h>
+
+#include <sys/seg_vmm.h>
+
+typedef struct segvmm_data {
+ krwlock_t svmd_lock;
+ vm_object_t svmd_obj;
+ uintptr_t svmd_obj_off;
+ uchar_t svmd_prot;
+ size_t svmd_softlockcnt;
+} segvmm_data_t;
static int segvmm_dup(struct seg *, struct seg *);
@@ -105,31 +115,14 @@ segvmm_create(struct seg **segpp, void *argsp)
segvmm_crargs_t *cra = argsp;
segvmm_data_t *data;
- /*
- * Check several aspects of the mapping request to ensure validity:
- * - kernel pages must reside entirely in kernel space
- * - target protection must be user-accessible
- * - kernel address must be page-aligned
- */
- if ((uintptr_t)cra->kaddr <= _userlimit ||
- ((uintptr_t)cra->kaddr + seg->s_size) < (uintptr_t)cra->kaddr ||
- (cra->prot & PROT_USER) == 0 ||
- ((uintptr_t)cra->kaddr & PAGEOFFSET) != 0) {
- return (EINVAL);
- }
-
data = kmem_zalloc(sizeof (*data), KM_SLEEP);
rw_init(&data->svmd_lock, NULL, RW_DEFAULT, NULL);
- data->svmd_kaddr = (uintptr_t)cra->kaddr;
+ data->svmd_obj = cra->obj;
+ data->svmd_obj_off = cra->offset;
data->svmd_prot = cra->prot;
- data->svmd_cookie = cra->cookie;
- data->svmd_hold = cra->hold;
- data->svmd_rele = cra->rele;
- /* Since initial checks have passed, grab a reference on the cookie */
- if (data->svmd_hold != NULL) {
- data->svmd_hold(data->svmd_cookie);
- }
+ /* Grab a hold on the VM object for the duration of this seg mapping */
+ vm_object_reference(data->svmd_obj);
seg->s_ops = &segvmm_ops;
seg->s_data = data;
@@ -146,16 +139,12 @@ segvmm_dup(struct seg *seg, struct seg *newseg)
newsvmd = kmem_zalloc(sizeof (segvmm_data_t), KM_SLEEP);
rw_init(&newsvmd->svmd_lock, NULL, RW_DEFAULT, NULL);
- newsvmd->svmd_kaddr = svmd->svmd_kaddr;
+ newsvmd->svmd_obj = svmd->svmd_obj;
+ newsvmd->svmd_obj_off = svmd->svmd_obj_off;
newsvmd->svmd_prot = svmd->svmd_prot;
- newsvmd->svmd_cookie = svmd->svmd_cookie;
- newsvmd->svmd_hold = svmd->svmd_hold;
- newsvmd->svmd_rele = svmd->svmd_rele;
/* Grab another hold for the duplicate segment */
- if (svmd->svmd_hold != NULL) {
- newsvmd->svmd_hold(newsvmd->svmd_cookie);
- }
+ vm_object_reference(svmd->svmd_obj);
newseg->s_ops = seg->s_ops;
newseg->s_data = newsvmd;
@@ -180,10 +169,8 @@ segvmm_unmap(struct seg *seg, caddr_t addr, size_t len)
/* Unconditionally unload the entire segment range. */
hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP);
- /* Release the hold this segment possessed */
- if (svmd->svmd_rele != NULL) {
- svmd->svmd_rele(svmd->svmd_cookie);
- }
+ /* Release the VM object hold this segment possessed */
+ vm_object_deallocate(svmd->svmd_obj);
seg_free(seg);
return (0);
@@ -206,41 +193,23 @@ static int
segvmm_fault_in(struct hat *hat, struct seg *seg, uintptr_t va, size_t len)
{
segvmm_data_t *svmd = seg->s_data;
- const uintptr_t koff = svmd->svmd_kaddr - (uintptr_t)seg->s_base;
const uintptr_t end = va + len;
const uintptr_t prot = svmd->svmd_prot;
- /* Stick to the simple non-large-page case for now */
va &= PAGEMASK;
-
+ uintptr_t off = va - (uintptr_t)seg->s_base;
do {
- htable_t *ht;
- uint_t entry, lvl;
- size_t psz;
pfn_t pfn;
- const uintptr_t kaddr = va + koff;
-
- ASSERT(kaddr >= (uintptr_t)svmd->svmd_kaddr);
- ASSERT(kaddr < ((uintptr_t)svmd->svmd_kaddr + seg->s_size));
- ht = htable_getpage(kas.a_hat, kaddr, &entry);
- if (ht == NULL) {
- return (-1);
- }
- lvl = ht->ht_level;
- pfn = PTE2PFN(x86pte_get(ht, entry), lvl);
- htable_release(ht);
+ pfn = vm_object_pfn(svmd->svmd_obj, off);
if (pfn == PFN_INVALID) {
return (-1);
}
- /* For the time being, handling for large pages is absent. */
- psz = PAGESIZE;
- pfn += mmu_btop(kaddr & LEVEL_OFFSET(lvl));
-
- hat_devload(hat, (caddr_t)va, psz, pfn, prot, HAT_LOAD);
-
- va = va + psz;
+ /* Ignore any large-page possibilities for now */
+ hat_devload(hat, (caddr_t)va, PAGESIZE, pfn, prot, HAT_LOAD);
+ va += PAGESIZE;
+ off += PAGESIZE;
} while (va < end);
return (0);
@@ -399,8 +368,8 @@ static int
segvmm_gettype(struct seg *seg, caddr_t addr)
{
/*
- * Since already-existing kernel pages are being mapped into userspace,
- * always report the segment type as shared.
+ * Since already-existing vmm reservoir pages are being mapped into
+ * userspace, always report the segment type as shared.
*/
return (MAP_SHARED);
}
@@ -457,8 +426,8 @@ segvmm_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
{
segvmm_data_t *svmd = seg->s_data;
- memidp->val[0] = (uintptr_t)svmd->svmd_kaddr;
- memidp->val[1] = (uintptr_t)(addr - seg->s_base);
+ memidp->val[0] = (uintptr_t)svmd->svmd_obj;
+ memidp->val[1] = (uintptr_t)(addr - seg->s_base) + svmd->svmd_obj_off;
return (0);
}
diff --git a/usr/src/uts/i86pc/io/vmm/sys/seg_vmm.h b/usr/src/uts/i86pc/io/vmm/sys/seg_vmm.h
new file mode 100644
index 0000000000..a4f72f816e
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/sys/seg_vmm.h
@@ -0,0 +1,30 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ * Copyright 2021 Oxide Computer Company
+ */
+
+#ifndef _VM_SEG_VMM_H
+#define _VM_SEG_VMM_H
+
+#include <sys/vmm_vm.h>
+
+typedef struct segvmm_crargs {
+ uchar_t prot; /* protection */
+ vm_object_t obj;
+ uintptr_t offset;
+} segvmm_crargs_t;
+
+int segvmm_create(struct seg **, void *);
+
+#endif /* _VM_SEG_VMM_H */
diff --git a/usr/src/uts/i86pc/io/vmm/sys/vmm_impl.h b/usr/src/uts/i86pc/io/vmm/sys/vmm_impl.h
index 606be4bbae..2b6f41ec54 100644
--- a/usr/src/uts/i86pc/io/vmm/sys/vmm_impl.h
+++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_impl.h
@@ -40,7 +40,7 @@
struct vmm_devmem_entry {
list_node_t vde_node;
int vde_segid;
- char vde_name[SPECNAMELEN + 1];
+ char vde_name[VM_MAX_SEG_NAMELEN];
size_t vde_len;
off_t vde_off;
};
diff --git a/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h b/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h
index 8441b51e03..4191aaee5c 100644
--- a/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h
+++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h
@@ -115,7 +115,7 @@ struct vmm_ops {
extern struct vmm_ops vmm_ops_intel;
extern struct vmm_ops vmm_ops_amd;
-int vm_create(const char *name, struct vm **retvm);
+int vm_create(const char *name, uint64_t flags, struct vm **retvm);
void vm_destroy(struct vm *vm);
int vm_reinit(struct vm *vm);
const char *vm_name(struct vm *vm);
diff --git a/usr/src/uts/i86pc/io/vmm/sys/vmm_reservoir.h b/usr/src/uts/i86pc/io/vmm/sys/vmm_reservoir.h
new file mode 100644
index 0000000000..b8215ce654
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_reservoir.h
@@ -0,0 +1,40 @@
+
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2021 Oxide Computer Company
+ */
+
+#ifndef _SYS_VMM_RESERVOIR_H
+#define _SYS_VMM_RESERVOIR_H
+
+#include <sys/types.h>
+#include <sys/cred.h>
+
+struct vmmr_region;
+typedef struct vmmr_region vmmr_region_t;
+
+void vmmr_init();
+void vmmr_fini();
+bool vmmr_is_empty();
+
+int vmmr_alloc(size_t, bool, vmmr_region_t **);
+void *vmmr_region_mem_at(vmmr_region_t *, uintptr_t);
+pfn_t vmmr_region_pfn_at(vmmr_region_t *, uintptr_t);
+void vmmr_free(vmmr_region_t *);
+
+int vmmr_add(size_t, bool);
+int vmmr_remove(size_t, bool);
+
+int vmmr_ioctl(int, intptr_t, int, cred_t *, int *);
+
+#endif /* _SYS_VMM_RESERVOIR_H */
diff --git a/usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h b/usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h
index 6c7f9d423e..76d5fec8b7 100644
--- a/usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h
+++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h
@@ -160,8 +160,6 @@ int vm_segmap_obj(vm_object_t, off_t, size_t, struct as *, caddr_t *, uint_t,
int vm_segmap_space(struct vmspace *, off_t, struct as *, caddr_t *, off_t,
uint_t, uint_t, uint_t);
void *vmspace_find_kva(struct vmspace *, uintptr_t, size_t);
-void vmm_arena_init(void);
-void vmm_arena_fini(void);
typedef int (*pmap_pinit_t)(struct pmap *pmap);
@@ -171,13 +169,12 @@ void vmspace_free(struct vmspace *);
int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int);
int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
vm_prot_t prot, vm_page_t *ma, int max_count);
-void vmm_arena_fini(void);
-
-struct vm_object *vm_object_allocate(objtype_t, vm_pindex_t);
+struct vm_object *vm_object_allocate(objtype_t, vm_pindex_t, bool);
void vm_object_deallocate(vm_object_t);
void vm_object_reference(vm_object_t);
int vm_object_set_memattr(vm_object_t, vm_memattr_t);
+pfn_t vm_object_pfn(vm_object_t, uintptr_t);
#define VM_OBJECT_WLOCK(vmo) mutex_enter(&(vmo)->vmo_lock)
#define VM_OBJECT_WUNLOCK(vmo) mutex_exit(&(vmo)->vmo_lock)
diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c
index eecff88b7d..80c9ec6bd7 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm.c
@@ -60,6 +60,7 @@ __FBSDID("$FreeBSD$");
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/systm.h>
+#include <sys/sunddi.h>
#include <machine/pcb.h>
#include <machine/smp.h>
@@ -191,6 +192,8 @@ struct vm {
uint64_t boot_tsc_offset; /* (i) TSC offset at VM boot */
struct ioport_config ioports; /* (o) ioport handling */
+
+ bool mem_transient; /* (o) alloc transient memory */
};
static int vmm_initialized;
@@ -490,7 +493,7 @@ uint_t cores_per_package = 1;
uint_t threads_per_core = 1;
int
-vm_create(const char *name, struct vm **retvm)
+vm_create(const char *name, uint64_t flags, struct vm **retvm)
{
struct vm *vm;
struct vmspace *vmspace;
@@ -502,8 +505,8 @@ vm_create(const char *name, struct vm **retvm)
if (!vmm_initialized)
return (ENXIO);
- if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
- return (EINVAL);
+ /* Name validation has already occurred */
+ VERIFY3U(strnlen(name, VM_MAX_NAMELEN), <, VM_MAX_NAMELEN);
vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS);
if (vmspace == NULL)
@@ -512,6 +515,7 @@ vm_create(const char *name, struct vm **retvm)
vm = malloc(sizeof (struct vm), M_VM, M_WAITOK | M_ZERO);
strcpy(vm->name, name);
vm->vmspace = vmspace;
+ vm->mem_transient = (flags & VCF_RESERVOIR_MEM) == 0;
vm->sockets = 1;
vm->cores = cores_per_package; /* XXX backwards compatibility */
@@ -708,21 +712,12 @@ vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
struct mem_seg *seg;
vm_object_t obj;
-#ifndef __FreeBSD__
- extern pgcnt_t get_max_page_get(void);
-#endif
-
if (ident < 0 || ident >= VM_MAX_MEMSEGS)
return (EINVAL);
if (len == 0 || (len & PAGE_MASK))
return (EINVAL);
-#ifndef __FreeBSD__
- if (len > ptob(get_max_page_get()))
- return (EINVAL);
-#endif
-
seg = &vm->mem_segs[ident];
if (seg->object != NULL) {
if (seg->len == len && seg->sysmem == sysmem)
@@ -731,7 +726,8 @@ vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
return (EINVAL);
}
- obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
+ obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT,
+ vm->mem_transient);
if (obj == NULL)
return (ENOMEM);
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_reservoir.c b/usr/src/uts/i86pc/io/vmm/vmm_reservoir.c
new file mode 100644
index 0000000000..1bb64a4851
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm_reservoir.c
@@ -0,0 +1,820 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2021 Oxide Computer Company
+ */
+
+/*
+ * VMM Memory Reservoir
+ *
+ *
+ * In order to make the allocation of large (multi-GiB) chunks of memory
+ * for bhyve VMs easier, we introduce the "VMM Reservoir", where system
+ * operators can set aside a substantial portion of system memory exclusively
+ * for VMs. This memory is unavailable for general use by the rest of the
+ * system. Rather than having to scour the freelist, reap kmem caches, or put
+ * pressure on the ARC, bhyve guest memory allocations can quickly determine if
+ * there is adequate reservoir memory available. Since the pages stored in the
+ * reservoir are pre-zeroed, it can be immediately used when allocated to a
+ * guest. When the memory is returned to the reservoir, it is zeroed once more
+ * to avoid leaking any sensitive data from that guest.
+ *
+ *
+ * Transient Allocations
+ *
+ * While the explicit reservoir model may work well for some applications,
+ * others may want a more traditional model, where pages for guest memory
+ * objects are allocated on demand, rather than from a pool set aside from the
+ * system. In this case, the allocation can be made in "transient" mode, where
+ * the memory is allocated normally, even if there is free capacity in the
+ * reservoir. When use of the transient allocation is complete (the guest is
+ * halted and destroyed), the pages will be freed back to the system, rather
+ * than added back to the reservoir.
+ *
+ * From an implementation standpoint, transient allocations follow the same
+ * code paths as ones using the reservoir normally. Those allocations have a
+ * tag which marks them as transient, and used/free size tallies are maintained
+ * separately for normal and transient operations. When performing a transient
+ * allocation, that amount of memory is immediately added to the reservoir,
+ * from which the allocation can be made. When freeing a transient allocation,
+ * a matching amount of memory is removed from the reservoir as part of the
+ * operation. This allows both allocation types to coexist without too much
+ * additional machinery.
+ *
+ *
+ * Administration
+ *
+ * Operators may increase, decrease, and query the amount of memory
+ * allocated to the reservoir, and from it to VMs, via ioctls against the vmmctl
+ * device. The total amount added to the reservoir is arbitrarily limited at
+ * this time by `vmmr_total_limit` which defaults to 80% of physmem. This is
+ * done to prevent the reservoir from inadvertently growing to a size where the
+ * system has inadequate memory to make forward progress. Memory may only be
+ * removed from the reservoir when it is free (not allocated by any guest VMs).
+ *
+ *
+ * Page Tracking
+ *
+ * The reservoir currently uses vnode association to keep track of pages under
+ * its control (either designated to the reservoir and free, or allocated to a
+ * guest VM object). This means using the existing VM system primitives for
+ * page_t instances being associated with a given (vnode, offset) tuple. It
+ * means that spans of pages, either free or allocated, need only to store a
+ * length (of the span) and an offset (into the vnode) in order to gain access
+ * to all of the underlying pages associated with that span. Associating the
+ * pages against `kvps[KV_VVP]` (the VMM kernel vnode) means they will be
+ * properly tracked as KAS pages, but be excluded from normal dumps (unless the
+ * operator has chosen to dump all of RAM).
+ */
+ */
+
+#include <sys/types.h>
+#include <sys/mutex.h>
+#include <sys/avl.h>
+#include <sys/list.h>
+#include <sys/machparam.h>
+#include <sys/kmem.h>
+#include <sys/stddef.h>
+#include <sys/null.h>
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/sunddi.h>
+#include <sys/policy.h>
+#include <vm/seg_kmem.h>
+#include <vm/hat_i86.h>
+
+#include <sys/vmm_reservoir.h>
+#include <sys/vmm_dev.h>
+
+static kmutex_t vmmr_lock;
+
+static size_t vmmr_free_sz;
+static size_t vmmr_free_transient_sz;
+static size_t vmmr_adding_sz;
+static size_t vmmr_alloc_sz;
+static size_t vmmr_alloc_transient_sz;
+static size_t vmmr_empty_sz;
+
+static uintptr_t vmmr_empty_last;
+/* Upper limit for the size (free + allocated) of the reservoir */
+static size_t vmmr_total_limit;
+
+/* VA range allocated from the VMM arena for the mappings */
+static uintptr_t vmmr_va;
+static uintptr_t vmmr_va_sz;
+
+/* Pair of AVL trees to store set of spans ordered by addr and size */
+typedef struct vmmr_treepair {
+ avl_tree_t by_addr;
+ avl_tree_t by_size;
+} vmmr_treepair_t;
+
+/* Spans of free memory in the reservoir */
+static vmmr_treepair_t vmmr_free_tp;
+
+/* Spans of empty (not backed by memory) space in the reservoir */
+static vmmr_treepair_t vmmr_empty_tp;
+
+/* Regions of memory allocated from the reservoir */
+static list_t vmmr_alloc_regions;
+
+struct vmmr_span {
+ uintptr_t vs_addr;
+ size_t vs_size;
+ avl_node_t vs_by_addr;
+ avl_node_t vs_by_size;
+ uintptr_t vs_region_addr;
+};
+typedef struct vmmr_span vmmr_span_t;
+
+struct vmmr_region {
+ size_t vr_size;
+ avl_tree_t vr_spans;
+ list_node_t vr_node;
+ bool vr_transient;
+};
+
+static int
+vmmr_cmp_addr(const void *a, const void *b)
+{
+ const vmmr_span_t *sa = a;
+ const vmmr_span_t *sb = b;
+
+ if (sa->vs_addr == sb->vs_addr) {
+ return (0);
+ } else if (sa->vs_addr < sb->vs_addr) {
+ return (-1);
+ } else {
+ return (1);
+ }
+}
+
+static int
+vmmr_cmp_size(const void *a, const void *b)
+{
+ const vmmr_span_t *sa = a;
+ const vmmr_span_t *sb = b;
+
+ if (sa->vs_size == sb->vs_size) {
+ /*
+ * Since discontiguous spans could have the same size in a
+ * by-size tree, differentiate them (as required by AVL) by
+ * address so they can safely coexist while remaining sorted.
+ */
+ return (vmmr_cmp_addr(a, b));
+ } else if (sa->vs_size < sb->vs_size) {
+ return (-1);
+ } else {
+ return (1);
+ }
+}
+
+static int
+vmmr_cmp_region_addr(const void *a, const void *b)
+{
+ const vmmr_span_t *sa = a;
+ const vmmr_span_t *sb = b;
+
+ if (sa->vs_region_addr == sb->vs_region_addr) {
+ return (0);
+ } else if (sa->vs_region_addr < sb->vs_region_addr) {
+ return (-1);
+ } else {
+ return (1);
+ }
+}
+
+static void
+vmmr_tp_init(vmmr_treepair_t *tree)
+{
+ avl_create(&tree->by_addr, vmmr_cmp_addr, sizeof (vmmr_span_t),
+ offsetof(vmmr_span_t, vs_by_addr));
+ avl_create(&tree->by_size, vmmr_cmp_size, sizeof (vmmr_span_t),
+ offsetof(vmmr_span_t, vs_by_size));
+}
+
+static void
+vmmr_tp_destroy(vmmr_treepair_t *tree)
+{
+ void *vcp = NULL;
+ vmmr_span_t *span;
+
+ while (avl_destroy_nodes(&tree->by_addr, &vcp) != NULL) {
+ /* Freeing spans will be done when tearing down by-size tree */
+ }
+ while ((span = avl_destroy_nodes(&tree->by_size, &vcp)) != NULL) {
+ kmem_free(span, sizeof (*span));
+ }
+ avl_destroy(&tree->by_addr);
+ avl_destroy(&tree->by_size);
+}
+
+/*
+ * Insert a vmmr_span_t into a treepair, concatenating if possible with adjacent
+ * span(s). Such concatenation could result in the `to_add` span being freed,
+ * so the caller cannot use it after this returns.
+ */
+static void
+vmmr_tp_insert_concat(vmmr_span_t *to_add, vmmr_treepair_t *tree)
+{
+ avl_tree_t *by_addr = &tree->by_addr;
+ avl_tree_t *by_size = &tree->by_size;
+ vmmr_span_t *node;
+ avl_index_t where;
+
+ /* This addr should not already exist in the treepair */
+ node = avl_find(by_addr, to_add, &where);
+ ASSERT3P(node, ==, NULL);
+
+ node = avl_nearest(by_addr, where, AVL_BEFORE);
+ if (node != NULL &&
+ (node->vs_addr + node->vs_size) == to_add->vs_addr) {
+ /* concat with preceding item */
+ avl_remove(by_addr, node);
+ avl_remove(by_size, node);
+ node->vs_size += to_add->vs_size;
+ kmem_free(to_add, sizeof (*to_add));
+
+ /*
+ * Since this now-concatenated span could be adjacent to one
+ * trailing it, fall through to perform that check.
+ */
+ to_add = node;
+ }
+
+ node = avl_nearest(by_addr, where, AVL_AFTER);
+ if (node != NULL &&
+ (to_add->vs_addr + to_add->vs_size) == node->vs_addr) {
+ /* concat with trailing item */
+ avl_remove(by_addr, node);
+ avl_remove(by_size, node);
+ node->vs_addr = to_add->vs_addr;
+ node->vs_size += to_add->vs_size;
+ avl_add(by_addr, node);
+ avl_add(by_size, node);
+
+ kmem_free(to_add, sizeof (*to_add));
+ return;
+ }
+
+ /* simply insert */
+ avl_add(by_addr, to_add);
+ avl_add(by_size, to_add);
+}
+
+/*
+ * Remove a vmmr_span_t from a treepair, splitting if necessary when a span of
+ * the exact target size is not present, but a larger one is. May return a span
+ * with a size smaller than the target if splitting is not an option.
+ */
+static vmmr_span_t *
+vmmr_tp_remove_split(size_t target_sz, vmmr_treepair_t *tree)
+{
+ avl_tree_t *by_addr = &tree->by_addr;
+ avl_tree_t *by_size = &tree->by_size;
+ vmmr_span_t *span;
+ avl_index_t where;
+
+ ASSERT3U(target_sz, !=, 0);
+ ASSERT(!avl_is_empty(by_addr));
+ ASSERT(!avl_is_empty(by_size));
+
+ vmmr_span_t search = { .vs_size = target_sz };
+ span = avl_find(by_size, &search, &where);
+ if (span == NULL) {
+ /* Try for a larger span (instead of exact match) */
+ span = avl_nearest(by_size, where, AVL_AFTER);
+ if (span == NULL) {
+ /*
+ * Caller will need to collect several smaller spans in
+ * order to fulfill their request.
+ */
+ span = avl_nearest(by_size, where, AVL_BEFORE);
+ ASSERT3P(span, !=, NULL);
+ }
+ }
+
+ if (span->vs_size <= target_sz) {
+ avl_remove(by_size, span);
+ avl_remove(by_addr, span);
+
+ return (span);
+ } else {
+ /* Split off adequate chunk from larger span */
+ uintptr_t start = span->vs_addr + span->vs_size - target_sz;
+
+ avl_remove(by_size, span);
+ span->vs_size -= target_sz;
+ avl_add(by_size, span);
+
+ vmmr_span_t *split_span =
+ kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
+ split_span->vs_addr = start;
+ split_span->vs_size = target_sz;
+
+ return (split_span);
+ }
+}
+
+void
+vmmr_init()
+{
+ mutex_init(&vmmr_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ /*
+ * `vmmr_total_limit` represents the absolute maximum size of the VMM
+ * memory reservoir. It is meant to provide some measure of protection
+ * against an operator pushing the system into unrecoverable memory
+ * starvation through explicit or transient additions to the reservoir.
+ *
+ * There will be many situations where this limit would be inadequate to
+ * prevent kernel memory starvation in the face of certain operator
+ * actions. It is a balance to be struck between safety and allowing
+ * large systems to reach high utilization.
+ *
+ * The value is based off of pages_pp_maximum: "Number of currently
+ * available pages that cannot be 'locked'". It is sized as all of
+ * `physmem` less 120% of `pages_pp_maximum`.
+ */
+ vmmr_total_limit =
+ (((physmem * 10) - (pages_pp_maximum * 12)) * PAGESIZE) / 10;
+
+ vmmr_empty_last = 0;
+ vmmr_free_sz = 0;
+ vmmr_alloc_sz = 0;
+ vmmr_empty_sz = 0;
+ vmmr_adding_sz = 0;
+ vmmr_free_transient_sz = 0;
+ vmmr_alloc_transient_sz = 0;
+
+ vmmr_tp_init(&vmmr_free_tp);
+ vmmr_tp_init(&vmmr_empty_tp);
+
+ list_create(&vmmr_alloc_regions, sizeof (vmmr_region_t),
+ offsetof(vmmr_region_t, vr_node));
+
+ /* Grab a chunk of VA for the reservoir */
+ vmmr_va_sz = physmem * PAGESIZE;
+ vmmr_va = (uintptr_t)vmem_alloc(kvmm_arena, vmmr_va_sz, VM_SLEEP);
+}
+
+void
+vmmr_fini()
+{
+ mutex_enter(&vmmr_lock);
+ VERIFY3U(vmmr_alloc_sz, ==, 0);
+ VERIFY3U(vmmr_free_sz, ==, 0);
+ VERIFY3U(vmmr_adding_sz, ==, 0);
+ VERIFY3U(vmmr_alloc_transient_sz, ==, 0);
+ VERIFY3U(vmmr_free_transient_sz, ==, 0);
+ VERIFY(avl_is_empty(&vmmr_free_tp.by_addr));
+ VERIFY(avl_is_empty(&vmmr_free_tp.by_size));
+ VERIFY(list_is_empty(&vmmr_alloc_regions));
+
+ vmmr_tp_destroy(&vmmr_free_tp);
+ vmmr_tp_destroy(&vmmr_empty_tp);
+ list_destroy(&vmmr_alloc_regions);
+
+ /* Release reservoir VA chunk */
+ vmem_free(kvmm_arena, (void *)vmmr_va, vmmr_va_sz);
+ vmmr_va = 0;
+ vmmr_va_sz = 0;
+ vmmr_total_limit = 0;
+ vmmr_empty_last = 0;
+
+ mutex_exit(&vmmr_lock);
+ mutex_destroy(&vmmr_lock);
+}
+
+bool
+vmmr_is_empty()
+{
+ mutex_enter(&vmmr_lock);
+ bool res = (vmmr_alloc_sz == 0 && vmmr_alloc_transient_sz == 0 &&
+ vmmr_free_sz == 0 && vmmr_free_transient_sz == 0);
+ mutex_exit(&vmmr_lock);
+ return (res);
+}
+
+int
+vmmr_alloc(size_t sz, bool transient, vmmr_region_t **resp)
+{
+ VERIFY3U(sz & PAGEOFFSET, ==, 0);
+
+ if (!transient) {
+ mutex_enter(&vmmr_lock);
+ if (sz > vmmr_free_sz) {
+ mutex_exit(&vmmr_lock);
+ return (ENOSPC);
+ }
+ } else {
+ int err;
+
+ err = vmmr_add(sz, true);
+ if (err != 0) {
+ return (err);
+ }
+ mutex_enter(&vmmr_lock);
+ VERIFY3U(vmmr_free_transient_sz, >=, sz);
+ }
+
+ vmmr_region_t *region;
+ region = kmem_zalloc(sizeof (vmmr_region_t), KM_SLEEP);
+ avl_create(&region->vr_spans, vmmr_cmp_region_addr,
+ sizeof (vmmr_span_t), offsetof(vmmr_span_t, vs_by_addr));
+ region->vr_size = sz;
+
+ size_t remain = sz;
+ uintptr_t map_at = 0;
+ while (remain > 0) {
+ vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);
+
+ /*
+ * We have already ensured that adequate free memory is present
+ * in the reservoir for this allocation.
+ */
+ VERIFY3P(span, !=, NULL);
+ ASSERT3U(span->vs_size, <=, remain);
+
+ span->vs_region_addr = map_at;
+ avl_add(&region->vr_spans, span);
+ map_at += span->vs_size;
+ remain -= span->vs_size;
+ }
+
+ if (!transient) {
+ vmmr_free_sz -= sz;
+ vmmr_alloc_sz += sz;
+ } else {
+ vmmr_free_transient_sz -= sz;
+ vmmr_alloc_transient_sz += sz;
+ region->vr_transient = true;
+ }
+ list_insert_tail(&vmmr_alloc_regions, region);
+ mutex_exit(&vmmr_lock);
+
+ *resp = region;
+ return (0);
+}
+
+void *
+vmmr_region_mem_at(vmmr_region_t *region, uintptr_t off)
+{
+ /* just use KPM region for now */
+ return (hat_kpm_pfn2va(vmmr_region_pfn_at(region, off)));
+}
+
+pfn_t
+vmmr_region_pfn_at(vmmr_region_t *region, uintptr_t off)
+{
+ VERIFY3U(off & PAGEOFFSET, ==, 0);
+ VERIFY3U(off, <, region->vr_size);
+
+ vmmr_span_t search = {
+ .vs_region_addr = off
+ };
+ avl_index_t where;
+ vmmr_span_t *span = avl_find(&region->vr_spans, &search, &where);
+
+ if (span == NULL) {
+ span = avl_nearest(&region->vr_spans, where, AVL_BEFORE);
+ ASSERT3P(span, !=, NULL);
+ }
+ uintptr_t span_off = off - span->vs_region_addr + span->vs_addr;
+ page_t *pp = page_find(&kvps[KV_VVP], (u_offset_t)span_off);
+ VERIFY(pp != NULL);
+ return (pp->p_pagenum);
+}
+
+void
+vmmr_free(vmmr_region_t *region)
+{
+ mutex_enter(&vmmr_lock);
+ if (!region->vr_transient) {
+ VERIFY3U(region->vr_size, <=, vmmr_alloc_sz);
+ } else {
+ VERIFY3U(region->vr_size, <=, vmmr_alloc_transient_sz);
+ }
+ list_remove(&vmmr_alloc_regions, region);
+ mutex_exit(&vmmr_lock);
+
+ /* Zero the contents */
+ for (uintptr_t off = 0; off < region->vr_size; off += PAGESIZE) {
+ bzero(vmmr_region_mem_at(region, off), PAGESIZE);
+ }
+
+ mutex_enter(&vmmr_lock);
+
+ /* Put the contained span(s) back in the free pool */
+ void *cookie = NULL;
+ vmmr_span_t *span;
+ while ((span = avl_destroy_nodes(&region->vr_spans, &cookie)) != NULL) {
+ span->vs_region_addr = 0;
+ vmmr_tp_insert_concat(span, &vmmr_free_tp);
+ }
+ avl_destroy(&region->vr_spans);
+ if (!region->vr_transient) {
+ vmmr_free_sz += region->vr_size;
+ vmmr_alloc_sz -= region->vr_size;
+ } else {
+ vmmr_free_transient_sz += region->vr_size;
+ vmmr_alloc_transient_sz -= region->vr_size;
+ }
+ mutex_exit(&vmmr_lock);
+
+ if (region->vr_transient) {
+ vmmr_remove(region->vr_size, true);
+ }
+ kmem_free(region, sizeof (*region));
+}
+
+static void
+vmmr_destroy_pages(vmmr_span_t *span)
+{
+ const uintptr_t end = span->vs_addr + span->vs_size;
+ struct vnode *vp = &kvps[KV_VVP];
+ for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
+ page_t *pp;
+
+ /* Page-free logic cribbed from segkmem_xfree(): */
+ pp = page_find(vp, (u_offset_t)pos);
+ VERIFY(pp != NULL);
+ if (!page_tryupgrade(pp)) {
+ /*
+ * Some other thread has a sharelock. Wait for
+ * it to drop the lock so we can free this page.
+ */
+ page_unlock(pp);
+ pp = page_lookup(vp, (u_offset_t)pos, SE_EXCL);
+ }
+
+ /*
+ * Clear p_lckcnt so page_destroy() doesn't update availrmem.
+ * That will be taken care of later via page_unresv().
+ */
+ pp->p_lckcnt = 0;
+ page_destroy(pp, 0);
+ }
+}
+
+static int
+vmmr_alloc_pages(const vmmr_span_t *span)
+{
+ struct seg kseg = {
+ .s_as = &kas
+ };
+ struct vnode *vp = &kvps[KV_VVP];
+
+ const uintptr_t end = span->vs_addr + span->vs_size;
+ for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
+ page_t *pp;
+
+ pp = page_create_va(vp, (u_offset_t)pos, PAGESIZE,
+ PG_EXCL | PG_NORELOC, &kseg, (void *)(vmmr_va + pos));
+
+ if (pp == NULL) {
+ /* Destroy any already-created pages */
+ if (pos != span->vs_addr) {
+ vmmr_span_t destroy_span = {
+ .vs_addr = span->vs_addr,
+ .vs_size = pos - span->vs_addr,
+ };
+
+ vmmr_destroy_pages(&destroy_span);
+ }
+ return (ENOMEM);
+ }
+
+ /* mimic page state from segkmem */
+ ASSERT(PAGE_EXCL(pp));
+ page_io_unlock(pp);
+ pp->p_lckcnt = 1;
+ page_downgrade(pp);
+
+ /* pre-zero the page */
+ bzero(hat_kpm_pfn2va(pp->p_pagenum), PAGESIZE);
+ }
+
+ return (0);
+}
+
+static int
+vmmr_resv_wait()
+{
+ if (delay_sig(hz >> 2) != 0) {
+ /* bail due to interruption */
+ return (0);
+ }
+ return (1);
+}
+
+static void
+vmmr_remove_raw(size_t sz)
+{
+ VERIFY3U(sz & PAGEOFFSET, ==, 0);
+ VERIFY(MUTEX_HELD(&vmmr_lock));
+
+ size_t remain = sz;
+ while (remain > 0) {
+ vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);
+
+ /*
+ * The caller must ensure that at least `sz` amount is present
+ * in the free treepair.
+ */
+ VERIFY3P(span, !=, NULL);
+ ASSERT3U(span->vs_size, <=, remain);
+
+ /* TODO: perhaps arrange to destroy pages outside the lock? */
+ vmmr_destroy_pages(span);
+
+ remain -= span->vs_size;
+ vmmr_tp_insert_concat(span, &vmmr_empty_tp);
+ }
+
+ vmmr_empty_sz += sz;
+}
+
+int
+vmmr_add(size_t sz, bool transient)
+{
+ VERIFY3U(sz & PAGEOFFSET, ==, 0);
+
+ mutex_enter(&vmmr_lock);
+ /*
+ * Make sure that the amount added is not going to breach the limits
+ * we've chosen
+ */
+ const size_t current_total =
+ vmmr_alloc_sz + vmmr_free_sz + vmmr_adding_sz +
+ vmmr_alloc_transient_sz + vmmr_free_transient_sz;
+ if ((current_total + sz) < current_total) {
+ mutex_exit(&vmmr_lock);
+ return (EOVERFLOW);
+ }
+ if ((current_total + sz) > vmmr_total_limit) {
+ mutex_exit(&vmmr_lock);
+ return (ENOSPC);
+ }
+ vmmr_adding_sz += sz;
+ mutex_exit(&vmmr_lock);
+
+ /* Wait for enough pages to become available */
+ if (page_xresv(sz >> PAGESHIFT, KM_SLEEP, vmmr_resv_wait) == 0) {
+ mutex_enter(&vmmr_lock);
+ vmmr_adding_sz -= sz;
+ mutex_exit(&vmmr_lock);
+
+ return (EINTR);
+ }
+
+ mutex_enter(&vmmr_lock);
+ size_t added = 0;
+ size_t remain = sz;
+ while (added < sz) {
+ vmmr_span_t *span = NULL;
+
+ if (vmmr_empty_sz > 0) {
+ span = vmmr_tp_remove_split(remain, &vmmr_empty_tp);
+
+ vmmr_empty_sz -= span->vs_size;
+ } else {
+ /*
+ * No empty space to fill with new pages, so just tack
+ * it on at the end instead.
+ */
+ span = kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
+ span->vs_addr = vmmr_empty_last;
+ span->vs_size = remain;
+ vmmr_empty_last += remain;
+ }
+ VERIFY3P(span, !=, NULL);
+
+
+ /* Allocate the actual pages to back this span */
+ mutex_exit(&vmmr_lock);
+ int err = vmmr_alloc_pages(span);
+ mutex_enter(&vmmr_lock);
+
+ /*
+ * If an error is encountered during page allocation for the
+ * span, unwind any progress made by the addition request.
+ */
+ if (err != 0) {
+ /*
+ * Without pages allocated to this span, it is now
+ * tracked as empty.
+ */
+ vmmr_empty_sz += span->vs_size;
+ vmmr_tp_insert_concat(span, &vmmr_empty_tp);
+
+ if (added != 0) {
+ vmmr_remove_raw(added);
+ }
+
+ vmmr_adding_sz -= sz;
+ mutex_exit(&vmmr_lock);
+
+ page_unresv(sz >> PAGESHIFT);
+ return (err);
+ }
+
+ /*
+ * The allocated-page-bearing span is placed in the "free"
+ * treepair now, but is not officially exposed for consumption
+ * until `vmmr_free_sz` or `vmmr_free_transient_sz` are updated.
+ *
+ * This allows us to unwind the allocation in case of a failure
+ * without the risk of the freshly added span(s) being snapped
+ * up by a consumer already.
+ */
+ added += span->vs_size;
+ remain -= span->vs_size;
+ vmmr_tp_insert_concat(span, &vmmr_free_tp);
+ }
+
+ /* Make the added memory usable by exposing it to the size accounting */
+ if (!transient) {
+ vmmr_free_sz += added;
+ } else {
+ vmmr_free_transient_sz += added;
+ }
+ ASSERT3U(added, ==, sz);
+ vmmr_adding_sz -= added;
+
+ mutex_exit(&vmmr_lock);
+ return (0);
+}
+
+int
+vmmr_remove(size_t sz, bool transient)
+{
+ VERIFY3U(sz & PAGEOFFSET, ==, 0);
+
+ mutex_enter(&vmmr_lock);
+ if ((!transient && sz > vmmr_free_sz) ||
+ (transient && sz > vmmr_free_transient_sz)) {
+ mutex_exit(&vmmr_lock);
+ return (ENOSPC);
+ }
+
+ vmmr_remove_raw(sz);
+
+ if (!transient) {
+ vmmr_free_sz -= sz;
+ } else {
+ vmmr_free_transient_sz -= sz;
+ }
+ mutex_exit(&vmmr_lock);
+ page_unresv(sz >> PAGESHIFT);
+ return (0);
+}
+
+int
+vmmr_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
+{
+ switch (cmd) {
+ case VMM_RESV_QUERY: {
+ struct vmm_resv_query res;
+ void *datap = (void *)(uintptr_t)arg;
+
+ /* For now, anyone in GZ can query */
+ if (crgetzoneid(cr) != GLOBAL_ZONEID) {
+ return (EPERM);
+ }
+ mutex_enter(&vmmr_lock);
+ res.vrq_free_sz = vmmr_free_sz;
+ res.vrq_alloc_sz = vmmr_alloc_sz;
+ res.vrq_alloc_transient_sz = vmmr_alloc_transient_sz;
+ res.vrq_limit = vmmr_total_limit;
+ mutex_exit(&vmmr_lock);
+ if (ddi_copyout(&res, datap, sizeof (res), md) != 0) {
+ return (EFAULT);
+ }
+ break;
+ }
+ case VMM_RESV_ADD: {
+ if (secpolicy_sys_config(cr, B_FALSE) != 0) {
+ return (EPERM);
+ }
+ return (vmmr_add((size_t)arg, false));
+ }
+ case VMM_RESV_REMOVE: {
+ if (secpolicy_sys_config(cr, B_FALSE) != 0) {
+ return (EPERM);
+ }
+ return (vmmr_remove((size_t)arg, false));
+ }
+ default:
+ return (ENOTTY);
+ }
+ return (0);
+}
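To show how the reservoir interfaces above fit together, here is a hedged in-kernel sketch, not part of this change, that allocates a 1 MiB non-transient region, walks its backing page frames, and returns it to the free pool.

/* Kernel-context sketch; headers and error paths abbreviated. */
static int
example_region_use(void)
{
	const size_t sz = 1024 * 1024;	/* request must be page-aligned */
	vmmr_region_t *region;
	int err;

	/* Non-transient: fails with ENOSPC unless the reservoir has free space. */
	err = vmmr_alloc(sz, false, &region);
	if (err != 0)
		return (err);

	for (uintptr_t off = 0; off < sz; off += PAGESIZE) {
		pfn_t pfn = vmmr_region_pfn_at(region, off);

		/* ... hand pfn to the HAT or device mappings as needed ... */
	}

	/* Contents are re-zeroed and the spans rejoin the free pool. */
	vmmr_free(region);
	return (0);
}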
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
index d5f4b3883b..ef366ddaff 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
@@ -45,6 +45,7 @@
#include <sys/vmm_impl.h>
#include <sys/vmm_drv.h>
#include <sys/vmm_vm.h>
+#include <sys/vmm_reservoir.h>
#include <vm/seg_dev.h>
@@ -1506,13 +1507,22 @@ vmm_hma_release(void)
}
static int
-vmmdev_do_vm_create(char *name, cred_t *cr)
+vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr)
{
vmm_softc_t *sc = NULL;
minor_t minor;
int error = ENOMEM;
+ size_t len;
+ const char *name = req->name;
- if (strnlen(name, VM_MAX_NAMELEN) >= VM_MAX_NAMELEN) {
+ len = strnlen(name, VM_MAX_NAMELEN);
+ if (len == 0) {
+ return (EINVAL);
+ }
+ if (len >= VM_MAX_NAMELEN) {
+ return (ENAMETOOLONG);
+ }
+ if (strchr(name, '/') != NULL) {
return (EINVAL);
}
@@ -1555,7 +1565,7 @@ vmmdev_do_vm_create(char *name, cred_t *cr)
goto fail;
}
- error = vm_create(name, &sc->vmm_vm);
+ error = vm_create(req->name, req->flags, &sc->vmm_vm);
if (error == 0) {
/* Complete VM intialization and report success. */
(void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
@@ -1938,7 +1948,7 @@ vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd)
/* ARGSUSED */
static int
-vmmdev_do_vm_destroy(const char *name, cred_t *cr)
+vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr)
{
boolean_t hma_release = B_FALSE;
vmm_softc_t *sc;
@@ -1949,7 +1959,7 @@ vmmdev_do_vm_destroy(const char *name, cred_t *cr)
mutex_enter(&vmm_mtx);
- if ((sc = vmm_lookup(name)) == NULL) {
+ if ((sc = vmm_lookup(req->name)) == NULL) {
mutex_exit(&vmm_mtx);
return (ENOENT);
}
@@ -2193,6 +2203,47 @@ vmm_is_supported(intptr_t arg)
}
static int
+vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
+{
+ void *argp = (void *)arg;
+
+ switch (cmd) {
+ case VMM_CREATE_VM: {
+ struct vm_create_req req;
+
+ if ((md & FWRITE) == 0) {
+ return (EPERM);
+ }
+ if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
+ return (EFAULT);
+ }
+ return (vmmdev_do_vm_create(&req, cr));
+ }
+ case VMM_DESTROY_VM: {
+ struct vm_destroy_req req;
+
+ if ((md & FWRITE) == 0) {
+ return (EPERM);
+ }
+ if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
+ return (EFAULT);
+ }
+ return (vmmdev_do_vm_destroy(&req, cr));
+ }
+ case VMM_VM_SUPPORTED:
+ return (vmm_is_supported(arg));
+ case VMM_RESV_QUERY:
+ case VMM_RESV_ADD:
+ case VMM_RESV_REMOVE:
+ return (vmmr_ioctl(cmd, arg, md, cr, rvalp));
+ default:
+ break;
+ }
+ /* No other actions are legal on ctl device */
+ return (ENOTTY);
+}
+
+static int
vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
int *rvalp)
{
@@ -2207,36 +2258,7 @@ vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
minor = getminor(dev);
if (minor == VMM_CTL_MINOR) {
- void *argp = (void *)arg;
- char name[VM_MAX_NAMELEN] = { 0 };
- size_t len = 0;
-
- if ((mode & FKIOCTL) != 0) {
- len = strlcpy(name, argp, sizeof (name));
- } else {
- if (copyinstr(argp, name, sizeof (name), &len) != 0) {
- return (EFAULT);
- }
- }
- if (len >= VM_MAX_NAMELEN) {
- return (ENAMETOOLONG);
- }
-
- switch (cmd) {
- case VMM_CREATE_VM:
- if ((mode & FWRITE) == 0)
- return (EPERM);
- return (vmmdev_do_vm_create(name, credp));
- case VMM_DESTROY_VM:
- if ((mode & FWRITE) == 0)
- return (EPERM);
- return (vmmdev_do_vm_destroy(name, credp));
- case VMM_VM_SUPPORTED:
- return (vmm_is_supported(arg));
- default:
- /* No other actions are legal on ctl device */
- return (ENOTTY);
- }
+ return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp));
}
sc = ddi_get_soft_state(vmm_statep, minor);
@@ -2422,7 +2444,6 @@ vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
}
vmm_sol_glue_init();
- vmm_arena_init();
/*
* Perform temporary HMA registration to determine if the system
@@ -2462,7 +2483,6 @@ fail:
if (reg != NULL) {
hma_unregister(reg);
}
- vmm_arena_fini();
vmm_sol_glue_cleanup();
mutex_exit(&vmmdev_mtx);
return (DDI_FAILURE);
@@ -2494,6 +2514,11 @@ vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
}
mutex_exit(&vmm_mtx);
+ if (!vmmr_is_empty()) {
+ mutex_exit(&vmmdev_mtx);
+ return (DDI_FAILURE);
+ }
+
VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
mutex_exit(&vmmdev_mtx);
@@ -2507,7 +2532,6 @@ vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
VERIFY0(vmm_mod_unload());
VERIFY3U(vmmdev_hma_reg, ==, NULL);
- vmm_arena_fini();
vmm_sol_glue_cleanup();
mutex_exit(&vmmdev_mtx);
@@ -2579,11 +2603,13 @@ _init(void)
}
vmm_zsd_init();
+ vmmr_init();
error = mod_install(&modlinkage);
if (error) {
ddi_soft_state_fini(&vmm_statep);
vmm_zsd_fini();
+ vmmr_fini();
}
return (error);
@@ -2600,6 +2626,7 @@ _fini(void)
}
vmm_zsd_fini();
+ vmmr_fini();
ddi_soft_state_fini(&vmm_statep);
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c
index 720af54200..bd1f1890d4 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c
@@ -32,11 +32,12 @@
#include <vm/hat_i86.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
-#include <vm/seg_vmm.h>
#include <machine/vm.h>
#include <sys/vmm_gpt.h>
#include <sys/vmm_vm.h>
+#include <sys/seg_vmm.h>
+#include <sys/vmm_reservoir.h>
#define PMAP_TO_VMMAP(pm) ((vm_map_t) \
((caddr_t)(pm) - offsetof(struct vmspace, vms_pmap)))
@@ -65,38 +66,6 @@ static vmspace_mapping_t *vm_mapping_find(struct vmspace *, uintptr_t, size_t,
boolean_t);
static void vm_mapping_remove(struct vmspace *, vmspace_mapping_t *);
-static vmem_t *vmm_alloc_arena = NULL;
-
-static void *
-vmm_arena_alloc(vmem_t *vmp, size_t size, int vmflag)
-{
- return (segkmem_xalloc(vmp, NULL, size, vmflag, 0,
- segkmem_page_create, &kvps[KV_VVP]));
-}
-
-static void
-vmm_arena_free(vmem_t *vmp, void *inaddr, size_t size)
-{
- segkmem_xfree(vmp, inaddr, size, &kvps[KV_VVP], NULL);
-}
-
-void
-vmm_arena_init(void)
-{
- vmm_alloc_arena = vmem_create("vmm_alloc_arena", NULL, 0, 1024 * 1024,
- vmm_arena_alloc, vmm_arena_free, kvmm_arena, 0, VM_SLEEP);
-
- ASSERT(vmm_alloc_arena != NULL);
-}
-
-void
-vmm_arena_fini(void)
-{
- VERIFY(vmem_size(vmm_alloc_arena, VMEM_ALLOC) == 0);
- vmem_destroy(vmm_alloc_arena);
- vmm_alloc_arena = NULL;
-}
-
struct vmspace *
vmspace_alloc(vm_offset_t start, vm_offset_t end, pmap_pinit_t pinit)
{
@@ -164,8 +133,9 @@ vmspace_find_kva(struct vmspace *vms, uintptr_t addr, size_t size)
switch (vmo->vmo_type) {
case OBJT_DEFAULT:
- result = (void *)((uintptr_t)vmo->vmo_data +
- VMSM_OFFSET(vmsm, addr));
+ result = vmmr_region_mem_at(
+ (vmmr_region_t *)vmo->vmo_data,
+ VMSM_OFFSET(vmsm, addr) & PAGEMASK);
break;
default:
break;
@@ -344,39 +314,23 @@ vm_object_pager_none(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl)
}
static pfn_t
-vm_object_pager_heap(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl)
+vm_object_pager_reservoir(vm_object_t vmo, uintptr_t off, pfn_t *lpfn,
+ uint_t *lvl)
{
- const uintptr_t kaddr = ALIGN2PAGE((uintptr_t)vmo->vmo_data + off);
- uint_t idx, level;
- htable_t *ht;
- x86pte_t pte;
- pfn_t top_pfn, pfn;
+ vmmr_region_t *region;
+ pfn_t pfn;
ASSERT(vmo->vmo_type == OBJT_DEFAULT);
- ASSERT(off < vmo->vmo_size);
- ht = htable_getpage(kas.a_hat, kaddr, &idx);
- if (ht == NULL) {
- return (PFN_INVALID);
- }
- pte = x86pte_get(ht, idx);
- if (!PTE_ISPAGE(pte, ht->ht_level)) {
- htable_release(ht);
- return (PFN_INVALID);
- }
-
- pfn = top_pfn = PTE2PFN(pte, ht->ht_level);
- level = ht->ht_level;
- if (ht->ht_level > 0) {
- pfn += mmu_btop(kaddr & LEVEL_OFFSET((uint_t)ht->ht_level));
- }
- htable_release(ht);
+ region = vmo->vmo_data;
+ pfn = vmmr_region_pfn_at(region, off & PAGEMASK);
+ /* TODO: handle large pages */
if (lpfn != NULL) {
- *lpfn = top_pfn;
+ *lpfn = pfn;
}
if (lvl != NULL) {
- *lvl = level;
+ *lvl = 0;
}
return (pfn);
}
@@ -419,41 +373,8 @@ vm_object_pager_sg(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl)
return (pfn);
}
-static void
-vm_reserve_pages(size_t npages)
-{
- uint_t retries = 60;
- int rc;
-
- mutex_enter(&freemem_lock);
- if (availrmem < npages) {
- mutex_exit(&freemem_lock);
-
- /*
- * Set needfree and wait for the ZFS ARC reap thread to free up
- * some memory.
- */
- page_needfree(npages);
-
- mutex_enter(&freemem_lock);
- while ((availrmem < npages) && retries-- > 0) {
- mutex_exit(&freemem_lock);
- rc = delay_sig(drv_usectohz(1 * MICROSEC));
- mutex_enter(&freemem_lock);
-
- if (rc == EINTR)
- break;
- }
- mutex_exit(&freemem_lock);
-
- page_needfree(-npages);
- } else {
- mutex_exit(&freemem_lock);
- }
-}
-
vm_object_t
-vm_object_allocate(objtype_t type, vm_pindex_t psize)
+vm_object_allocate(objtype_t type, vm_pindex_t psize, bool transient)
{
vm_object_t vmo;
const size_t size = ptob((size_t)psize);
@@ -468,17 +389,19 @@ vm_object_allocate(objtype_t type, vm_pindex_t psize)
switch (type) {
case OBJT_DEFAULT: {
- vm_reserve_pages(psize);
- /* XXXJOY: opt-in to larger pages? */
- vmo->vmo_data = vmem_alloc(vmm_alloc_arena, size, KM_NOSLEEP);
- if (vmo->vmo_data == NULL) {
+ /* TODO: opt-in to larger pages? */
+ int err;
+ vmmr_region_t *region = NULL;
+
+ err = vmmr_alloc(size, transient, &region);
+ if (err != 0) {
mutex_destroy(&vmo->vmo_lock);
kmem_free(vmo, sizeof (*vmo));
return (NULL);
}
- bzero(vmo->vmo_data, size);
- vmo->vmo_pager = vm_object_pager_heap;
+ vmo->vmo_data = region;
+ vmo->vmo_pager = vm_object_pager_reservoir;
}
break;
case OBJT_SG:
@@ -505,7 +428,7 @@ vm_pager_allocate(objtype_t type, void *handle, vm_ooffset_t size,
VERIFY(type == OBJT_SG);
VERIFY(off == 0);
- vmo = vm_object_allocate(type, size);
+ vmo = vm_object_allocate(type, size, false);
vmo->vmo_data = sg;
mutex_enter(&sg->sg_lock);
@@ -529,7 +452,7 @@ vm_object_deallocate(vm_object_t vmo)
switch (vmo->vmo_type) {
case OBJT_DEFAULT:
- vmem_free(vmm_alloc_arena, vmo->vmo_data, vmo->vmo_size);
+ vmmr_free((vmmr_region_t *)vmo->vmo_data);
break;
case OBJT_SG:
sglist_free((struct sglist *)vmo->vmo_data);
@@ -574,6 +497,17 @@ vm_object_reference(vm_object_t vmo)
VERIFY3U(ref, !=, 0);
}
+pfn_t
+vm_object_pfn(vm_object_t vmo, uintptr_t off)
+{
+ /* This is expected to be used only on reservoir-backed memory */
+ if (vmo->vmo_type != OBJT_DEFAULT) {
+ return (PFN_INVALID);
+ }
+
+ return (vmo->vmo_pager(vmo, off, NULL, NULL));
+}
+
static vmspace_mapping_t *
vm_mapping_find(struct vmspace *vms, uintptr_t addr, size_t size,
boolean_t no_lock)
@@ -912,11 +846,9 @@ vm_segmap_obj(vm_object_t vmo, off_t map_off, size_t size, struct as *as,
if (err == 0) {
segvmm_crargs_t svma;
- svma.kaddr = (caddr_t)vmo->vmo_data + map_off;
+ svma.obj = vmo;
+ svma.offset = map_off;
svma.prot = prot;
- svma.cookie = vmo;
- svma.hold = (segvmm_holdfn_t)vm_object_reference;
- svma.rele = (segvmm_relefn_t)vm_object_deallocate;
err = as_map(as, *addrp, size, segvmm_create, &svma);
}
@@ -969,11 +901,9 @@ vm_segmap_space(struct vmspace *vms, off_t off, struct as *as, caddr_t *addrp,
VERIFY(mapoff < vmo->vmo_size);
VERIFY((mapoff + size) <= vmo->vmo_size);
- svma.kaddr = (void *)((uintptr_t)vmo->vmo_data + mapoff);
+ svma.obj = vmo;
+ svma.offset = mapoff;
svma.prot = prot;
- svma.cookie = vmo;
- svma.hold = (segvmm_holdfn_t)vm_object_reference;
- svma.rele = (segvmm_relefn_t)vm_object_deallocate;
err = as_map(as, *addrp, len, segvmm_create, &svma);
}
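The OBJT_DEFAULT case now draws its backing pages from the reservoir via vmmr_alloc() instead of the removed vmm_alloc_arena, and the new vm_object_pfn() helper exposes the backing PFN of reservoir-backed objects. A kernel-side sketch of the resulting allocation flow is below; it is illustrative only, and the reading of the new boolean parameter (false drawing from capacity already added to the reservoir, true requesting an on-demand transient allocation) is inferred from the VCF_RESERVOIR_MEM comment in the vmm.h hunk that follows.

/*
 * Illustrative kernel-side sketch (not part of this patch): allocate a
 * reservoir-backed object of 'len' bytes and confirm it has backing pages.
 */
#include <sys/vmm_vm.h>	/* vm_object_allocate(), vm_object_pfn() */

static int
example_reservoir_object(size_t len, vm_object_t *vmop)
{
	vm_object_t vmo;

	/* false => existing reservoir capacity; true => transient alloc */
	vmo = vm_object_allocate(OBJT_DEFAULT, btop(len), false);
	if (vmo == NULL) {
		/* vmmr_alloc() could not satisfy the request */
		return (ENOMEM);
	}

	/* Reservoir-backed objects can report the PFN behind an offset */
	VERIFY(vm_object_pfn(vmo, 0) != PFN_INVALID);

	*vmop = vmo;
	return (0);
}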
diff --git a/usr/src/uts/i86pc/sys/vmm.h b/usr/src/uts/i86pc/sys/vmm.h
index 5b3e7f9b10..e58d63761e 100644
--- a/usr/src/uts/i86pc/sys/vmm.h
+++ b/usr/src/uts/i86pc/sys/vmm.h
@@ -39,7 +39,7 @@
*
* Copyright 2015 Pluribus Networks Inc.
* Copyright 2019 Joyent, Inc.
- * Copyright 2020 Oxide Computer Company
+ * Copyright 2021 Oxide Computer Company
*/
#ifndef _VMM_H_
@@ -124,20 +124,12 @@ enum x2apic_state {
/*
* illumos doesn't have a limitation based on SPECNAMELEN like FreeBSD does.
- * Instead of picking an arbitrary value we will just rely on the same
- * calculation that's made below. If this calculation ever changes we need to
- * update the the VM_MAX_NAMELEN mapping in the bhyve brand's boot.c file.
+ * To simplify structure definitions, an arbitrary limit has been chosen.
+ * This same limit is used for memory segment names.
*/
-#define VM_MAX_PREFIXLEN 10
-#define VM_MAX_SUFFIXLEN 15
-#define VM_MIN_NAMELEN 6
-#define VM_MAX_NAMELEN \
- (SPECNAMELEN - VM_MAX_PREFIXLEN - VM_MAX_SUFFIXLEN - 1)
-
-#ifdef _KERNEL
-CTASSERT(VM_MAX_NAMELEN >= VM_MIN_NAMELEN);
-#endif
+#define VM_MAX_NAMELEN 128
+#define VM_MAX_SEG_NAMELEN 128
#define VM_MAXCPU 32 /* maximum virtual cpus */
@@ -389,4 +381,12 @@ struct vm_entry {
int vm_restart_instruction(void *vm, int vcpuid);
+enum vm_create_flags {
+ /*
+ * Allocate guest memory segments from existing reservoir capacity,
+ * rather than attempting to create transient allocations.
+ */
+ VCF_RESERVOIR_MEM = (1 << 0),
+};
+
#endif /* _VMM_H_ */
diff --git a/usr/src/uts/i86pc/sys/vmm_dev.h b/usr/src/uts/i86pc/sys/vmm_dev.h
index 15c64355c4..f371ad1266 100644
--- a/usr/src/uts/i86pc/sys/vmm_dev.h
+++ b/usr/src/uts/i86pc/sys/vmm_dev.h
@@ -39,6 +39,7 @@
*
* Copyright 2015 Pluribus Networks Inc.
* Copyright 2019 Joyent, Inc.
+ * Copyright 2021 Oxide Computer Company
*/
#ifndef _VMM_DEV_H_
@@ -46,6 +47,19 @@
#include <machine/vmm.h>
+#include <sys/param.h>
+#include <sys/cpuset.h>
+
+struct vm_create_req {
+ char name[VM_MAX_NAMELEN];
+ uint64_t flags;
+};
+
+
+struct vm_destroy_req {
+ char name[VM_MAX_NAMELEN];
+};
+
struct vm_memmap {
vm_paddr_t gpa;
int segid; /* memory segment */
@@ -66,7 +80,7 @@ struct vm_munmap {
struct vm_memseg {
int segid;
size_t len;
- char name[SPECNAMELEN + 1];
+ char name[VM_MAX_SEG_NAMELEN];
};
struct vm_register {
@@ -282,6 +296,13 @@ struct vm_run_state {
uint8_t _pad[3];
};
+struct vmm_resv_query {
+ size_t vrq_free_sz;
+ size_t vrq_alloc_sz;
+ size_t vrq_alloc_transient_sz;
+ size_t vrq_limit;
+};
+
#define VMMCTL_IOC_BASE (('V' << 16) | ('M' << 8))
#define VMM_IOC_BASE (('v' << 16) | ('m' << 8))
#define VMM_LOCK_IOC_BASE (('v' << 16) | ('l' << 8))
@@ -292,6 +313,10 @@ struct vm_run_state {
#define VMM_DESTROY_VM (VMMCTL_IOC_BASE | 0x02)
#define VMM_VM_SUPPORTED (VMMCTL_IOC_BASE | 0x03)
+#define VMM_RESV_QUERY (VMMCTL_IOC_BASE | 0x10)
+#define VMM_RESV_ADD (VMMCTL_IOC_BASE | 0x11)
+#define VMM_RESV_REMOVE (VMMCTL_IOC_BASE | 0x12)
+
/* Operations performed in the context of a given vCPU */
#define VM_RUN (VMM_CPU_IOC_BASE | 0x01)
#define VM_SET_REGISTER (VMM_CPU_IOC_BASE | 0x02)
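The three new VMMCTL_IOC_BASE ioctls expose the reservoir itself on the ctl device. A userspace sketch of querying the current reservoir sizing follows; it assumes (based on the layout of struct vmm_resv_query, since vmm_reservoir.c is not shown in this excerpt) that VMM_RESV_QUERY fills a caller-supplied structure, and it again assumes the "/dev/vmmctl" path.

/*
 * Illustrative sketch only: query reservoir sizing through the ctl device.
 * Assumes VMM_RESV_QUERY fills a struct vmm_resv_query passed by pointer
 * and that the ctl device lives at /dev/vmmctl.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/vmm_dev.h>

static int
example_resv_query(void)
{
	struct vmm_resv_query q = { 0 };
	int ctlfd;

	ctlfd = open("/dev/vmmctl", O_RDWR);
	if (ctlfd < 0) {
		return (-1);
	}
	if (ioctl(ctlfd, VMM_RESV_QUERY, &q) != 0) {
		(void) close(ctlfd);
		return (-1);
	}
	(void) close(ctlfd);

	(void) printf("free: %zu\nallocated: %zu\ntransient: %zu\nlimit: %zu\n",
	    q.vrq_free_sz, q.vrq_alloc_sz, q.vrq_alloc_transient_sz,
	    q.vrq_limit);
	return (0);
}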
diff --git a/usr/src/uts/i86pc/vm/seg_vmm.h b/usr/src/uts/i86pc/vm/seg_vmm.h
deleted file mode 100644
index f5b95c6a27..0000000000
--- a/usr/src/uts/i86pc/vm/seg_vmm.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- */
-
-/*
- * Copyright 2018 Joyent, Inc.
- */
-
-#ifndef _VM_SEG_VMM_H
-#define _VM_SEG_VMM_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct segvmm_crargs {
- caddr_t kaddr;
- uchar_t prot; /* protection */
- void *cookie; /* opaque resource backing memory */
- void (*hold)(void *); /* add reference to cookie */
- void (*rele)(void *); /* release reference to cookie */
-} segvmm_crargs_t;
-
-typedef void (*segvmm_holdfn_t)(void *);
-typedef void (*segvmm_relefn_t)(void *);
-
-typedef struct segvmm_data {
- krwlock_t svmd_lock;
- uintptr_t svmd_kaddr;
- uchar_t svmd_prot;
- void *svmd_cookie;
- segvmm_holdfn_t svmd_hold;
- segvmm_relefn_t svmd_rele;
- size_t svmd_softlockcnt;
-} segvmm_data_t;
-
-extern int segvmm_create(struct seg **, void *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _VM_SEG_VMM_H */
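The deleted header built its create-args around a kernel virtual address plus hold/rele callbacks on an opaque cookie. Judging from the vm_segmap_obj() and vm_segmap_space() hunks above, the replacement header (relocated under the vmm module's own sys/ directory) instead keys the segment off the backing vm_object and an offset into it. The sketch below is a hypothetical reconstruction of that shape, not a quote of the new header; field order and types are assumptions inferred from how the create-args are filled in.

/*
 * Hypothetical reconstruction only: inferred from svma.obj, svma.offset,
 * and svma.prot assignments in the vm_segmap_obj()/vm_segmap_space() hunks.
 */
typedef struct segvmm_crargs {
	vm_object_t	obj;	/* backing object, replaces kaddr/cookie */
	uintptr_t	offset;	/* offset into the object being mapped */
	uchar_t		prot;	/* protection, as before */
} segvmm_crargs_t;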