author		Patrick Mooney <pmooney@pfmooney.com>	2021-05-28 21:07:11 +0000
committer	Patrick Mooney <pmooney@oxide.computer>	2021-07-27 19:26:22 +0000
commit		b57f5d3e6a2df8d435e606797cf3934811848343
tree		31d0b366057848a88837b15524905a703c3bdf9c
parent		ed1e93792d7c9ea04a0cb44cffe34c24c135b002
13833 want bhyve memory reservoir
13822 bhyve memory should exert memory pressure
13834 want extensible page_resv
13821 vmmctl ioctls should have more structure
Reviewed by: Andy Fiddaman <andy@omnios.org>
Reviewed by: Jason King <jason.brian.king@gmail.com>
Reviewed by: Dan Cross <cross@oxidecomputer.com>
Reviewed by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
Reviewed by: Mike Zeller <mike.zeller@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>
24 files changed, 1441 insertions(+), 360 deletions(-)
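For orientation before the diff: the reservoir introduced here is managed entirely through new ioctls on the vmmctl control device. Below is a minimal consumer sketch, illustrative only: it mirrors what the rsrvrctl utility added by this change does, and assumes the VMM_RESV_* ioctls and struct vmm_resv_query that this change introduces in sys/vmm_dev.h.

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/vmm_dev.h>

int
main(void)
{
	/* Open the vmmctl device, as rsrvrctl does */
	int fd = open(VMM_CTL_DEV, O_EXCL | O_RDWR);
	if (fd < 0) {
		perror("Could not open vmmctl");
		return (EXIT_FAILURE);
	}

	/* Grow the reservoir by 512 MiB; sizes are byte counts */
	if (ioctl(fd, VMM_RESV_ADD, (size_t)512 * 1024 * 1024) != 0) {
		perror("Could not add to reservoir");
	}

	/* Query the current accounting (reported in bytes) */
	struct vmm_resv_query q;
	if (ioctl(fd, VMM_RESV_QUERY, &q) == 0) {
		(void) printf("free=%zu alloc=%zu transient=%zu limit=%zu\n",
		    q.vrq_free_sz, q.vrq_alloc_sz,
		    q.vrq_alloc_transient_sz, q.vrq_limit);
	}

	(void) close(fd);
	return (EXIT_SUCCESS);
}

Sizes passed to VMM_RESV_ADD/VMM_RESV_REMOVE must be page-aligned; those two ioctls require the sys_config privilege, while VMM_RESV_QUERY is available to any global-zone caller. Guests opt in to reservoir-backed memory separately, by passing VCF_RESERVOIR_MEM to vm_create() (surfaced in bhyve as the memory.use_reservoir config option).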
diff --git a/usr/src/cmd/Makefile b/usr/src/cmd/Makefile index 22afdf737a..8159ad677b 100644 --- a/usr/src/cmd/Makefile +++ b/usr/src/cmd/Makefile @@ -497,6 +497,7 @@ i386_SUBDIRS= \ nvmeadm \ pptadm \ rdmsr \ + rsrvrctl \ rtc \ ucodeadm \ xhci \ diff --git a/usr/src/cmd/bhyve/bhyverun.c b/usr/src/cmd/bhyve/bhyverun.c index c777529858..b127c7cadc 100644 --- a/usr/src/cmd/bhyve/bhyverun.c +++ b/usr/src/cmd/bhyve/bhyverun.c @@ -39,7 +39,7 @@ * * Copyright 2015 Pluribus Networks Inc. * Copyright 2018 Joyent, Inc. - * Copyright 2020 Oxide Computer Company + * Copyright 2021 Oxide Computer Company */ #include <sys/cdefs.h> @@ -1253,8 +1253,15 @@ do_open(const char *vmname) if (lpc_bootrom()) romboot = true; - +#ifndef __FreeBSD__ + uint64_t create_flags = 0; + if (get_config_bool_default("memory.use_reservoir", false)) { + create_flags |= VCF_RESERVOIR_MEM; + } + error = vm_create(vmname, create_flags); +#else error = vm_create(vmname); +#endif /* __FreeBSD__ */ if (error) { if (errno == EEXIST) { if (romboot) { diff --git a/usr/src/cmd/bhyvectl/bhyvectl.c b/usr/src/cmd/bhyvectl/bhyvectl.c index 313a1a37f4..4fc6ddc251 100644 --- a/usr/src/cmd/bhyvectl/bhyvectl.c +++ b/usr/src/cmd/bhyvectl/bhyvectl.c @@ -39,7 +39,7 @@ * * Copyright 2015 Pluribus Networks Inc. * Copyright 2019 Joyent, Inc. - * Copyright 2020 Oxide Computer Company + * Copyright 2021 Oxide Computer Company */ #include <sys/cdefs.h> @@ -1953,8 +1953,13 @@ main(int argc, char *argv[]) error = 0; +#ifndef __FreeBSD__ + if (!error && create) + error = vm_create(vmname, 0); +# else if (!error && create) error = vm_create(vmname); +#endif /* __FreeBSD__ */ if (!error) { ctx = vm_open(vmname); diff --git a/usr/src/cmd/rsrvrctl/Makefile b/usr/src/cmd/rsrvrctl/Makefile new file mode 100644 index 0000000000..f51df92730 --- /dev/null +++ b/usr/src/cmd/rsrvrctl/Makefile @@ -0,0 +1,48 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2021 Oxide Computer Company +# + +PROG = rsrvrctl + +include ../Makefile.cmd +include ../Makefile.cmd.64 +include ../Makefile.ctf + +SRCS = rsrvrctl.c +OBJS = $(SRCS:.c=.o) + +CLEANFILES = $(PROG) +CLOBBERFILES += $(ROOTUSRSBINPROG) + +.KEEP_STATE: + +CFLAGS += $(CCVERBOSE) +CPPFLAGS = -I$(COMPAT)/bhyve -I$(CONTRIB)/bhyve \ + -I$(COMPAT)/bhyve/amd64 -I$(CONTRIB)/bhyve/amd64 \ + $(CPPFLAGS.master) \ + -I$(SRC)/uts/i86pc/io/vmm \ + -I$(SRC)/uts/i86pc + +all: $(PROG) + +$(PROG): $(OBJS) + $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS) + $(POST_PROCESS) + +install: all $(ROOTLIBPROG) + +clean: + $(RM) $(OBJS) $(CLEANFILES) + +include ../Makefile.targ diff --git a/usr/src/cmd/rsrvrctl/rsrvrctl.c b/usr/src/cmd/rsrvrctl/rsrvrctl.c new file mode 100644 index 0000000000..e189520a1c --- /dev/null +++ b/usr/src/cmd/rsrvrctl/rsrvrctl.c @@ -0,0 +1,164 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2021 Oxide Computer Company + */ + +#include <stdio.h> +#include <stdlib.h> +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/types.h> + +#include <sys/vmm_dev.h> + +static void +usage(const char *pname) +{ + fprintf(stderr, + "Usage: %s [-a add] [-r remove] [-q]\n" + "\t-a <SZ> add SZ MiB to the reservoir\n" + "\t-r <SZ> remove SZ MiB from the reservoir\n" + "\t-q query reservoir state\n", pname); +} + +static bool +parse_size(const char *arg, size_t *resp) +{ + size_t res; + + errno = 0; + res = strtoul(arg, NULL, 0); + if (errno != 0) { + return (false); + } + + *resp = (res * 1024 * 1024); + return (true); +} + +static void +do_add(int fd, size_t sz) +{ + int res; + + res = ioctl(fd, VMM_RESV_ADD, sz); + if (res != 0) { + perror("Could not add to reservoir"); + exit(EXIT_FAILURE); + } +} + +static void +do_remove(int fd, size_t sz) +{ + int res; + + res = ioctl(fd, VMM_RESV_REMOVE, sz); + if (res != 0) { + perror("Could not remove from reservoir"); + exit(EXIT_FAILURE); + } +} + +static void +do_query(int fd) +{ + struct vmm_resv_query data; + int res; + + res = ioctl(fd, VMM_RESV_QUERY, &data); + if (res != 0) { + perror("Could not query reservoir info"); + return; + } + + printf("Free KiB:\t%llu\n" + "Allocated KiB:\t%llu\n" + "Transient Allocated KiB:\t%llu\n" + "Size limit KiB:\t%llu\n", + data.vrq_free_sz / 1024, + data.vrq_alloc_sz / 1024, + data.vrq_alloc_transient_sz / 1024, + data.vrq_limit / 1024); +} + +int +main(int argc, char *argv[]) +{ + char c; + const char *opt_a = NULL, *opt_r = NULL; + bool opt_q = false; + int fd; + + const char *pname = argv[0]; + + while ((c = getopt(argc, argv, "a:r:qh")) != -1) { + switch (c) { + case 'a': + opt_a = optarg; + break; + case 'r': + opt_r = optarg; + break; + case 'q': + opt_q = true; + break; + case 'h': + usage(pname); + return (EXIT_SUCCESS); + default: + usage(pname); + return (EXIT_FAILURE); + } + } + if (optind < argc || + (opt_a == NULL && opt_r == NULL && !opt_q) || + (opt_a != NULL && opt_r != NULL)) { + usage(pname); + return (EXIT_FAILURE); + } + + fd = open(VMM_CTL_DEV, O_EXCL | O_RDWR); + if (fd < 0) { + perror("Could not open vmmctl"); + usage(pname); + return (EXIT_FAILURE); + } + + if (opt_a != NULL) { + size_t sz; + + if (!parse_size(opt_a, &sz)) { + perror("Invalid size"); + usage(pname); + return (EXIT_FAILURE); + } + + do_add(fd, sz); + } + if (opt_r != NULL) { + size_t sz; + + if (!parse_size(opt_r, &sz)) { + perror("Invalid size"); + usage(pname); + return (EXIT_FAILURE); + } + do_remove(fd, sz); + } + if (opt_q) { + do_query(fd); + } + + (void) close(fd); + return (0); +} diff --git a/usr/src/lib/libvmmapi/common/vmmapi.c b/usr/src/lib/libvmmapi/common/vmmapi.c index ba3fb7f8dd..ec27949a43 100644 --- a/usr/src/lib/libvmmapi/common/vmmapi.c +++ b/usr/src/lib/libvmmapi/common/vmmapi.c @@ -39,7 +39,7 @@ * * Copyright 2015 Pluribus Networks Inc. * Copyright 2019 Joyent, Inc. 
- * Copyright 2020 Oxide Computer Company + * Copyright 2021 Oxide Computer Company */ #include <sys/cdefs.h> @@ -109,12 +109,31 @@ struct vmctx { #ifdef __FreeBSD__ #define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x))) #define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x))) -#else -#define CREATE(x) vm_do_ctl(VMM_CREATE_VM, (x)) -#define DESTROY(x) vm_do_ctl(VMM_DESTROY_VM, (x)) +int +vm_create(const char *name) +{ + /* Try to load vmm(4) module before creating a guest. */ + if (modfind("vmm") < 0) + kldload("vmm"); + return (CREATE((char *)name)); +} + +void +vm_destroy(struct vmctx *vm) +{ + assert(vm != NULL); + + if (vm->fd >= 0) + close(vm->fd); + DESTROY(vm->name); + + free(vm); +} + +#else static int -vm_do_ctl(int cmd, const char *name) +vm_do_ctl(int cmd, void *req) { int ctl_fd; @@ -123,7 +142,7 @@ vm_do_ctl(int cmd, const char *name) return (-1); } - if (ioctl(ctl_fd, cmd, name) == -1) { + if (ioctl(ctl_fd, cmd, req) == -1) { int err = errno; /* Do not lose ioctl errno through the close(2) */ @@ -135,6 +154,46 @@ vm_do_ctl(int cmd, const char *name) return (0); } + +int +vm_create(const char *name, uint64_t flags) +{ + struct vm_create_req req; + + (void) strncpy(req.name, name, VM_MAX_NAMELEN); + req.flags = flags; + + return (vm_do_ctl(VMM_CREATE_VM, &req)); +} + +void +vm_close(struct vmctx *vm) +{ + assert(vm != NULL); + assert(vm->fd >= 0); + + (void) close(vm->fd); + + free(vm); +} + +void +vm_destroy(struct vmctx *vm) +{ + struct vm_destroy_req req; + + assert(vm != NULL); + + if (vm->fd >= 0) { + (void) close(vm->fd); + vm->fd = -1; + } + + (void) strncpy(req.name, vm->name, VM_MAX_NAMELEN); + (void) vm_do_ctl(VMM_DESTROY_VM, &req); + + free(vm); +} #endif static int @@ -155,17 +214,6 @@ vm_device_open(const char *name) return (fd); } -int -vm_create(const char *name) -{ -#ifdef __FreeBSD__ - /* Try to load vmm(4) module before creating a guest. */ - if (modfind("vmm") < 0) - kldload("vmm"); -#endif - return (CREATE((char *)name)); -} - struct vmctx * vm_open(const char *name) { @@ -189,30 +237,6 @@ err: return (NULL); } -#ifndef __FreeBSD__ -void -vm_close(struct vmctx *vm) -{ - assert(vm != NULL); - assert(vm->fd >= 0); - - (void) close(vm->fd); - - free(vm); -} -#endif - -void -vm_destroy(struct vmctx *vm) -{ - assert(vm != NULL); - - if (vm->fd >= 0) - close(vm->fd); - DESTROY(vm->name); - - free(vm); -} int vm_parse_memsize(const char *optarg, size_t *ret_memsize) diff --git a/usr/src/lib/libvmmapi/common/vmmapi.h b/usr/src/lib/libvmmapi/common/vmmapi.h index 79c7dc02ee..e239b70a56 100644 --- a/usr/src/lib/libvmmapi/common/vmmapi.h +++ b/usr/src/lib/libvmmapi/common/vmmapi.h @@ -39,7 +39,7 @@ * * Copyright 2015 Pluribus Networks Inc. * Copyright 2019 Joyent, Inc. 
- * Copyright 2020 Oxide Computer Company + * Copyright 2021 Oxide Computer Company */ #ifndef _VMMAPI_H_ @@ -134,7 +134,11 @@ int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, int vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len); +#ifndef __FreeBSD__ +int vm_create(const char *name, uint64_t flags); +#else int vm_create(const char *name); +#endif /* __FreeBSD__ */ int vm_get_device_fd(struct vmctx *ctx); struct vmctx *vm_open(const char *name); #ifndef __FreeBSD__ diff --git a/usr/src/pkg/manifests/system-bhyve.mf b/usr/src/pkg/manifests/system-bhyve.mf index 0495d9f649..3f67fa743e 100644 --- a/usr/src/pkg/manifests/system-bhyve.mf +++ b/usr/src/pkg/manifests/system-bhyve.mf @@ -48,6 +48,7 @@ file path=usr/kernel/drv/$(ARCH64)/vmm file path=usr/kernel/drv/ppt.conf file path=usr/kernel/drv/viona.conf file path=usr/kernel/drv/vmm.conf +file path=usr/lib/rsrvrctl mode=0555 file path=usr/sbin/bhyve mode=0555 file path=usr/sbin/bhyvectl mode=0555 file path=usr/sbin/pptadm mode=0555 diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h index 8747b96acc..5b98acd24f 100644 --- a/usr/src/uts/common/vm/page.h +++ b/usr/src/uts/common/vm/page.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2021 Oxide Computer Company */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -340,13 +341,13 @@ struct as; * * So, as a quick summary: * - * pse_mutex[]'s protect the p_selock and p_cv fields. + * pse_mutex[]'s protect the p_selock and p_cv fields. * - * p_selock protects the p_free, p_age, p_vnode, p_offset and p_hash, + * p_selock protects the p_free, p_age, p_vnode, p_offset and p_hash, * - * ph_mutex[]'s protect the page_hash[] array and its chains. + * ph_mutex[]'s protect the page_hash[] array and its chains. * - * vph_mutex[]'s protect the v_pages field and the vp page chains. + * vph_mutex[]'s protect the v_pages field and the vp page chains. * * First lock the page, then the hash chain, then the vnode chain. When * this is not possible `trylocks' must be used. Sleeping while holding @@ -762,6 +763,7 @@ void page_lock_delete(page_t *); int page_deleted(page_t *); int page_pp_lock(page_t *, int, int); void page_pp_unlock(page_t *, int, int); +int page_xresv(pgcnt_t, uint_t, int (*)(void)); int page_resv(pgcnt_t, uint_t); void page_unresv(pgcnt_t); void page_pp_useclaim(page_t *, page_t *, uint_t); @@ -1078,7 +1080,7 @@ typedef struct kpm_hlk { * The state about how a kpm page is mapped and whether it is ready to go * is indicated by the following 1 byte kpm_spage structure. This byte is * split into two 4-bit parts - kp_mapped and kp_mapped_go. - * - kp_mapped == 1 the page is mapped cacheable + * - kp_mapped == 1 the page is mapped cacheable * - kp_mapped == 2 the page is mapped non-cacheable * - kp_mapped_go == 1 the mapping is ready to be dropped in * - kp_mapped_go == 0 the mapping is not ready to be dropped in. diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c index bcc6d05d47..89751b7b2c 100644 --- a/usr/src/uts/common/vm/vm_page.c +++ b/usr/src/uts/common/vm/vm_page.c @@ -23,6 +23,7 @@ * Copyright (c) 2015, Josef 'Jeff' Sipek <jeffpc@josefsipek.net> * Copyright (c) 2015, 2016 by Delphix. All rights reserved. * Copyright 2018 Joyent, Inc. 
+ * Copyright 2021 Oxide Computer Company */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -3919,29 +3920,68 @@ page_pp_unlock( } /* - * This routine reserves availrmem for npages; - * flags: KM_NOSLEEP or KM_SLEEP - * returns 1 on success or 0 on failure + * This routine reserves availrmem for npages. + * It returns 1 on success or 0 on failure. + * + * flags: KM_NOSLEEP or KM_SLEEP + * cb_wait: called to induce delay when KM_SLEEP reservation requires kmem + * reaping to potentially succeed. If the callback returns 0, the + * reservation attempts will cease to repeat and page_xresv() may + * report a failure. If cb_wait is NULL, the traditional delay(hz/2) + * behavior will be used while waiting for a reap. */ int -page_resv(pgcnt_t npages, uint_t flags) +page_xresv(pgcnt_t npages, uint_t flags, int (*cb_wait)(void)) { mutex_enter(&freemem_lock); - while (availrmem < tune.t_minarmem + npages) { - if (flags & KM_NOSLEEP) { - mutex_exit(&freemem_lock); - return (0); - } + if (availrmem >= tune.t_minarmem + npages) { + availrmem -= npages; mutex_exit(&freemem_lock); - page_needfree(npages); - kmem_reap(); - delay(hz >> 2); - page_needfree(-(spgcnt_t)npages); - mutex_enter(&freemem_lock); + return (1); + } else if ((flags & KM_NOSLEEP) != 0) { + mutex_exit(&freemem_lock); + return (0); } - availrmem -= npages; mutex_exit(&freemem_lock); - return (1); + + /* + * We signal memory pressure to the system by elevating 'needfree'. + * Processes such as kmem reaping, pageout, and ZFS ARC shrinking can + * then respond to said pressure by freeing pages. + */ + page_needfree(npages); + int nobail = 1; + do { + kmem_reap(); + if (cb_wait == NULL) { + delay(hz >> 2); + } else { + nobail = cb_wait(); + } + + mutex_enter(&freemem_lock); + if (availrmem >= tune.t_minarmem + npages) { + availrmem -= npages; + mutex_exit(&freemem_lock); + page_needfree(-(spgcnt_t)npages); + return (1); + } + mutex_exit(&freemem_lock); + } while (nobail != 0); + page_needfree(-(spgcnt_t)npages); + + return (0); +} + +/* + * This routine reserves availrmem for npages; + * flags: KM_NOSLEEP or KM_SLEEP + * returns 1 on success or 0 on failure + */ +int +page_resv(pgcnt_t npages, uint_t flags) +{ + return (page_xresv(npages, flags, NULL)); } /* diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files index 558d19ad3f..0a3fad877c 100644 --- a/usr/src/uts/i86pc/Makefile.files +++ b/usr/src/uts/i86pc/Makefile.files @@ -117,7 +117,6 @@ CORE_OBJS += \ ppage.o \ pwrnow.o \ smt.o \ - seg_vmm.o \ speedstep.o \ ssp.o \ startup.o \ @@ -270,6 +269,8 @@ VMM_OBJS += vmm.o \ svm_support.o \ amdv.o \ vmm_gpt.o \ + seg_vmm.o \ + vmm_reservoir.o \ vmm_sol_vm.o \ vmm_sol_glue.o \ vmm_sol_ept.o \ diff --git a/usr/src/uts/i86pc/vm/seg_vmm.c b/usr/src/uts/i86pc/io/vmm/seg_vmm.c index beb5e81d53..23a8da3bc5 100644 --- a/usr/src/uts/i86pc/vm/seg_vmm.c +++ b/usr/src/uts/i86pc/io/vmm/seg_vmm.c @@ -11,6 +11,7 @@ /* * Copyright 2018 Joyent, Inc. 
+ * Copyright 2021 Oxide Computer Company */ /* @@ -40,7 +41,16 @@ #include <vm/as.h> #include <vm/seg.h> #include <vm/seg_kmem.h> -#include <vm/seg_vmm.h> + +#include <sys/seg_vmm.h> + +typedef struct segvmm_data { + krwlock_t svmd_lock; + vm_object_t svmd_obj; + uintptr_t svmd_obj_off; + uchar_t svmd_prot; + size_t svmd_softlockcnt; +} segvmm_data_t; static int segvmm_dup(struct seg *, struct seg *); @@ -105,31 +115,14 @@ segvmm_create(struct seg **segpp, void *argsp) segvmm_crargs_t *cra = argsp; segvmm_data_t *data; - /* - * Check several aspects of the mapping request to ensure validity: - * - kernel pages must reside entirely in kernel space - * - target protection must be user-accessible - * - kernel address must be page-aligned - */ - if ((uintptr_t)cra->kaddr <= _userlimit || - ((uintptr_t)cra->kaddr + seg->s_size) < (uintptr_t)cra->kaddr || - (cra->prot & PROT_USER) == 0 || - ((uintptr_t)cra->kaddr & PAGEOFFSET) != 0) { - return (EINVAL); - } - data = kmem_zalloc(sizeof (*data), KM_SLEEP); rw_init(&data->svmd_lock, NULL, RW_DEFAULT, NULL); - data->svmd_kaddr = (uintptr_t)cra->kaddr; + data->svmd_obj = cra->obj; + data->svmd_obj_off = cra->offset; data->svmd_prot = cra->prot; - data->svmd_cookie = cra->cookie; - data->svmd_hold = cra->hold; - data->svmd_rele = cra->rele; - /* Since initial checks have passed, grab a reference on the cookie */ - if (data->svmd_hold != NULL) { - data->svmd_hold(data->svmd_cookie); - } + /* Grab a hold on the VM object for the duration of this seg mapping */ + vm_object_reference(data->svmd_obj); seg->s_ops = &segvmm_ops; seg->s_data = data; @@ -146,16 +139,12 @@ segvmm_dup(struct seg *seg, struct seg *newseg) newsvmd = kmem_zalloc(sizeof (segvmm_data_t), KM_SLEEP); rw_init(&newsvmd->svmd_lock, NULL, RW_DEFAULT, NULL); - newsvmd->svmd_kaddr = svmd->svmd_kaddr; + newsvmd->svmd_obj = svmd->svmd_obj; + newsvmd->svmd_obj_off = svmd->svmd_obj_off; newsvmd->svmd_prot = svmd->svmd_prot; - newsvmd->svmd_cookie = svmd->svmd_cookie; - newsvmd->svmd_hold = svmd->svmd_hold; - newsvmd->svmd_rele = svmd->svmd_rele; /* Grab another hold for the duplicate segment */ - if (svmd->svmd_hold != NULL) { - newsvmd->svmd_hold(newsvmd->svmd_cookie); - } + vm_object_reference(svmd->svmd_obj); newseg->s_ops = seg->s_ops; newseg->s_data = newsvmd; @@ -180,10 +169,8 @@ segvmm_unmap(struct seg *seg, caddr_t addr, size_t len) /* Unconditionally unload the entire segment range. 
*/ hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP); - /* Release the hold this segment possessed */ - if (svmd->svmd_rele != NULL) { - svmd->svmd_rele(svmd->svmd_cookie); - } + /* Release the VM object hold this segment possessed */ + vm_object_deallocate(svmd->svmd_obj); seg_free(seg); return (0); @@ -206,41 +193,23 @@ static int segvmm_fault_in(struct hat *hat, struct seg *seg, uintptr_t va, size_t len) { segvmm_data_t *svmd = seg->s_data; - const uintptr_t koff = svmd->svmd_kaddr - (uintptr_t)seg->s_base; const uintptr_t end = va + len; const uintptr_t prot = svmd->svmd_prot; - /* Stick to the simple non-large-page case for now */ va &= PAGEMASK; - + uintptr_t off = va - (uintptr_t)seg->s_base; do { - htable_t *ht; - uint_t entry, lvl; - size_t psz; pfn_t pfn; - const uintptr_t kaddr = va + koff; - - ASSERT(kaddr >= (uintptr_t)svmd->svmd_kaddr); - ASSERT(kaddr < ((uintptr_t)svmd->svmd_kaddr + seg->s_size)); - ht = htable_getpage(kas.a_hat, kaddr, &entry); - if (ht == NULL) { - return (-1); - } - lvl = ht->ht_level; - pfn = PTE2PFN(x86pte_get(ht, entry), lvl); - htable_release(ht); + pfn = vm_object_pfn(svmd->svmd_obj, off); if (pfn == PFN_INVALID) { return (-1); } - /* For the time being, handling for large pages is absent. */ - psz = PAGESIZE; - pfn += mmu_btop(kaddr & LEVEL_OFFSET(lvl)); - - hat_devload(hat, (caddr_t)va, psz, pfn, prot, HAT_LOAD); - - va = va + psz; + /* Ignore any large-page possibilities for now */ + hat_devload(hat, (caddr_t)va, PAGESIZE, pfn, prot, HAT_LOAD); + va += PAGESIZE; + off += PAGESIZE; } while (va < end); return (0); @@ -399,8 +368,8 @@ static int segvmm_gettype(struct seg *seg, caddr_t addr) { /* - * Since already-existing kernel pages are being mapped into userspace, - * always report the segment type as shared. + * Since already-existing vmm reservoir pages are being mapped into + * userspace, always report the segment type as shared. */ return (MAP_SHARED); } @@ -457,8 +426,8 @@ segvmm_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) { segvmm_data_t *svmd = seg->s_data; - memidp->val[0] = (uintptr_t)svmd->svmd_kaddr; - memidp->val[1] = (uintptr_t)(addr - seg->s_base); + memidp->val[0] = (uintptr_t)svmd->svmd_obj; + memidp->val[1] = (uintptr_t)(addr - seg->s_base) + svmd->svmd_obj_off; return (0); } diff --git a/usr/src/uts/i86pc/io/vmm/sys/seg_vmm.h b/usr/src/uts/i86pc/io/vmm/sys/seg_vmm.h new file mode 100644 index 0000000000..a4f72f816e --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/sys/seg_vmm.h @@ -0,0 +1,30 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. 
+ * Copyright 2021 Oxide Computer Company + */ + +#ifndef _VM_SEG_VMM_H +#define _VM_SEG_VMM_H + +#include <sys/vmm_vm.h> + +typedef struct segvmm_crargs { + uchar_t prot; /* protection */ + vm_object_t obj; + uintptr_t offset; +} segvmm_crargs_t; + +int segvmm_create(struct seg **, void *); + +#endif /* _VM_SEG_VMM_H */ diff --git a/usr/src/uts/i86pc/io/vmm/sys/vmm_impl.h b/usr/src/uts/i86pc/io/vmm/sys/vmm_impl.h index 606be4bbae..2b6f41ec54 100644 --- a/usr/src/uts/i86pc/io/vmm/sys/vmm_impl.h +++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_impl.h @@ -40,7 +40,7 @@ struct vmm_devmem_entry { list_node_t vde_node; int vde_segid; - char vde_name[SPECNAMELEN + 1]; + char vde_name[VM_MAX_SEG_NAMELEN]; size_t vde_len; off_t vde_off; }; diff --git a/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h b/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h index 8441b51e03..4191aaee5c 100644 --- a/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h +++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h @@ -115,7 +115,7 @@ struct vmm_ops { extern struct vmm_ops vmm_ops_intel; extern struct vmm_ops vmm_ops_amd; -int vm_create(const char *name, struct vm **retvm); +int vm_create(const char *name, uint64_t flags, struct vm **retvm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); const char *vm_name(struct vm *vm); diff --git a/usr/src/uts/i86pc/io/vmm/sys/vmm_reservoir.h b/usr/src/uts/i86pc/io/vmm/sys/vmm_reservoir.h new file mode 100644 index 0000000000..b8215ce654 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_reservoir.h @@ -0,0 +1,40 @@ + +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ */ + +/* + * Copyright 2021 Oxide Computer Company + */ + +#ifndef _SYS_VMM_RESERVOIR_H +#define _SYS_VMM_RESERVOIR_H + +#include <sys/types.h> +#include <sys/cred.h> + +struct vmmr_region; +typedef struct vmmr_region vmmr_region_t; + +void vmmr_init(); +void vmmr_fini(); +bool vmmr_is_empty(); + +int vmmr_alloc(size_t, bool, vmmr_region_t **); +void *vmmr_region_mem_at(vmmr_region_t *, uintptr_t); +pfn_t vmmr_region_pfn_at(vmmr_region_t *, uintptr_t); +void vmmr_free(vmmr_region_t *); + +int vmmr_add(size_t, bool); +int vmmr_remove(size_t, bool); + +int vmmr_ioctl(int, intptr_t, int, cred_t *, int *); + +#endif /* _SYS_VMM_RESERVOIR_H */ diff --git a/usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h b/usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h index 6c7f9d423e..76d5fec8b7 100644 --- a/usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h +++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_vm.h @@ -160,8 +160,6 @@ int vm_segmap_obj(vm_object_t, off_t, size_t, struct as *, caddr_t *, uint_t, int vm_segmap_space(struct vmspace *, off_t, struct as *, caddr_t *, off_t, uint_t, uint_t, uint_t); void *vmspace_find_kva(struct vmspace *, uintptr_t, size_t); -void vmm_arena_init(void); -void vmm_arena_fini(void); typedef int (*pmap_pinit_t)(struct pmap *pmap); @@ -171,13 +169,12 @@ void vmspace_free(struct vmspace *); int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int); int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len, vm_prot_t prot, vm_page_t *ma, int max_count); -void vmm_arena_fini(void); - -struct vm_object *vm_object_allocate(objtype_t, vm_pindex_t); +struct vm_object *vm_object_allocate(objtype_t, vm_pindex_t, bool); void vm_object_deallocate(vm_object_t); void vm_object_reference(vm_object_t); int vm_object_set_memattr(vm_object_t, vm_memattr_t); +pfn_t vm_object_pfn(vm_object_t, uintptr_t); #define VM_OBJECT_WLOCK(vmo) mutex_enter(&(vmo)->vmo_lock) #define VM_OBJECT_WUNLOCK(vmo) mutex_exit(&(vmo)->vmo_lock) diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c index eecff88b7d..80c9ec6bd7 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm.c +++ b/usr/src/uts/i86pc/io/vmm/vmm.c @@ -60,6 +60,7 @@ __FBSDID("$FreeBSD$"); #include <sys/sched.h> #include <sys/smp.h> #include <sys/systm.h> +#include <sys/sunddi.h> #include <machine/pcb.h> #include <machine/smp.h> @@ -191,6 +192,8 @@ struct vm { uint64_t boot_tsc_offset; /* (i) TSC offset at VM boot */ struct ioport_config ioports; /* (o) ioport handling */ + + bool mem_transient; /* (o) alloc transient memory */ }; static int vmm_initialized; @@ -490,7 +493,7 @@ uint_t cores_per_package = 1; uint_t threads_per_core = 1; int -vm_create(const char *name, struct vm **retvm) +vm_create(const char *name, uint64_t flags, struct vm **retvm) { struct vm *vm; struct vmspace *vmspace; @@ -502,8 +505,8 @@ vm_create(const char *name, struct vm **retvm) if (!vmm_initialized) return (ENXIO); - if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) - return (EINVAL); + /* Name validation has already occurred */ + VERIFY3U(strnlen(name, VM_MAX_NAMELEN), <, VM_MAX_NAMELEN); vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS); if (vmspace == NULL) @@ -512,6 +515,7 @@ vm_create(const char *name, struct vm **retvm) vm = malloc(sizeof (struct vm), M_VM, M_WAITOK | M_ZERO); strcpy(vm->name, name); vm->vmspace = vmspace; + vm->mem_transient = (flags & VCF_RESERVOIR_MEM) == 0; vm->sockets = 1; vm->cores = cores_per_package; /* XXX backwards compatibility */ @@ -708,21 +712,12 @@ vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) struct mem_seg *seg; vm_object_t 
obj; -#ifndef __FreeBSD__ - extern pgcnt_t get_max_page_get(void); -#endif - if (ident < 0 || ident >= VM_MAX_MEMSEGS) return (EINVAL); if (len == 0 || (len & PAGE_MASK)) return (EINVAL); -#ifndef __FreeBSD__ - if (len > ptob(get_max_page_get())) - return (EINVAL); -#endif - seg = &vm->mem_segs[ident]; if (seg->object != NULL) { if (seg->len == len && seg->sysmem == sysmem) @@ -731,7 +726,8 @@ vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) return (EINVAL); } - obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT); + obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT, + vm->mem_transient); if (obj == NULL) return (ENOMEM); diff --git a/usr/src/uts/i86pc/io/vmm/vmm_reservoir.c b/usr/src/uts/i86pc/io/vmm/vmm_reservoir.c new file mode 100644 index 0000000000..1bb64a4851 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_reservoir.c @@ -0,0 +1,820 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2021 Oxide Computer Company + */ + +/* + * VMM Memory Reservoir + * + * + * In order to make the allocation of large (multi-GiB) chunks of memory + * for bhyve VMs easier, we introduce the "VMM Reservoir", where system + * operators can set aside a substantial portion of system memory exclusively + * for VMs. This memory is unavailable for general use by the rest of the + * system. Rather than having to scour the freelist, reap kmem caches, or put + * pressure on the ARC, bhyve guest memory allocations can quickly determine if + * there is adequate reservoir memory available. Since the pages stored in the + * reservoir are pre-zeroed, it can be immediately used when allocated to a + * guest. When the memory is returned to the reservoir, it is zeroed once more + * to avoid leaking any sensitive data from that guest. + * + * + * Transient Allocations + * + * While the explicit reservoir model may work well for some applications, + * others may want a more traditional model, where pages for guest memory + * objects are allocated on demand, rather than from a pool set aside from the + * system. In this case, the allocation can be made in "transient" mode, where + * the memory is allocated normally, even if there is free capacity in the + * reservoir. When use of the transient allocation is complete (the guest is + * halted and destroyed), the pages will be freed back to the system, rather + * than added back to the reservoir. + * + * From an implementation standpoint, transient allocations follow the same + * code paths as ones using the reservoir normally. Those allocations have a + * tag which marks them as transient, and used/free size tallies are maintained + * separately for normal and transient operations. When performing a transient + * allocation, that amount of memory is immediately added to the reservoir, + * from which the allocation can be made. When freeing a transient allocation, + * a matching amount of memory is removed from the reservoir as part of the + * operation. This allows both allocation types to coexist without too much + * additional machinery.
+ * + * + * Administration + * + * Operators may increase, decrease, and query the amount of memory + * allocated to the reservoir and from it to VMs via ioctls against the vmmctl + * device. The total amount added to the reservoir is arbitrarily limited at + * this time by `vmmr_total_limit`, which defaults to 80% of physmem. This is + * done to prevent the reservoir from inadvertently growing to a size where the + * system has inadequate memory to make forward progress. Memory may only be + * removed from the reservoir when it is free (not allocated by any guest VMs). + * + * + * Page Tracking + * + * The reservoir currently uses vnode association to keep track of pages under + * its control (either designated to the reservoir and free, or allocated to a + * guest VM object). This means using the existing VM system primitives for + * page_t instances being associated with a given (vnode, offset) tuple. It + * means that spans of pages, either free or allocated, need only to store a + * length (of the span) and an offset (into the vnode) in order to gain access + * to all of the underlying pages associated with that span. Associating the + * pages against `kvps[KV_VVP]` (the VMM kernel vnode) means they will be + * properly tracked as KAS pages, but be excluded from normal dumps (unless the + * operator has chosen to dump all of RAM). + */ + +#include <sys/types.h> +#include <sys/mutex.h> +#include <sys/avl.h> +#include <sys/list.h> +#include <sys/machparam.h> +#include <sys/kmem.h> +#include <sys/stddef.h> +#include <sys/null.h> +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/sunddi.h> +#include <sys/policy.h> +#include <vm/seg_kmem.h> +#include <vm/hat_i86.h> + +#include <sys/vmm_reservoir.h> +#include <sys/vmm_dev.h> + +static kmutex_t vmmr_lock; + +static size_t vmmr_free_sz; +static size_t vmmr_free_transient_sz; +static size_t vmmr_adding_sz; +static size_t vmmr_alloc_sz; +static size_t vmmr_alloc_transient_sz; +static size_t vmmr_empty_sz; + +static uintptr_t vmmr_empty_last; +/* Upper limit for the size (free + allocated) of the reservoir */ +static size_t vmmr_total_limit; + +/* VA range allocated from the VMM arena for the mappings */ +static uintptr_t vmmr_va; +static uintptr_t vmmr_va_sz; + +/* Pair of AVL trees to store set of spans ordered by addr and size */ +typedef struct vmmr_treepair { + avl_tree_t by_addr; + avl_tree_t by_size; +} vmmr_treepair_t; + +/* Spans of free memory in the reservoir */ +static vmmr_treepair_t vmmr_free_tp; + +/* Spans of empty (not backed by memory) space in the reservoir */ +static vmmr_treepair_t vmmr_empty_tp; + +/* Regions of memory allocated from the reservoir */ +static list_t vmmr_alloc_regions; + +struct vmmr_span { + uintptr_t vs_addr; + size_t vs_size; + avl_node_t vs_by_addr; + avl_node_t vs_by_size; + uintptr_t vs_region_addr; +}; +typedef struct vmmr_span vmmr_span_t; + +struct vmmr_region { + size_t vr_size; + avl_tree_t vr_spans; + list_node_t vr_node; + bool vr_transient; +}; + +static int +vmmr_cmp_addr(const void *a, const void *b) +{ + const vmmr_span_t *sa = a; + const vmmr_span_t *sb = b; + + if (sa->vs_addr == sb->vs_addr) { + return (0); + } else if (sa->vs_addr < sb->vs_addr) { + return (-1); + } else { + return (1); + } +} + +static int +vmmr_cmp_size(const void *a, const void *b) +{ + const vmmr_span_t *sa = a; + const vmmr_span_t *sb = b; + + if (sa->vs_size == sb->vs_size) { + /* + * Since discontiguous spans could have the same size in a + * by-size tree, differentiate them (as required by AVL) by + *
address so they can safely coexist while remaining sorted. + */ + return (vmmr_cmp_addr(a, b)); + } else if (sa->vs_size < sb->vs_size) { + return (-1); + } else { + return (1); + } +} + +static int +vmmr_cmp_region_addr(const void *a, const void *b) +{ + const vmmr_span_t *sa = a; + const vmmr_span_t *sb = b; + + if (sa->vs_region_addr == sb->vs_region_addr) { + return (0); + } else if (sa->vs_region_addr < sb->vs_region_addr) { + return (-1); + } else { + return (1); + } +} + +static void +vmmr_tp_init(vmmr_treepair_t *tree) +{ + avl_create(&tree->by_addr, vmmr_cmp_addr, sizeof (vmmr_span_t), + offsetof(vmmr_span_t, vs_by_addr)); + avl_create(&tree->by_size, vmmr_cmp_size, sizeof (vmmr_span_t), + offsetof(vmmr_span_t, vs_by_size)); +} + +static void +vmmr_tp_destroy(vmmr_treepair_t *tree) +{ + void *vcp = NULL; + vmmr_span_t *span; + + while (avl_destroy_nodes(&tree->by_addr, &vcp) != NULL) { + /* Freeing spans will be done when tearing down by-size tree */ + } + while ((span = avl_destroy_nodes(&tree->by_size, &vcp)) != NULL) { + kmem_free(span, sizeof (*span)); + } + avl_destroy(&tree->by_addr); + avl_destroy(&tree->by_size); +} + +/* + * Insert a vmmr_span_t into a treepair, concatenating if possible with adjacent + * span(s). Such concatenation could result in the `to_add` span being freed, + * so the caller cannot use it after this returns. + */ +static void +vmmr_tp_insert_concat(vmmr_span_t *to_add, vmmr_treepair_t *tree) +{ + avl_tree_t *by_addr = &tree->by_addr; + avl_tree_t *by_size = &tree->by_size; + vmmr_span_t *node; + avl_index_t where; + + /* This addr should not already exist in the treepair */ + node = avl_find(by_addr, to_add, &where); + ASSERT3P(node, ==, NULL); + + node = avl_nearest(by_addr, where, AVL_BEFORE); + if (node != NULL && + (node->vs_addr + node->vs_size) == to_add->vs_addr) { + /* concat with preceding item */ + avl_remove(by_addr, node); + avl_remove(by_size, node); + node->vs_size += to_add->vs_size; + kmem_free(to_add, sizeof (*to_add)); + + /* + * Since this now-concatenated span could be adjacent to one + * trailing it, fall through to perform that check. + */ + to_add = node; + } + + node = avl_nearest(by_addr, where, AVL_AFTER); + if (node != NULL && + (to_add->vs_addr + to_add->vs_size) == node->vs_addr) { + /* concat with trailing item */ + avl_remove(by_addr, node); + avl_remove(by_size, node); + node->vs_addr = to_add->vs_addr; + node->vs_size += to_add->vs_size; + avl_add(by_addr, node); + avl_add(by_size, node); + + kmem_free(to_add, sizeof (*to_add)); + return; + } + + /* simply insert */ + avl_add(by_addr, to_add); + avl_add(by_size, to_add); +} + +/* + * Remove a vmmr_span_t from a treepair, splitting if necessary when a span of + * the exact target size is not present, but a larger one is. May return a span + * with a size smaller than the target if splitting is not an option. + */ +static vmmr_span_t * +vmmr_tp_remove_split(size_t target_sz, vmmr_treepair_t *tree) +{ + avl_tree_t *by_addr = &tree->by_addr; + avl_tree_t *by_size = &tree->by_size; + vmmr_span_t *span; + avl_index_t where; + + ASSERT3U(target_sz, !=, 0); + ASSERT(!avl_is_empty(by_addr)); + ASSERT(!avl_is_empty(by_size)); + + vmmr_span_t search = { .vs_size = target_sz }; + span = avl_find(by_size, &search, &where); + if (span == NULL) { + /* Try for a larger span (instead of exact match) */ + span = avl_nearest(by_size, where, AVL_AFTER); + if (span == NULL) { + /* + * Caller will need to collect several smaller spans in + * order to fulfill their request.
+ */ + span = avl_nearest(by_size, where, AVL_BEFORE); + ASSERT3P(span, !=, NULL); + } + } + + if (span->vs_size <= target_sz) { + avl_remove(by_size, span); + avl_remove(by_addr, span); + + return (span); + } else { + /* Split off adequate chunk from larger span */ + uintptr_t start = span->vs_addr + span->vs_size - target_sz; + + avl_remove(by_size, span); + span->vs_size -= target_sz; + avl_add(by_size, span); + + vmmr_span_t *split_span = + kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP); + split_span->vs_addr = start; + split_span->vs_size = target_sz; + + return (split_span); + } +} + +void +vmmr_init() +{ + mutex_init(&vmmr_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * `vmmr_total_limit` represents the absolute maximum size of the VMM + * memory reservoir. It is meant to provide some measure of protection + * against an operator pushing the system into unrecoverable memory + * starvation through explicit or transient additions to the reservoir. + * + * There will be many situations where this limit would be inadequate to + * prevent kernel memory starvation in the face of certain operator + * actions. It is a balance to be struck between safety and allowing + * large systems to reach high utilization. + * + * The value is based off of pages_pp_maximum: "Number of currently + * available pages that cannot be 'locked'". It is sized as all of + * `physmem` less 120% of `pages_pp_maximum`. + */ + vmmr_total_limit = + (((physmem * 10) - (pages_pp_maximum * 12)) * PAGESIZE) / 10; + + vmmr_empty_last = 0; + vmmr_free_sz = 0; + vmmr_alloc_sz = 0; + vmmr_empty_sz = 0; + vmmr_adding_sz = 0; + vmmr_free_transient_sz = 0; + vmmr_alloc_transient_sz = 0; + + vmmr_tp_init(&vmmr_free_tp); + vmmr_tp_init(&vmmr_empty_tp); + + list_create(&vmmr_alloc_regions, sizeof (vmmr_region_t), + offsetof(vmmr_region_t, vr_node)); + + /* Grab a chunk of VA for the reservoir */ + vmmr_va_sz = physmem * PAGESIZE; + vmmr_va = (uintptr_t)vmem_alloc(kvmm_arena, vmmr_va_sz, VM_SLEEP); +} + +void +vmmr_fini() +{ + mutex_enter(&vmmr_lock); + VERIFY3U(vmmr_alloc_sz, ==, 0); + VERIFY3U(vmmr_free_sz, ==, 0); + VERIFY3U(vmmr_adding_sz, ==, 0); + VERIFY3U(vmmr_alloc_transient_sz, ==, 0); + VERIFY3U(vmmr_free_transient_sz, ==, 0); + VERIFY(avl_is_empty(&vmmr_free_tp.by_addr)); + VERIFY(avl_is_empty(&vmmr_free_tp.by_size)); + VERIFY(list_is_empty(&vmmr_alloc_regions)); + + vmmr_tp_destroy(&vmmr_free_tp); + vmmr_tp_destroy(&vmmr_empty_tp); + list_destroy(&vmmr_alloc_regions); + + /* Release reservoir VA chunk */ + vmem_free(kvmm_arena, (void *)vmmr_va, vmmr_va_sz); + vmmr_va = 0; + vmmr_va_sz = 0; + vmmr_total_limit = 0; + vmmr_empty_last = 0; + + mutex_exit(&vmmr_lock); + mutex_destroy(&vmmr_lock); +} + +bool +vmmr_is_empty() +{ + mutex_enter(&vmmr_lock); + bool res = (vmmr_alloc_sz == 0 && vmmr_alloc_transient_sz == 0 && + vmmr_free_sz == 0 && vmmr_free_transient_sz == 0); + mutex_exit(&vmmr_lock); + return (res); +} + +int +vmmr_alloc(size_t sz, bool transient, vmmr_region_t **resp) +{ + VERIFY3U(sz & PAGEOFFSET, ==, 0); + + if (!transient) { + mutex_enter(&vmmr_lock); + if (sz > vmmr_free_sz) { + mutex_exit(&vmmr_lock); + return (ENOSPC); + } + } else { + int err; + + err = vmmr_add(sz, true); + if (err != 0) { + return (err); + } + mutex_enter(&vmmr_lock); + VERIFY3U(vmmr_free_transient_sz, >=, sz); + } + + vmmr_region_t *region; + region = kmem_zalloc(sizeof (vmmr_region_t), KM_SLEEP); + avl_create(&region->vr_spans, vmmr_cmp_region_addr, + sizeof (vmmr_span_t), offsetof(vmmr_span_t, vs_by_addr)); + region->vr_size = sz; + + size_t
remain = sz; + uintptr_t map_at = 0; + while (remain > 0) { + vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp); + + /* + * We have already ensured that adequate free memory is present + * in the reservoir for this allocation. + */ + VERIFY3P(span, !=, NULL); + ASSERT3U(span->vs_size, <=, remain); + + span->vs_region_addr = map_at; + avl_add(®ion->vr_spans, span); + map_at += span->vs_size; + remain -= span->vs_size; + } + + if (!transient) { + vmmr_free_sz -= sz; + vmmr_alloc_sz += sz; + } else { + vmmr_free_transient_sz -= sz; + vmmr_alloc_transient_sz += sz; + region->vr_transient = true; + } + list_insert_tail(&vmmr_alloc_regions, region); + mutex_exit(&vmmr_lock); + + *resp = region; + return (0); +} + +void * +vmmr_region_mem_at(vmmr_region_t *region, uintptr_t off) +{ + /* just use KPM region for now */ + return (hat_kpm_pfn2va(vmmr_region_pfn_at(region, off))); +} + +pfn_t +vmmr_region_pfn_at(vmmr_region_t *region, uintptr_t off) +{ + VERIFY3U(off & PAGEOFFSET, ==, 0); + VERIFY3U(off, <, region->vr_size); + + vmmr_span_t search = { + .vs_region_addr = off + }; + avl_index_t where; + vmmr_span_t *span = avl_find(®ion->vr_spans, &search, &where); + + if (span == NULL) { + span = avl_nearest(®ion->vr_spans, where, AVL_BEFORE); + ASSERT3P(span, !=, NULL); + } + uintptr_t span_off = off - span->vs_region_addr + span->vs_addr; + page_t *pp = page_find(&kvps[KV_VVP], (u_offset_t)span_off); + VERIFY(pp != NULL); + return (pp->p_pagenum); +} + +void +vmmr_free(vmmr_region_t *region) +{ + mutex_enter(&vmmr_lock); + if (!region->vr_transient) { + VERIFY3U(region->vr_size, <=, vmmr_alloc_sz); + } else { + VERIFY3U(region->vr_size, <=, vmmr_alloc_transient_sz); + } + list_remove(&vmmr_alloc_regions, region); + mutex_exit(&vmmr_lock); + + /* Zero the contents */ + for (uintptr_t off = 0; off < region->vr_size; off += PAGESIZE) { + bzero(vmmr_region_mem_at(region, off), PAGESIZE); + } + + mutex_enter(&vmmr_lock); + + /* Put the contained span(s) back in the free pool */ + void *cookie = NULL; + vmmr_span_t *span; + while ((span = avl_destroy_nodes(®ion->vr_spans, &cookie)) != NULL) { + span->vs_region_addr = 0; + vmmr_tp_insert_concat(span, &vmmr_free_tp); + } + avl_destroy(®ion->vr_spans); + if (!region->vr_transient) { + vmmr_free_sz += region->vr_size; + vmmr_alloc_sz -= region->vr_size; + } else { + vmmr_free_transient_sz += region->vr_size; + vmmr_alloc_transient_sz -= region->vr_size; + } + mutex_exit(&vmmr_lock); + + if (region->vr_transient) { + vmmr_remove(region->vr_size, true); + } + kmem_free(region, sizeof (*region)); +} + +static void +vmmr_destroy_pages(vmmr_span_t *span) +{ + const uintptr_t end = span->vs_addr + span->vs_size; + struct vnode *vp = &kvps[KV_VVP]; + for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) { + page_t *pp; + + /* Page-free logic cribbed from segkmem_xfree(): */ + pp = page_find(vp, (u_offset_t)pos); + VERIFY(pp != NULL); + if (!page_tryupgrade(pp)) { + /* + * Some other thread has a sharelock. Wait for + * it to drop the lock so we can free this page. + */ + page_unlock(pp); + pp = page_lookup(vp, (u_offset_t)pos, SE_EXCL); + } + + /* + * Clear p_lckcnt so page_destroy() doesn't update availrmem. + * That will be taken care of later via page_unresv(). 
+ */ + pp->p_lckcnt = 0; + page_destroy(pp, 0); + } +} + +static int +vmmr_alloc_pages(const vmmr_span_t *span) +{ + struct seg kseg = { + .s_as = &kas + }; + struct vnode *vp = &kvps[KV_VVP]; + + const uintptr_t end = span->vs_addr + span->vs_size; + for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) { + page_t *pp; + + pp = page_create_va(vp, (u_offset_t)pos, PAGESIZE, + PG_EXCL | PG_NORELOC, &kseg, (void *)(vmmr_va + pos)); + + if (pp == NULL) { + /* Destroy any already-created pages */ + if (pos != span->vs_addr) { + vmmr_span_t destroy_span = { + .vs_addr = span->vs_addr, + .vs_size = pos - span->vs_addr, + }; + + vmmr_destroy_pages(&destroy_span); + } + return (ENOMEM); + } + + /* mimic page state from segkmem */ + ASSERT(PAGE_EXCL(pp)); + page_io_unlock(pp); + pp->p_lckcnt = 1; + page_downgrade(pp); + + /* pre-zero the page */ + bzero(hat_kpm_pfn2va(pp->p_pagenum), PAGESIZE); + } + + return (0); +} + +static int +vmmr_resv_wait() +{ + if (delay_sig(hz >> 2) != 0) { + /* bail due to interruption */ + return (0); + } + return (1); +} + +static void +vmmr_remove_raw(size_t sz) +{ + VERIFY3U(sz & PAGEOFFSET, ==, 0); + VERIFY(MUTEX_HELD(&vmmr_lock)); + + size_t remain = sz; + while (remain > 0) { + vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp); + + /* + * The caller must ensure that at least `sz` amount is present + * in the free treepair. + */ + VERIFY3P(span, !=, NULL); + ASSERT3U(span->vs_size, <=, remain); + + /* TODO: perhaps arrange to destroy pages outside the lock? */ + vmmr_destroy_pages(span); + + remain -= span->vs_size; + vmmr_tp_insert_concat(span, &vmmr_empty_tp); + } + + vmmr_empty_sz += sz; +} + +int +vmmr_add(size_t sz, bool transient) +{ + VERIFY3U(sz & PAGEOFFSET, ==, 0); + + mutex_enter(&vmmr_lock); + /* + * Make sure that the amount added is not going to breach the limits + * we've chosen + */ + const size_t current_total = + vmmr_alloc_sz + vmmr_free_sz + vmmr_adding_sz + + vmmr_alloc_transient_sz + vmmr_free_transient_sz; + if ((current_total + sz) < current_total) { + mutex_exit(&vmmr_lock); + return (EOVERFLOW); + } + if ((current_total + sz) > vmmr_total_limit) { + mutex_exit(&vmmr_lock); + return (ENOSPC); + } + vmmr_adding_sz += sz; + mutex_exit(&vmmr_lock); + + /* Wait for enough pages to become available */ + if (page_xresv(sz >> PAGESHIFT, KM_SLEEP, vmmr_resv_wait) == 0) { + mutex_enter(&vmmr_lock); + vmmr_adding_sz -= sz; + mutex_exit(&vmmr_lock); + + return (EINTR); + } + + mutex_enter(&vmmr_lock); + size_t added = 0; + size_t remain = sz; + while (added < sz) { + vmmr_span_t *span = NULL; + + if (vmmr_empty_sz > 0) { + span = vmmr_tp_remove_split(remain, &vmmr_empty_tp); + + vmmr_empty_sz -= span->vs_size; + } else { + /* + * No empty space to fill with new pages, so just tack + * it on at the end instead. + */ + span = kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP); + span->vs_addr = vmmr_empty_last; + span->vs_size = remain; + vmmr_empty_last += remain; + } + VERIFY3P(span, !=, NULL); + + + /* Allocate the actual pages to back this span */ + mutex_exit(&vmmr_lock); + int err = vmmr_alloc_pages(span); + mutex_enter(&vmmr_lock); + + /* + * If an error is encountered during page allocation for the + * span, unwind any progress made by the addition request. + */ + if (err != 0) { + /* + * Without pages allocated to this span, it is now + * tracked as empty. 
+ */ + vmmr_empty_sz += span->vs_size; + vmmr_tp_insert_concat(span, &vmmr_empty_tp); + + if (added != 0) { + vmmr_remove_raw(added); + } + + vmmr_adding_sz -= sz; + mutex_exit(&vmmr_lock); + + page_unresv(sz >> PAGESHIFT); + return (err); + } + + /* + * The allocated-page-bearing span is placed in the "free" + * treepair now, but is not officially exposed for consumption + * until `vmm_free_sz` or `vmm_free_transient_sz` are updated. + * + * This allows us to unwind the allocation in case of a failure + * without the risk of the freshly added span(s) being snapped + * up by a consumer already. + */ + added += span->vs_size; + remain -= span->vs_size; + vmmr_tp_insert_concat(span, &vmmr_free_tp); + } + + /* Make the added memory usable by exposing it to the size accounting */ + if (!transient) { + vmmr_free_sz += added; + } else { + vmmr_free_transient_sz += added; + } + ASSERT3U(added, ==, sz); + vmmr_adding_sz -= added; + + mutex_exit(&vmmr_lock); + return (0); +} + +int +vmmr_remove(size_t sz, bool transient) +{ + VERIFY3U(sz & PAGEOFFSET, ==, 0); + + mutex_enter(&vmmr_lock); + if ((!transient && sz > vmmr_free_sz) || + (transient && sz > vmmr_free_transient_sz)) { + mutex_exit(&vmmr_lock); + return (ENOSPC); + } + + vmmr_remove_raw(sz); + + if (!transient) { + vmmr_free_sz -= sz; + } else { + vmmr_free_transient_sz -= sz; + } + mutex_exit(&vmmr_lock); + page_unresv(sz >> PAGESHIFT); + return (0); +} + +int +vmmr_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp) +{ + switch (cmd) { + case VMM_RESV_QUERY: { + struct vmm_resv_query res; + void *datap = (void *)(uintptr_t)arg; + + /* For now, anyone in GZ can query */ + if (crgetzoneid(cr) != GLOBAL_ZONEID) { + return (EPERM); + } + mutex_enter(&vmmr_lock); + res.vrq_free_sz = vmmr_free_sz; + res.vrq_alloc_sz = vmmr_alloc_sz; + res.vrq_alloc_transient_sz = vmmr_alloc_transient_sz; + res.vrq_limit = vmmr_total_limit; + mutex_exit(&vmmr_lock); + if (ddi_copyout(&res, datap, sizeof (res), md) != 0) { + return (EFAULT); + } + break; + } + case VMM_RESV_ADD: { + if (secpolicy_sys_config(cr, B_FALSE) != 0) { + return (EPERM); + } + return (vmmr_add((size_t)arg, false)); + } + case VMM_RESV_REMOVE: { + if (secpolicy_sys_config(cr, B_FALSE) != 0) { + return (EPERM); + } + return (vmmr_remove((size_t)arg, false)); + } + default: + return (ENOTTY); + } + return (0); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c index d5f4b3883b..ef366ddaff 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c @@ -45,6 +45,7 @@ #include <sys/vmm_impl.h> #include <sys/vmm_drv.h> #include <sys/vmm_vm.h> +#include <sys/vmm_reservoir.h> #include <vm/seg_dev.h> @@ -1506,13 +1507,22 @@ vmm_hma_release(void) } static int -vmmdev_do_vm_create(char *name, cred_t *cr) +vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr) { vmm_softc_t *sc = NULL; minor_t minor; int error = ENOMEM; + size_t len; + const char *name = req->name; - if (strnlen(name, VM_MAX_NAMELEN) >= VM_MAX_NAMELEN) { + len = strnlen(name, VM_MAX_NAMELEN); + if (len == 0) { + return (EINVAL); + } + if (len >= VM_MAX_NAMELEN) { + return (ENAMETOOLONG); + } + if (strchr(name, '/') != NULL) { return (EINVAL); } @@ -1555,7 +1565,7 @@ vmmdev_do_vm_create(char *name, cred_t *cr) goto fail; } - error = vm_create(name, &sc->vmm_vm); + error = vm_create(req->name, req->flags, &sc->vmm_vm); if (error == 0) { /* Complete VM intialization and report success. 
*/ (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name)); @@ -1938,7 +1948,7 @@ vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd) /* ARGSUSED */ static int -vmmdev_do_vm_destroy(const char *name, cred_t *cr) +vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr) { boolean_t hma_release = B_FALSE; vmm_softc_t *sc; @@ -1949,7 +1959,7 @@ vmmdev_do_vm_destroy(const char *name, cred_t *cr) mutex_enter(&vmm_mtx); - if ((sc = vmm_lookup(name)) == NULL) { + if ((sc = vmm_lookup(req->name)) == NULL) { mutex_exit(&vmm_mtx); return (ENOENT); } @@ -2193,6 +2203,47 @@ vmm_is_supported(intptr_t arg) } static int +vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp) +{ + void *argp = (void *)arg; + + switch (cmd) { + case VMM_CREATE_VM: { + struct vm_create_req req; + + if ((md & FWRITE) == 0) { + return (EPERM); + } + if (ddi_copyin(argp, &req, sizeof (req), md) != 0) { + return (EFAULT); + } + return (vmmdev_do_vm_create(&req, cr)); + } + case VMM_DESTROY_VM: { + struct vm_destroy_req req; + + if ((md & FWRITE) == 0) { + return (EPERM); + } + if (ddi_copyin(argp, &req, sizeof (req), md) != 0) { + return (EFAULT); + } + return (vmmdev_do_vm_destroy(&req, cr)); + } + case VMM_VM_SUPPORTED: + return (vmm_is_supported(arg)); + case VMM_RESV_QUERY: + case VMM_RESV_ADD: + case VMM_RESV_REMOVE: + return (vmmr_ioctl(cmd, arg, md, cr, rvalp)); + default: + break; + } + /* No other actions are legal on ctl device */ + return (ENOTTY); +} + +static int vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) { @@ -2207,36 +2258,7 @@ vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, minor = getminor(dev); if (minor == VMM_CTL_MINOR) { - void *argp = (void *)arg; - char name[VM_MAX_NAMELEN] = { 0 }; - size_t len = 0; - - if ((mode & FKIOCTL) != 0) { - len = strlcpy(name, argp, sizeof (name)); - } else { - if (copyinstr(argp, name, sizeof (name), &len) != 0) { - return (EFAULT); - } - } - if (len >= VM_MAX_NAMELEN) { - return (ENAMETOOLONG); - } - - switch (cmd) { - case VMM_CREATE_VM: - if ((mode & FWRITE) == 0) - return (EPERM); - return (vmmdev_do_vm_create(name, credp)); - case VMM_DESTROY_VM: - if ((mode & FWRITE) == 0) - return (EPERM); - return (vmmdev_do_vm_destroy(name, credp)); - case VMM_VM_SUPPORTED: - return (vmm_is_supported(arg)); - default: - /* No other actions are legal on ctl device */ - return (ENOTTY); - } + return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp)); } sc = ddi_get_soft_state(vmm_statep, minor); @@ -2422,7 +2444,6 @@ vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) } vmm_sol_glue_init(); - vmm_arena_init(); /* * Perform temporary HMA registration to determine if the system @@ -2462,7 +2483,6 @@ fail: if (reg != NULL) { hma_unregister(reg); } - vmm_arena_fini(); vmm_sol_glue_cleanup(); mutex_exit(&vmmdev_mtx); return (DDI_FAILURE); @@ -2494,6 +2514,11 @@ vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) } mutex_exit(&vmm_mtx); + if (!vmmr_is_empty()) { + mutex_exit(&vmmdev_mtx); + return (DDI_FAILURE); + } + VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL); if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) { mutex_exit(&vmmdev_mtx); @@ -2507,7 +2532,6 @@ vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) VERIFY0(vmm_mod_unload()); VERIFY3U(vmmdev_hma_reg, ==, NULL); - vmm_arena_fini(); vmm_sol_glue_cleanup(); mutex_exit(&vmmdev_mtx); @@ -2579,11 +2603,13 @@ _init(void) } vmm_zsd_init(); + vmmr_init(); error = mod_install(&modlinkage); if (error) { ddi_soft_state_fini(&vmm_statep); vmm_zsd_fini(); + 
vmmr_fini(); } return (error); @@ -2600,6 +2626,7 @@ _fini(void) } vmm_zsd_fini(); + vmmr_fini(); ddi_soft_state_fini(&vmm_statep); diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c index 720af54200..bd1f1890d4 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c @@ -32,11 +32,12 @@ #include <vm/hat_i86.h> #include <vm/seg_vn.h> #include <vm/seg_kmem.h> -#include <vm/seg_vmm.h> #include <machine/vm.h> #include <sys/vmm_gpt.h> #include <sys/vmm_vm.h> +#include <sys/seg_vmm.h> +#include <sys/vmm_reservoir.h> #define PMAP_TO_VMMAP(pm) ((vm_map_t) \ ((caddr_t)(pm) - offsetof(struct vmspace, vms_pmap))) @@ -65,38 +66,6 @@ static vmspace_mapping_t *vm_mapping_find(struct vmspace *, uintptr_t, size_t, boolean_t); static void vm_mapping_remove(struct vmspace *, vmspace_mapping_t *); -static vmem_t *vmm_alloc_arena = NULL; - -static void * -vmm_arena_alloc(vmem_t *vmp, size_t size, int vmflag) -{ - return (segkmem_xalloc(vmp, NULL, size, vmflag, 0, - segkmem_page_create, &kvps[KV_VVP])); -} - -static void -vmm_arena_free(vmem_t *vmp, void *inaddr, size_t size) -{ - segkmem_xfree(vmp, inaddr, size, &kvps[KV_VVP], NULL); -} - -void -vmm_arena_init(void) -{ - vmm_alloc_arena = vmem_create("vmm_alloc_arena", NULL, 0, 1024 * 1024, - vmm_arena_alloc, vmm_arena_free, kvmm_arena, 0, VM_SLEEP); - - ASSERT(vmm_alloc_arena != NULL); -} - -void -vmm_arena_fini(void) -{ - VERIFY(vmem_size(vmm_alloc_arena, VMEM_ALLOC) == 0); - vmem_destroy(vmm_alloc_arena); - vmm_alloc_arena = NULL; -} - struct vmspace * vmspace_alloc(vm_offset_t start, vm_offset_t end, pmap_pinit_t pinit) { @@ -164,8 +133,9 @@ vmspace_find_kva(struct vmspace *vms, uintptr_t addr, size_t size) switch (vmo->vmo_type) { case OBJT_DEFAULT: - result = (void *)((uintptr_t)vmo->vmo_data + - VMSM_OFFSET(vmsm, addr)); + result = vmmr_region_mem_at( + (vmmr_region_t *)vmo->vmo_data, + VMSM_OFFSET(vmsm, addr) & PAGEMASK); break; default: break; @@ -344,39 +314,23 @@ vm_object_pager_none(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl) } static pfn_t -vm_object_pager_heap(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl) +vm_object_pager_reservoir(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, + uint_t *lvl) { - const uintptr_t kaddr = ALIGN2PAGE((uintptr_t)vmo->vmo_data + off); - uint_t idx, level; - htable_t *ht; - x86pte_t pte; - pfn_t top_pfn, pfn; + vmmr_region_t *region; + pfn_t pfn; ASSERT(vmo->vmo_type == OBJT_DEFAULT); - ASSERT(off < vmo->vmo_size); - ht = htable_getpage(kas.a_hat, kaddr, &idx); - if (ht == NULL) { - return (PFN_INVALID); - } - pte = x86pte_get(ht, idx); - if (!PTE_ISPAGE(pte, ht->ht_level)) { - htable_release(ht); - return (PFN_INVALID); - } - - pfn = top_pfn = PTE2PFN(pte, ht->ht_level); - level = ht->ht_level; - if (ht->ht_level > 0) { - pfn += mmu_btop(kaddr & LEVEL_OFFSET((uint_t)ht->ht_level)); - } - htable_release(ht); + region = vmo->vmo_data; + pfn = vmmr_region_pfn_at(region, off & PAGEMASK); + /* TODO: handle large pages */ if (lpfn != NULL) { - *lpfn = top_pfn; + *lpfn = pfn; } if (lvl != NULL) { - *lvl = level; + *lvl = 0; } return (pfn); } @@ -419,41 +373,8 @@ vm_object_pager_sg(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl) return (pfn); } -static void -vm_reserve_pages(size_t npages) -{ - uint_t retries = 60; - int rc; - - mutex_enter(&freemem_lock); - if (availrmem < npages) { - mutex_exit(&freemem_lock); - - /* - * Set needfree and wait for the ZFS ARC reap thread to free up - * some memory. 
- */ - page_needfree(npages); - - mutex_enter(&freemem_lock); - while ((availrmem < npages) && retries-- > 0) { - mutex_exit(&freemem_lock); - rc = delay_sig(drv_usectohz(1 * MICROSEC)); - mutex_enter(&freemem_lock); - - if (rc == EINTR) - break; - } - mutex_exit(&freemem_lock); - - page_needfree(-npages); - } else { - mutex_exit(&freemem_lock); - } -} - vm_object_t -vm_object_allocate(objtype_t type, vm_pindex_t psize) +vm_object_allocate(objtype_t type, vm_pindex_t psize, bool transient) { vm_object_t vmo; const size_t size = ptob((size_t)psize); @@ -468,17 +389,19 @@ vm_object_allocate(objtype_t type, vm_pindex_t psize) switch (type) { case OBJT_DEFAULT: { - vm_reserve_pages(psize); - /* XXXJOY: opt-in to larger pages? */ - vmo->vmo_data = vmem_alloc(vmm_alloc_arena, size, KM_NOSLEEP); - if (vmo->vmo_data == NULL) { + /* TODO: opt-in to larger pages? */ + int err; + vmmr_region_t *region = NULL; + + err = vmmr_alloc(size, transient, ®ion); + if (err != 0) { mutex_destroy(&vmo->vmo_lock); kmem_free(vmo, sizeof (*vmo)); return (NULL); } - bzero(vmo->vmo_data, size); - vmo->vmo_pager = vm_object_pager_heap; + vmo->vmo_data = region; + vmo->vmo_pager = vm_object_pager_reservoir; } break; case OBJT_SG: @@ -505,7 +428,7 @@ vm_pager_allocate(objtype_t type, void *handle, vm_ooffset_t size, VERIFY(type == OBJT_SG); VERIFY(off == 0); - vmo = vm_object_allocate(type, size); + vmo = vm_object_allocate(type, size, false); vmo->vmo_data = sg; mutex_enter(&sg->sg_lock); @@ -529,7 +452,7 @@ vm_object_deallocate(vm_object_t vmo) switch (vmo->vmo_type) { case OBJT_DEFAULT: - vmem_free(vmm_alloc_arena, vmo->vmo_data, vmo->vmo_size); + vmmr_free((vmmr_region_t *)vmo->vmo_data); break; case OBJT_SG: sglist_free((struct sglist *)vmo->vmo_data); @@ -574,6 +497,17 @@ vm_object_reference(vm_object_t vmo) VERIFY3U(ref, !=, 0); } +pfn_t +vm_object_pfn(vm_object_t vmo, uintptr_t off) +{ + /* This is expected to be used only on reservoir-backed memory */ + if (vmo->vmo_type != OBJT_DEFAULT) { + return (PFN_INVALID); + } + + return (vmo->vmo_pager(vmo, off, NULL, NULL)); +} + static vmspace_mapping_t * vm_mapping_find(struct vmspace *vms, uintptr_t addr, size_t size, boolean_t no_lock) @@ -912,11 +846,9 @@ vm_segmap_obj(vm_object_t vmo, off_t map_off, size_t size, struct as *as, if (err == 0) { segvmm_crargs_t svma; - svma.kaddr = (caddr_t)vmo->vmo_data + map_off; + svma.obj = vmo; + svma.offset = map_off; svma.prot = prot; - svma.cookie = vmo; - svma.hold = (segvmm_holdfn_t)vm_object_reference; - svma.rele = (segvmm_relefn_t)vm_object_deallocate; err = as_map(as, *addrp, size, segvmm_create, &svma); } @@ -969,11 +901,9 @@ vm_segmap_space(struct vmspace *vms, off_t off, struct as *as, caddr_t *addrp, VERIFY(mapoff < vmo->vmo_size); VERIFY((mapoff + size) <= vmo->vmo_size); - svma.kaddr = (void *)((uintptr_t)vmo->vmo_data + mapoff); + svma.obj = vmo; + svma.offset = mapoff; svma.prot = prot; - svma.cookie = vmo; - svma.hold = (segvmm_holdfn_t)vm_object_reference; - svma.rele = (segvmm_relefn_t)vm_object_deallocate; err = as_map(as, *addrp, len, segvmm_create, &svma); } diff --git a/usr/src/uts/i86pc/sys/vmm.h b/usr/src/uts/i86pc/sys/vmm.h index 5b3e7f9b10..e58d63761e 100644 --- a/usr/src/uts/i86pc/sys/vmm.h +++ b/usr/src/uts/i86pc/sys/vmm.h @@ -39,7 +39,7 @@ * * Copyright 2015 Pluribus Networks Inc. * Copyright 2019 Joyent, Inc. 
- * Copyright 2020 Oxide Computer Company + * Copyright 2021 Oxide Computer Company */ #ifndef _VMM_H_ @@ -124,20 +124,12 @@ enum x2apic_state { /* * illumos doesn't have a limitation based on SPECNAMELEN like FreeBSD does. - * Instead of picking an arbitrary value we will just rely on the same - * calculation that's made below. If this calculation ever changes we need to - * update the the VM_MAX_NAMELEN mapping in the bhyve brand's boot.c file. + * To simplify structure definitions, an arbitrary limit has been chosen. + * This same limit is used for memory segment names */ -#define VM_MAX_PREFIXLEN 10 -#define VM_MAX_SUFFIXLEN 15 -#define VM_MIN_NAMELEN 6 -#define VM_MAX_NAMELEN \ - (SPECNAMELEN - VM_MAX_PREFIXLEN - VM_MAX_SUFFIXLEN - 1) - -#ifdef _KERNEL -CTASSERT(VM_MAX_NAMELEN >= VM_MIN_NAMELEN); -#endif +#define VM_MAX_NAMELEN 128 +#define VM_MAX_SEG_NAMELEN 128 #define VM_MAXCPU 32 /* maximum virtual cpus */ @@ -389,4 +381,12 @@ struct vm_entry { int vm_restart_instruction(void *vm, int vcpuid); +enum vm_create_flags { + /* + * Allocate guest memory segments from existing reservoir capacity, + * rather than attempting to create transient allocations. + */ + VCF_RESERVOIR_MEM = (1 << 0), +}; + #endif /* _VMM_H_ */ diff --git a/usr/src/uts/i86pc/sys/vmm_dev.h b/usr/src/uts/i86pc/sys/vmm_dev.h index 15c64355c4..f371ad1266 100644 --- a/usr/src/uts/i86pc/sys/vmm_dev.h +++ b/usr/src/uts/i86pc/sys/vmm_dev.h @@ -39,6 +39,7 @@ * * Copyright 2015 Pluribus Networks Inc. * Copyright 2019 Joyent, Inc. + * Copyright 2021 Oxide Computer Company */ #ifndef _VMM_DEV_H_ @@ -46,6 +47,19 @@ #include <machine/vmm.h> +#include <sys/param.h> +#include <sys/cpuset.h> + +struct vm_create_req { + char name[VM_MAX_NAMELEN]; + uint64_t flags; +}; + + +struct vm_destroy_req { + char name[VM_MAX_NAMELEN]; +}; + struct vm_memmap { vm_paddr_t gpa; int segid; /* memory segment */ @@ -66,7 +80,7 @@ struct vm_munmap { struct vm_memseg { int segid; size_t len; - char name[SPECNAMELEN + 1]; + char name[VM_MAX_SEG_NAMELEN]; }; struct vm_register { @@ -282,6 +296,13 @@ struct vm_run_state { uint8_t _pad[3]; }; +struct vmm_resv_query { + size_t vrq_free_sz; + size_t vrq_alloc_sz; + size_t vrq_alloc_transient_sz; + size_t vrq_limit; +}; + #define VMMCTL_IOC_BASE (('V' << 16) | ('M' << 8)) #define VMM_IOC_BASE (('v' << 16) | ('m' << 8)) #define VMM_LOCK_IOC_BASE (('v' << 16) | ('l' << 8)) @@ -292,6 +313,10 @@ struct vm_run_state { #define VMM_DESTROY_VM (VMMCTL_IOC_BASE | 0x02) #define VMM_VM_SUPPORTED (VMMCTL_IOC_BASE | 0x03) +#define VMM_RESV_QUERY (VMMCTL_IOC_BASE | 0x10) +#define VMM_RESV_ADD (VMMCTL_IOC_BASE | 0x11) +#define VMM_RESV_REMOVE (VMMCTL_IOC_BASE | 0x12) + /* Operations performed in the context of a given vCPU */ #define VM_RUN (VMM_CPU_IOC_BASE | 0x01) #define VM_SET_REGISTER (VMM_CPU_IOC_BASE | 0x02) diff --git a/usr/src/uts/i86pc/vm/seg_vmm.h b/usr/src/uts/i86pc/vm/seg_vmm.h deleted file mode 100644 index f5b95c6a27..0000000000 --- a/usr/src/uts/i86pc/vm/seg_vmm.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2018 Joyent, Inc. 
- */ - -#ifndef _VM_SEG_VMM_H -#define _VM_SEG_VMM_H - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct segvmm_crargs { - caddr_t kaddr; - uchar_t prot; /* protection */ - void *cookie; /* opaque resource backing memory */ - void (*hold)(void *); /* add reference to cookie */ - void (*rele)(void *); /* release reference to cookie */ -} segvmm_crargs_t; - -typedef void (*segvmm_holdfn_t)(void *); -typedef void (*segvmm_relefn_t)(void *); - -typedef struct segvmm_data { - krwlock_t svmd_lock; - uintptr_t svmd_kaddr; - uchar_t svmd_prot; - void *svmd_cookie; - segvmm_holdfn_t svmd_hold; - segvmm_relefn_t svmd_rele; - size_t svmd_softlockcnt; -} segvmm_data_t; - -extern int segvmm_create(struct seg **, void *); - -#ifdef __cplusplus -} -#endif - -#endif /* _VM_SEG_VMM_H */ |